chore: rebase

keehyuna · keehyuna · commit 6ef68e75bc75 · 2024-12-18T00:26:40.000Z
diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h
@@ -130,8 +130,10 @@ struct TRTEngine : torch::CustomClassHolder {
   at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
   std::vector<at::Tensor> input_buffers = {};
   std::vector<at::Tensor> output_buffers = {};
-  std::string shape_key;
-  bool prev_cudagraphs_enabled = false;
+  std::string shape_key = "None";
+  bool use_pre_allocated_outputs = false;
+  std::vector<at::Tensor> pre_allocated_outputs;
+
   // TODO: Implement a call method
   // c10::List<at::Tensor> Run(c10::List<at::Tensor> inputs);
 
diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp
@@ -94,6 +94,7 @@ bool _validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngi
 void setup_input_tensors(
     std::vector<at::Tensor> inputs,
     c10::intrusive_ptr<TRTEngine> compiled_engine,
+    bool cudagraphs_enabled,
     bool need_cudagraphs_record) {
   // this is a buffer to store shape tensor input addresses throughout the runtime scope
   std::list<std::vector<int64_t>> inputShapeTensorValues;
@@ -127,7 +128,7 @@ void setup_input_tensors(
           compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
           "Error while setting the tensor address for shape inputs");
 
-      if (CUDAGRAPHS_MODE) {
+      if (cudagraphs_enabled) {
         // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
         compiled_engine->input_buffers[i] = input_cpu;
       }
@@ -147,7 +148,7 @@ void setup_input_tensors(
       TORCHTRT_CHECK(
           compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
 
-      if (CUDAGRAPHS_MODE) {
+      if (cudagraphs_enabled) {
         // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
         compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
         TORCHTRT_CHECK(
@@ -202,15 +203,16 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     compiled_engine->cudagraph.enable_debug_mode();
   }
   bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
+  bool shape_changed = _validate_shapes(inputs, compiled_engine);
 
   // Whether cudagraphs needs to record the graph on this pass
-  // Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change
-  bool need_cudagraphs_record = cudagraphs_enabled &&
-      ((!compiled_engine->prev_cudagraphs_enabled) || (!_cudagraphs_validate_shapes(inputs, compiled_engine)));
+  auto result = compiled_engine->runtime_states.set_runtime_states(
+      cudagraphs_enabled, compiled_engine->use_pre_allocated_outputs, shape_changed);
 
-  compiled_engine->prev_cudagraphs_enabled = cudagraphs_enabled;
+  bool need_cudagraphs_record = std::get<0>(result);
+  bool can_use_pre_allocated_outputs = std::get<1>(result);
 
-  if (!cudagraphs_enabled) {
+  if (!cudagraphs_enabled || shape_changed) {
     compiled_engine->cudagraph.reset();
   }
 
@@ -272,69 +274,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
     }
 
-    for (size_t i = 0; i < inputs.size(); i++) {
-      std::string name = compiled_engine->in_binding_names[i];
-
-      TORCHTRT_CHECK(
-          inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
-
-      auto expected_type =
-          util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
-      TORCHTRT_CHECK(
-          inputs[i].dtype() == expected_type,
-          "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
-
-      auto dims = core::util::toDims(inputs[i].sizes());
-      auto shape = core::util::toVec(dims);
-      LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
-
-      if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
-        // Shape tensor inputs are casted to int64 explicitly.
-        // Refer to
-        // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
-        auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
-        std::vector<int64_t> inputs_cpu_vec(
-            input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
-        inputShapeTensorValues.emplace_back(inputs_cpu_vec);
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
-            "Error while setting the tensor address for shape inputs");
-
-        if (cudagraphs_enabled) {
-          // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
-          compiled_engine->input_buffers[i] = input_cpu;
-        }
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
-            "Error while setting the tensor address for shape inputs");
-
-      } else {
-        at::Tensor contig_input = inputs[i].view(shape).contiguous();
-        formatted_inputs.emplace_back(std::move(contig_input));
-
-        if (need_cudagraphs_record) {
-          // Create a new persistent input buffer
-          compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
-        }
-
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
-
-        if (cudagraphs_enabled) {
-          // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
-          compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
-          TORCHTRT_CHECK(
-              compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
-              "Error while setting the input tensor address for inputs");
-        } else {
-          // Otherwise use the formatted buffer directly
-          TORCHTRT_CHECK(
-              compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
-              "Error while setting the input tensor address for inputs");
-        }
-      }
-    }
-
+    setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
     // Check if input shapes can be inferred.
     int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
     std::vector<char const*> names(io_size);
diff --git a/docsrc/index.rst b/docsrc/index.rst
@@ -67,6 +67,7 @@ Tutorials
 * :ref:`custom_kernel_plugins`
 * :ref:`mutable_torchtrt_module_example`
 * :ref:`weight_streaming_example`
+* :ref:`pre_allocated_output_example`
 
 .. toctree::
    :caption: Tutorials
@@ -85,6 +86,7 @@ Tutorials
    tutorials/_rendered_examples/dynamo/auto_generate_converters
    tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example
    tutorials/_rendered_examples/dynamo/weight_streaming_example
+   tutorials/_rendered_examples/dynamo/pre_allocated_output_example
 
 Dynamo Frontend
 ----------------
diff --git a/examples/dynamo/torch_export_cudagraphs.py b/examples/dynamo/torch_export_cudagraphs.py
@@ -47,7 +47,7 @@
 
 # We can enable the cudagraphs API with a context manager
 with torch_tensorrt.runtime.enable_cudagraphs(opt) as cudagraphs_module:
-    out_trt = opt(inputs)
+    out_trt = cudagraphs_module(inputs)
 
 # Alternatively, we can set the cudagraphs mode for the session
 torch_tensorrt.runtime.set_cudagraphs_mode(True)
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -30,7 +30,7 @@ def __init__(self, new_cudagraphs: bool, new_pre_allocated_output: bool):
         # Indicates whether pre-allocated output was enabled in the previous execute_engine
         self.old_pre_allocated_outputs = new_pre_allocated_output
 
-    def validate_states(
+    def set_runtime_states(
         self,
         new_cudagraphs: bool,
         new_pre_allocated_output: bool,
@@ -144,8 +144,11 @@ def __init__(
         self.engine = None
         self.weight_name_map = weight_name_map
         self.target_platform = Platform.current_platform()
-        # Previous cuda graphs state
-        self.prev_cudagraphs_enabled = False
+        self.runtime_states = TorchTRTRuntimeStates(
+            torch_tensorrt.runtime.get_cudagraphs_mode(), False
+        )
+        self.pre_allocated_outputs: List[torch.Tensor] = []
+        self.use_pre_allocated_outputs = False
 
         if self.serialized_engine is not None and not self.settings.lazy_engine_init:
             self.setup_engine()
@@ -352,14 +355,16 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
             self._check_initialized()
 
             cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
-            shape_changed = self.cudagraphs_validate_shapes(inputs)
-            # Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change
-            need_cudagraphs_record = cudagraphs_enabled and (
-                (not self.prev_cudagraphs_enabled) or (not shape_changed)
+            shape_changed = self.validate_input_shapes(inputs)
+            need_cudagraphs_record, can_use_pre_allocated_outputs = (
+                self.runtime_states.set_runtime_states(
+                    cudagraphs_enabled, self.use_pre_allocated_outputs, shape_changed
+                )
             )
-            self.prev_cudagraphs_enabled = cudagraphs_enabled
 
             if need_cudagraphs_record:
+                if self.cudagraph:
+                    self.cudagraph.reset()
                 self._input_buffers = [None] * len(self.input_names)
                 self._output_buffers = [None] * len(self.output_names)
 
@@ -423,8 +428,16 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
                                     This could happen if the input tensor addresses/shapes haven't been configured correctly"
                         )
 
-            with nvtx.annotate("ProcessOutputs:1", color="red"):
-                if not self.use_pre_allocated_outputs or shape_changed:
+            with (
+                torch.autograd.profiler.record_function(
+                    "PythonTorchTensorRTModule:ProcessOutputs"
+                )
+                if self.profiling_enabled
+                else nullcontext()
+            ):
+                if can_use_pre_allocated_outputs:
+                    outputs = self.pre_allocated_outputs
+                else:
                     self.output_shapes = [
                         tuple(self.context.get_tensor_shape(output_name))
                         for output_name in self.output_names
@@ -434,12 +447,12 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                         )
                     outputs = self.create_output_tensors()
-                else:
-                    outputs = self.pre_allocated_outputs
 
                 for o, output_name in enumerate(self.output_names):
+
                     if need_cudagraphs_record:
                         self._output_buffers[o] = outputs[o].clone()
+
                     if cudagraphs_enabled:
                         self.context.set_tensor_address(
                             output_name, self._output_buffers[o].data_ptr()
@@ -449,7 +462,13 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
                             output_name, outputs[o].data_ptr()
                         )
 
-            with nvtx.annotate("TensorRTRuntime", color="red"):
+            with (
+                torch.autograd.profiler.record_function(
+                    "PythonTorchTensorRTModule:TensorRTRuntime"
+                )
+                if self.profiling_enabled
+                else nullcontext()
+            ):
                 self._caller_stream = torch.cuda.current_stream()
                 if (
                     self._engine_stream == torch.cuda.default_stream()
@@ -490,6 +509,9 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
 
                 self._caller_stream.wait_stream(self._engine_stream)
 
+            if self.use_pre_allocated_outputs:
+                self.pre_allocated_outputs = self.create_output_tensors()
+
             if cudagraphs_enabled:
                 for idx, o in enumerate(outputs):
                     o.copy_(self._output_buffers[idx])
@@ -531,10 +553,9 @@ def get_layer_info(self) -> str:
         )
         return engine_json
 
-    def cudagraphs_validate_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
+    def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
         """
-        Validates the input shapes of the forward function
-        versus the version currently active for the
+        Validates the input shapes of the forward function has changed
         """
         # Representation of input shapes to a given model
         # Shapes are concatenated as so:
@@ -544,10 +565,8 @@ def cudagraphs_validate_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
         # If the new shape key differs from the existing one,
         # invalidate the old shape key and remove the CUDAGraph
         if new_shape_key != self.shape_key:
-            logger.debug(f"Resetting Cudagraph on new shape key {new_shape_key}")
+            logger.debug(f"Input shape changed {self.shape_key} -> {new_shape_key}")
             self.shape_key = new_shape_key
-            if self.cudagraph:
-                self.cudagraph.reset()
-            return False
+            return True
 
-        return True
+        return False