chore: Runtime api for pre-allocated outputs

keehyuna · keehyuna · commit 23131c34e529 · 2024-11-25T22:55:08.000+09:00
diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp
@@ -99,6 +99,9 @@ TRTEngine::TRTEngine(
   exec_ctx = make_trt(cuda_engine->createExecutionContext());
   TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context");
 
+  runtime_states.prev_cudagraphs_enabled = CUDAGRAPHS_MODE;
+  runtime_states.prev_pre_allocated_outputs_enabled = false;
+
   if (_in_binding_names.size() == 0 && _out_binding_names.size() == 0) {
     uint64_t inputs = 0;
     uint64_t outputs = 0;
diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h
@@ -30,6 +30,37 @@ using FlattenedState = std::tuple<
     std::tuple<std::string, std::string>, // serialized metadata
     std::tuple<std::string, std::string>>; // Platform
 
+struct RuntimeStates { // Pair of decisions returned by TorchTRTRuntimeStates::validate_states
+  bool need_cudagraphs_record; // true when a CUDA Graph has to be recorded for the current call
+  bool can_use_pre_allocated_outputs; // true when previously stored output tensors may be reused
+};
+
+struct TorchTRTRuntimeStates {
+  // Flags seen on the previous validate_states call; no in-class initializer, the TRTEngine constructor assigns both
+  bool prev_cudagraphs_enabled, prev_pre_allocated_outputs_enabled;
+
+  // Decides, from the current flags, the previous flags and whether the input shape changed, if a CUDA Graph must be
+  // recorded and if pre-allocated outputs may be reused on this call; afterwards the current flags become the previous
+  RuntimeStates validate_states(bool cudagraphs_enabled, bool pre_allocated_outputs_enabled, bool shape_changed) {
+    bool need_cudagraphs_record = false;
+    bool can_use_pre_allocated_outputs = false;
+
+    // Record when cudagraphs is enabled now and either it was disabled on the previous call or the shape changed
+    if (cudagraphs_enabled && (!prev_cudagraphs_enabled || shape_changed)) {
+      need_cudagraphs_record = true;
+    }
+    // Reuse requires the feature enabled on both the previous and the current call, and an unchanged input shape
+    if (prev_pre_allocated_outputs_enabled && pre_allocated_outputs_enabled && !shape_changed) {
+      can_use_pre_allocated_outputs = true;
+    }
+    prev_cudagraphs_enabled = cudagraphs_enabled;
+    prev_pre_allocated_outputs_enabled = pre_allocated_outputs_enabled;
+
+    RuntimeStates values = {need_cudagraphs_record, can_use_pre_allocated_outputs};
+    return values;
+  }
+};
+
 struct TRTEngine : torch::CustomClassHolder {
   // Each engine needs it's own runtime object
   std::shared_ptr<nvinfer1::IRuntime> rt;
@@ -89,6 +120,7 @@ struct TRTEngine : torch::CustomClassHolder {
   int64_t get_automatic_device_memory_budget();
   std::vector<at::Tensor> infer_outputs(std::vector<std::vector<int64_t>> input_shapes);
   void set_pre_allocated_outputs(bool enable);
+  TorchTRTRuntimeStates runtime_states;
   friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
   static const char BINDING_DELIM = '%';
 
@@ -103,8 +135,7 @@ struct TRTEngine : torch::CustomClassHolder {
   std::vector<at::Tensor> input_buffers = {};
   std::vector<at::Tensor> output_buffers = {};
   std::string shape_key = "None";
-  bool prev_cudagraphs_enabled = false;
-  bool use_pre_allocated_outputs = true;
+  bool use_pre_allocated_outputs = false;
   std::vector<at::Tensor> pre_allocated_outputs;
 
   // TODO: Implement a call method
diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp
@@ -203,10 +203,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   bool shape_changed = _validate_shapes(inputs, compiled_engine);
 
   // Whether cudagraphs needs to record the graph on this pass
-  // Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change
-  bool need_cudagraphs_record =
-      (((!compiled_engine->prev_cudagraphs_enabled) && CUDAGRAPHS_MODE) || (CUDAGRAPHS_MODE && shape_changed));
-  compiled_engine->prev_cudagraphs_enabled = CUDAGRAPHS_MODE;
+  RuntimeStates states = compiled_engine->runtime_states.validate_states(
+      CUDAGRAPHS_MODE, compiled_engine->use_pre_allocated_outputs, shape_changed);
+  bool need_cudagraphs_record = states.need_cudagraphs_record;
 
   if (!CUDAGRAPHS_MODE || shape_changed) {
     compiled_engine->cudagraph.reset();
@@ -289,10 +288,10 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
       output_profiler_guard =
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
     }
-    if (!compiled_engine->use_pre_allocated_outputs || shape_changed) {
-      outputs = create_output_tensors(compiled_engine);
-    } else {
+    if (states.can_use_pre_allocated_outputs) {
       outputs = compiled_engine->pre_allocated_outputs;
+    } else {
+      outputs = create_output_tensors(compiled_engine);
     }
 
     for (auto output_indices : compiled_engine->out_binding_map) {
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -23,6 +23,40 @@
 logger = logging.getLogger(__name__)
 
 
+class TorchTRTRuntimeStates:  # remembers the flags passed to the previous validate_states call
+    def __init__(self, cudagraphs_enabled: bool, pre_allocated_outputs_enabled: bool):
+        self.prev_cudagraphs_enabled = cudagraphs_enabled
+        self.prev_pre_allocated_outputs_enabled = pre_allocated_outputs_enabled
+
+    def validate_states(
+        self,
+        cudagraphs_enabled: bool,
+        pre_allocated_outputs_enabled: bool,
+        shape_changed: bool,
+    ) -> Tuple[bool, bool]:
+        # Returns (need_cudagraphs_record, can_use_pre_allocated_outputs), decided from the current flags, the flags
+        # stored by the previous call and shape_changed; the current flags are then stored for the next call
+        need_cudagraphs_record = False
+        can_use_pre_allocated_outputs = False
+
+        # Record when cudagraphs is enabled now and either it was disabled on the previous call or the shape changed
+        if cudagraphs_enabled and (not self.prev_cudagraphs_enabled or shape_changed):
+            need_cudagraphs_record = True
+
+        # Reuse requires the feature enabled on both the previous and the current call, and an unchanged input shape
+        if (
+            self.prev_pre_allocated_outputs_enabled
+            and pre_allocated_outputs_enabled
+            and (not shape_changed)
+        ):
+            can_use_pre_allocated_outputs = True
+
+        self.prev_cudagraphs_enabled = cudagraphs_enabled
+        self.prev_pre_allocated_outputs_enabled = pre_allocated_outputs_enabled
+
+        return need_cudagraphs_record, can_use_pre_allocated_outputs
+
+
 class PythonTorchTensorRTModule(Module):  # type: ignore[misc]
     """PythonTorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine.
 
@@ -107,7 +141,9 @@ def __init__(
         self.engine = None
         self.weight_name_map = weight_name_map
         self.target_platform = Platform.current_platform()
-        self.prev_cudagraphs_enabled = False
+        self.runtime_states = TorchTRTRuntimeStates(
+            torch_tensorrt.runtime.get_cudagraphs_mode(), False
+        )
         self.pre_allocated_outputs: List[torch.Tensor] = []
         self.use_pre_allocated_outputs = False
 
@@ -318,13 +354,11 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
 
             cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
             shape_changed = self.validate_input_shapes(inputs)
-            # Cudagraphs record is required if cudagraphs_enabled is toggled to True regardless of shape change
-            if not self.prev_cudagraphs_enabled and cudagraphs_enabled:
-                need_cudagraphs_record = True
-            else:
-                need_cudagraphs_record = cudagraphs_enabled and shape_changed
-
-            self.prev_cudagraphs_enabled = cudagraphs_enabled
+            need_cudagraphs_record, can_use_pre_allocated_outputs = (
+                self.runtime_states.validate_states(
+                    cudagraphs_enabled, self.use_pre_allocated_outputs, shape_changed
+                )
+            )
 
             if need_cudagraphs_record:
                 if self.cudagraph:
@@ -399,7 +433,9 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
                 if self.profiling_enabled
                 else nullcontext()
             ):
-                if not self.use_pre_allocated_outputs or shape_changed:
+                if can_use_pre_allocated_outputs:
+                    outputs = self.pre_allocated_outputs
+                else:
                     self.output_shapes = [
                         tuple(self.context.get_tensor_shape(output_name))
                         for output_name in self.output_names
@@ -409,8 +445,6 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                         )
                     outputs = self.create_output_tensors()
-                else:
-                    outputs = self.pre_allocated_outputs
 
                 for o, output_name in enumerate(self.output_names):
 
diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
@@ -207,7 +207,6 @@ def setup_engine(self) -> None:
         if self.engine is not None:
             return
         self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info())
-        self.set_pre_allocated_outputs(False)
 
     def encode_metadata(self, metadata: Any) -> str:
         metadata = copy.deepcopy(metadata)
diff --git a/py/torch_tensorrt/runtime/__init__.py b/py/torch_tensorrt/runtime/__init__.py
@@ -8,4 +8,5 @@
     set_cudagraphs_mode,
 )
 from torch_tensorrt.runtime._multi_device_safe_mode import set_multi_device_safe_mode
+from torch_tensorrt.runtime._pre_allocated_outputs import enable_pre_allocated_outputs
 from torch_tensorrt.runtime._weight_streaming import weight_streaming
diff --git a/py/torch_tensorrt/runtime/_pre_allocated_outputs.py b/py/torch_tensorrt/runtime/_pre_allocated_outputs.py
@@ -0,0 +1,41 @@
+import logging
+from typing import Any
+
+import torch
+from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule
+
+logger = logging.getLogger(__name__)
+
+
+class _PreAllocatedOutputContextManager(object):
+    """
+    Context manager that toggles pre-allocated outputs on the "_run_on_acc" TRT runtime children of a module
+    """
+
+    def __init__(self, module: torch.fx.GraphModule) -> None:
+        rt_mods = []
+        for name, rt_mod in module.named_children():
+            if "_run_on_acc" in name and isinstance(
+                rt_mod, (PythonTorchTensorRTModule, TorchTensorRTModule)
+            ):
+                rt_mods.append(rt_mod)
+        self.rt_mods = rt_mods
+
+    def set_pre_allocated_output(self, enable: bool) -> None:
+        for mod in self.rt_mods:
+            mod.set_pre_allocated_outputs(enable)
+
+    def __enter__(self) -> "_PreAllocatedOutputContextManager":
+        # Switch the feature on for every runtime module collected in __init__
+        self.set_pre_allocated_output(True)
+        return self
+
+    def __exit__(self, *args: Any) -> None:
+        # Always switch the feature off on exit; the setting in effect before __enter__ is not restored
+        self.set_pre_allocated_output(False)
+
+
+def enable_pre_allocated_outputs(
+    module: torch.fx.GraphModule,
+) -> _PreAllocatedOutputContextManager:
+    return _PreAllocatedOutputContextManager(module)  # nothing is enabled until __enter__ / set_pre_allocated_output
diff --git a/tests/py/dynamo/runtime/test_pre_allocated_outputs.py b/tests/py/dynamo/runtime/test_pre_allocated_outputs.py
@@ -0,0 +1,130 @@
+import torch
+import torch_tensorrt as torchtrt
+from parameterized import parameterized
+from torch.testing._internal.common_utils import TestCase, run_tests
+
+INPUT_SIZE = (3, 16, 16)
+TRIALS = 5
+
+
+class TestPreAllocatedOutputs(TestCase):
+    @parameterized.expand(
+        [
+            ("python_runtime", True),
+            ("cpp_runtime", False),
+        ]
+    )
+    def test_pre_allocated_outputs_default(self, _, use_python_runtime):
+        class SampleModel(torch.nn.Module):
+            def forward(self, x):
+                return torch.softmax((x + 2) * 7, dim=0)
+
+        model = SampleModel().eval().cuda()
+        inputs = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)]
+        fx_graph = torch.fx.symbolic_trace(model)
+
+        # Compile with Torch-TRT; its outputs are compared against the fx_graph results further down
+        optimized_model = torchtrt.compile(
+            fx_graph,
+            "torch_compile",
+            inputs[0],
+            min_block_size=1,
+            pass_through_build_failures=True,
+            use_python_runtime=use_python_runtime,
+        )
+
+        ref_out_list = []
+        trt_out_list = []
+        with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model):
+            for i in inputs:
+                ref_out_list.append(fx_graph(i).detach().cpu())
+                trt_out_list.append(optimized_model(i).detach().cpu())
+
+        for torch_model_results, optimized_model_results in zip(
+            ref_out_list, trt_out_list
+        ):
+            torch.testing.assert_close(
+                torch_model_results,
+                optimized_model_results,
+                rtol=5e-03,
+                atol=5e-03,
+                equal_nan=True,
+                check_dtype=True,
+            )
+
+        torch._dynamo.reset()
+
+    @parameterized.expand(
+        [
+            ("python_runtime", True),
+            ("cpp_runtime", False),
+        ]
+    )
+    def test_pre_allocated_outputs_dynamic(self, _, use_python_runtime):
+        class SampleModel(torch.nn.Module):
+            def forward(self, x):
+                return torch.relu((x + 2) * 0.5)
+
+        inputs = torchtrt.Input(
+            min_shape=(1, 3, 128, 224),
+            opt_shape=(8, 3, 192, 224),
+            max_shape=(16, 3, 224, 224),
+            dtype=torch.float,
+            name="x",
+        )
+        fx_graph = torch.fx.symbolic_trace(SampleModel())
+
+        optimized_model = torchtrt.compile(
+            fx_graph,
+            "dynamo",
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+            torch_executed_ops={"torch.ops.aten.mul.Tensor"},
+            use_python_runtime=use_python_runtime,
+        )
+
+        input_list = []
+        ref_out_list = []
+        trt_out_list = []
+        # 25 inputs: each batch size is paired with five heights, so consecutive shapes both repeat and change.
+        for i in [1, 3, 8, 11, 16]:
+            for j in [128, 128, 222, 222, 224]:
+                input_list.append(torch.randn((i, 3, j, 224)).cuda())
+
+        pre_allocated_output_ctx = torchtrt.runtime.enable_pre_allocated_outputs(
+            optimized_model
+        )
+        pre_allocated_output = False
+        for enable_cuda_graphs in [False, True]:
+            for i in range(len(input_list)):
+                # cuda_graphs equals enable_cuda_graphs only where i % TRIALS == i // TRIALS, otherwise its negation
+                if i % TRIALS == i // TRIALS:
+                    cuda_graphs = enable_cuda_graphs
+                else:
+                    cuda_graphs = not enable_cuda_graphs
+                if i % 3 == 0:  # flip the pre-allocated-output setting on every third index
+                    pre_allocated_output = not pre_allocated_output
+
+                torchtrt.runtime.set_cudagraphs_mode(cuda_graphs)
+                pre_allocated_output_ctx.set_pre_allocated_output(pre_allocated_output)
+
+                ref_out_list.append(fx_graph(input_list[i]))
+                trt_out_list.append(optimized_model(input_list[i]))
+
+        for torch_model_results, optimized_model_results in zip(
+            ref_out_list, trt_out_list
+        ):
+            torch.testing.assert_close(
+                torch_model_results,
+                optimized_model_results,
+                rtol=5e-03,
+                atol=5e-03,
+                equal_nan=True,
+                check_dtype=True,
+            )
+        torch._dynamo.reset()
+
+
+if __name__ == "__main__":
+    run_tests()

Original file line number	Diff line number	Diff line change
`@@ -8,4 +8,5 @@`
`8`	`8`	`set_cudagraphs_mode,`
`9`	`9`	`)`
`10`	`10`	`from torch_tensorrt.runtime._multi_device_safe_mode import set_multi_device_safe_mode`
	`11`	`+from torch_tensorrt.runtime._pre_allocated_outputs import enable_pre_allocated_outputs`
`11`	`12`	`from torch_tensorrt.runtime._weight_streaming import weight_streaming`