Commit e1e3deb

core: option to use persistent output buffer

1 parent 92ab985 commit e1e3deb

File tree

2 files changed: +71, -55 lines
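
In short: PythonTorchTensorRTModule gains a persistent_output_buffer attribute (default False). When set, the module allocates its output tensors once, binds their addresses to the TensorRT execution context, and keeps returning the same tensors until the input shapes change. A minimal usage sketch follows; the attribute name and the "_run_on_acc" submodule naming come from this diff, while the compile call, model, and example_input are illustrative assumptions:

    import torch
    import torch_tensorrt

    # Assumed dynamo-path compilation producing _run_on_acc_* TensorRT submodules
    trt_gm = torch_tensorrt.compile(model, ir="dynamo", inputs=[example_input])

    # Opt each TensorRT submodule into the persistent output buffer
    for name, rt_mod in trt_gm.named_children():
        if "_run_on_acc" in name:
            rt_mod.persistent_output_buffer = True

    out = trt_gm(example_input)  # output tensors are reused on later calls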

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 70 additions & 54 deletions

@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import logging
-from contextlib import nullcontext
 from tempfile import tempdir
 from typing import Any, Dict, List, Optional, Sequence, Tuple
 
@@ -107,7 +106,10 @@ def __init__(
         self.engine = None
         self.weight_name_map = weight_name_map
         self.target_platform = Platform.current_platform()
-        self.cudagraphs_disabled = False
+        # Check if CUDA graph capture is enabled in the parent node
+        self.cudagraphs_parent_module = False
+        self.cudagraphs_enabled = False
+        self.persistent_output_buffer = False
 
         if self.serialized_engine is not None and not self.settings.lazy_engine_init:
             self.setup_engine()
@@ -239,23 +241,31 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
             (i.contiguous() if isinstance(i, torch.Tensor) else torch.tensor(i).cuda())
             for i in inputs
         ]
-        with nvtx.annotate(f"Forward", color="red"):
+        with nvtx.annotate("Forward", color="red"):
             self._check_initialized()
-            cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode() and not self.cudagraphs_disabled
-
-            need_cudagraphs_record = (
-                cudagraphs_enabled and not self.cudagraphs_validate_shapes(inputs)
+            cudagraphs_enabled = (
+                torch_tensorrt.runtime.get_cudagraphs_mode()
+                and not self.cudagraphs_parent_module
             )
+            shape_changed = self.validate_input_shapes(inputs)
+            # Cudagraphs record is required if cudagraphs_enabled is toggled to True regardless of shape change
+            if not self.cudagraphs_enabled and cudagraphs_enabled:
+                need_cudagraphs_record = True
+            else:
+                need_cudagraphs_record = cudagraphs_enabled and shape_changed
+            self.cudagraphs_enabled = cudagraphs_enabled
 
             if need_cudagraphs_record:
+                if self.cudagraph:
+                    self.cudagraph.reset()
                 self._input_buffers = [None] * len(self.input_names)
                 self._output_buffers = [None] * len(self.output_names)
 
             if not cudagraphs_enabled and self.cudagraph:
                 self.cudagraph.reset()
                 self.cudagraph = None
 
-            # If in safe mode, check at each iteration for for whether a switch is required
+            # If in safe mode, check at each iteration for whether a switch is required
             if (
                 torch_tensorrt.runtime._multi_device_safe_mode._PY_RT_MULTI_DEVICE_SAFE_MODE
             ):
@@ -287,7 +297,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
                    ]
                    logger.warning(f"Moved all input Tensors to cuda:{device_id}")
 
-            with nvtx.annotate(f"ProcessInputs", color="red"):
+            with nvtx.annotate("ProcessInputs", color="red"):
                 assert len(contiguous_inputs) == len(
                     self.input_names
                 ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}."
@@ -349,63 +359,66 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
                        This could happen if the input tensor addresses/shapes haven't been configured correctly"
                )
 
-            with nvtx.annotate(f"ProcessOutputs", color="red"):
+            with nvtx.annotate("ProcessOutputs", color="red"):
                 # create output tensors
                 outputs: List[torch.Tensor] = []
+                if not self.persistent_output_buffer or shape_changed:
+                    # Create and keep persistent output buffer as long as its shape does not change
+                    self._output_buffers = []
+                    for o, output_name in enumerate(self.output_names):
+                        shape = tuple(self.context.get_tensor_shape(output_name))
+                        if DYNAMIC_DIM in shape:
+                            raise ValueError(
+                                "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
+                            )
 
-                for o, output_name in enumerate(self.output_names):
-                    shape = tuple(self.context.get_tensor_shape(output_name))
-
-                    if DYNAMIC_DIM in shape:
-                        raise ValueError(
-                            "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
+                        output = torch.empty(
+                            size=shape,
+                            dtype=self.output_dtypes[o].to(torch.dtype),
+                            device=torch.cuda.current_device(),
                         )
-
-                    output = torch.empty(
-                        size=shape,
-                        dtype=self.output_dtypes[o].to(torch.dtype),
-                        device=torch.cuda.current_device(),
-                    )
-
-                    outputs.append(output)
-
-                    if need_cudagraphs_record:
-                        self._output_buffers[o] = outputs[o].clone()
-
-                    if cudagraphs_enabled:
-                        self.context.set_tensor_address(
-                            output_name, self._output_buffers[o].data_ptr()
-                        )
-                    else:
-                        self.context.set_tensor_address(
-                            output_name, outputs[o].data_ptr()
-                        )
-
-            with nvtx.annotate(f"TensorRTRuntime", color="red"):
+                        if self.persistent_output_buffer:
+                            self.context.set_tensor_address(
+                                output_name, output.data_ptr()
+                            )
+                            self._output_buffers.append(output)
+                        else:
+                            outputs.append(output)
+                            if need_cudagraphs_record:
+                                self._output_buffers[o] = outputs[o].clone()
+                            if cudagraphs_enabled:
+                                self.context.set_tensor_address(
+                                    output_name, self._output_buffers[o].data_ptr()
+                                )
+                            else:
+                                self.context.set_tensor_address(
+                                    output_name, outputs[o].data_ptr()
+                                )
+            with nvtx.annotate("TensorRTRuntime", color="red"):
                 self._caller_stream = torch.cuda.current_stream()
                 if (
                     self._engine_stream == torch.cuda.default_stream()
                     or self._engine_stream is None
                 ):
                     self._engine_stream = torch.cuda.Stream()
 
-                with nvtx.annotate(f"wait_stream", color="green"):
+                with nvtx.annotate("wait_stream", color="green"):
                     self._engine_stream.wait_stream(self._caller_stream)
 
                 with torch.cuda.stream(self._engine_stream):
                     if cudagraphs_enabled:
                         if need_cudagraphs_record:
-                            with nvtx.annotate(f"CUDAGraph", color="green"):
+                            with nvtx.annotate("CUDAGraph", color="green"):
                                 self.cudagraph = torch.cuda.CUDAGraph()
 
                                 if self.profiling_enabled:
                                     self.cudagraph.enable_debug_mode()
-                                with nvtx.annotate(f"torch.cuda.graph", color="green"):
+                                with nvtx.annotate("torch.cuda.graph", color="green"):
                                     with torch.cuda.graph(
                                         self.cudagraph, stream=self._engine_stream
                                     ):
                                         with nvtx.annotate(
-                                            f"execute_async_v3", color="green"
+                                            "execute_async_v3", color="green"
                                         ):
                                             self.context.execute_async_v3(
                                                 self._engine_stream.cuda_stream
@@ -418,17 +431,23 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
                                        self.cudagraph.debug_dump(
                                            f"{tempdir}/{self.name}_cudagraph.dot"
                                        )
-                        with nvtx.annotate(f"replay", color="green"):
+                        with nvtx.annotate("replay", color="green"):
                             self.cudagraph.replay()  # type: ignore
 
                     else:
                         self.context.execute_async_v3(self._engine_stream.cuda_stream)
 
                self._caller_stream.wait_stream(self._engine_stream)
 
-            if cudagraphs_enabled:
-                for idx, o in enumerate(outputs):
-                    o.copy_(self._output_buffers[idx])
+            if self.persistent_output_buffer:
+                if len(self._output_buffers) == 1:
+                    return self._output_buffers[0]
+
+                return self._output_buffers
+            else:
+                if cudagraphs_enabled:
+                    for idx, o in enumerate(outputs):
+                        o.copy_(self._output_buffers[idx])
 
            if len(outputs) == 1:
                return outputs[0]
@@ -467,10 +486,9 @@ def get_layer_info(self) -> str:
         )
         return engine_json
 
-    def cudagraphs_validate_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
+    def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
         """
-        Validates the input shapes of the forward function
-        versus the version currently active for the
+        Validates the input shapes of the forward function has changed
         """
         # Representation of input shapes to a given model
         # Shapes are concatenated as so:
@@ -480,10 +498,8 @@ def cudagraphs_validate_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
         # If the new shape key differs from the existing one,
         # invalidate the old shape key and remove the CUDAGraph
         if new_shape_key != self.shape_key:
-            logger.debug(f"Resetting Cudagraph on new shape key {new_shape_key}")
+            logger.debug(f"Input shape changed {self.shape_key} -> {new_shape_key}")
             self.shape_key = new_shape_key
-            if self.cudagraph:
-                self.cudagraph.reset()
-            return False
+            return True
 
-        return True
+        return False
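
Two behavioral notes follow from the diff above. First, CUDA graph re-capture is now decided in forward: a graph is recorded when the mode toggles from off to on or when validate_input_shapes reports a shape change, and the shape check itself no longer resets the graph. Second, with persistent_output_buffer set, forward returns self._output_buffers directly, so successive calls hand back tensors that share storage. A short sketch of that aliasing consequence (trt_mod, x1, x2 are hypothetical stand-ins):

    trt_mod.persistent_output_buffer = True
    out1 = trt_mod(x1)
    out2 = trt_mod(x2)    # reuses the same buffer as out1 while input shapes are unchanged
    saved = out1.clone()  # clone if an earlier result must survive the next call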

py/torch_tensorrt/dynamo/runtime/_WrapperTorchTensorRTModule.py

Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ def __init__(
         # Disable cudagrphs in submodules as it will be enabled in wrapper
         for name, rt_mod in self.original_module.named_children():
             if "_run_on_acc" in name:
-                rt_mod.cudagraphs_disabled = True
+                rt_mod.cudagraphs_parent_module = True
 
         # TODO: check if only torch needs warm up.
         with unset_fake_temporarily():
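
Renaming cudagraphs_disabled to cudagraphs_parent_module makes the intent explicit: the wrapper owns CUDA graph capture, and each _run_on_acc child only captures on its own when no parent has claimed it. A sketch of the effective gating, assuming set_cudagraphs_mode exists alongside the get_cudagraphs_mode helper used in the diff:

    torch_tensorrt.runtime.set_cudagraphs_mode(True)  # assumed toggle for the global mode
    cudagraphs_enabled = (
        torch_tensorrt.runtime.get_cudagraphs_mode()  # same check as in the diff
        and not rt_mod.cudagraphs_parent_module       # False unless a wrapper owns the graph
    )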
