from __future__ import annotations

import logging
+ from contextlib import nullcontext
from tempfile import tempdir
from typing import Any, Dict, List, Optional, Sequence, Tuple

- import nvtx
import tensorrt as trt
import torch
import torch_tensorrt
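Side note on the profiling change: every `nvtx.annotate` range in this file is replaced by `torch.autograd.profiler.record_function` gated on `self.profiling_enabled`, with `contextlib.nullcontext` as the no-op fallback. A minimal runnable sketch of that pattern (the label and body are illustrative, not taken from this file):

```python
from contextlib import nullcontext

import torch


def timed_step(profiling_enabled: bool) -> None:
    # record_function emits a named range into torch profiler traces;
    # nullcontext() keeps the non-profiling hot path free of that overhead.
    with (
        torch.autograd.profiler.record_function("Example:Forward")
        if profiling_enabled
        else nullcontext()
    ):
        pass  # illustrative placeholder for the real work
```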
@@ -107,10 +107,8 @@ def __init__(
        self.weight_name_map = weight_name_map
        self.target_platform = Platform.current_platform()
        # Check if CUDA graph capture is enabled in the parent node
-        self.cudagraphs_parent_module = False
+        self.cudagraphs_enabled_parent_module = False
        self.cudagraphs_enabled = False
-        self.pre_allocated_outputs: List[torch.Tensor] = []
-        self.use_pre_allocated_outputs = False

        if self.serialized_engine is not None and not self.settings.lazy_engine_init:
            self.setup_engine()
@@ -236,42 +234,34 @@ def __del__(self) -> None:
        if self.cudagraph:
            self.cudagraph.reset()

-    def create_output_tensors(self) -> List[torch.Tensor]:
-        # create output tensors
-        outputs: List[torch.Tensor] = []
-
-        for o, _ in enumerate(self.output_names):
-            output = torch.empty(
-                size=self.output_shapes[o],
-                dtype=self.output_dtypes[o],
-                device=torch.cuda.current_device(),
-            )
-            outputs.append(output)
-        return outputs
-
    def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
        # Ensure inputs are available in all scopes and cast symbolic integers to Tensors
        contiguous_inputs: List[torch.Tensor] = [
            (i.contiguous() if isinstance(i, torch.Tensor) else torch.tensor(i).cuda())
            for i in inputs
        ]
-        with nvtx.annotate("Forward", color="red"):
+
+        with (
+            torch.autograd.profiler.record_function("PythonTorchTensorRTModule:Forward")
+            if self.profiling_enabled
+            else nullcontext()
+        ):
            self._check_initialized()
+
            cudagraphs_enabled = (
                torch_tensorrt.runtime.get_cudagraphs_mode()
-                and not self.cudagraphs_parent_module
+                and not self.cudagraphs_enabled_parent_module
            )
-            shape_changed = self.validate_input_shapes(inputs)
-            # Cudagraphs record is required if cudagraphs_enabled is toggled to True regardless of shape change
+            # Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change
            if not self.cudagraphs_enabled and cudagraphs_enabled:
                need_cudagraphs_record = True
            else:
-                need_cudagraphs_record = cudagraphs_enabled and shape_changed
+                need_cudagraphs_record = (
+                    cudagraphs_enabled and not self.cudagraphs_validate_shapes(inputs)
+                )
            self.cudagraphs_enabled = cudagraphs_enabled

            if need_cudagraphs_record:
-                if self.cudagraph:
-                    self.cudagraph.reset()
                self._input_buffers = [None] * len(self.input_names)
                self._output_buffers = [None] * len(self.output_names)
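For readers tracing the control flow: after this hunk, a CUDA graph is (re)recorded either because cudagraphs mode was just switched on, or because it is already on and the cached shape key no longer validates. A condensed sketch of that decision, with hypothetical plain arguments standing in for the module state:

```python
def needs_cudagraphs_record(
    was_enabled: bool, now_enabled: bool, shapes_valid: bool
) -> bool:
    # Newly enabled: always record, regardless of shape changes.
    if not was_enabled and now_enabled:
        return True
    # Still enabled: record only when the cached shape key failed validation.
    return now_enabled and not shapes_valid
```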
@@ -311,7 +301,13 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
                ]
                logger.warning(f"Moved all input Tensors to cuda:{device_id}")

-            with nvtx.annotate("ProcessInputs", color="red"):
+            with (
+                torch.autograd.profiler.record_function(
+                    "PythonTorchTensorRTModule:ProcessInputs"
+                )
+                if self.profiling_enabled
+                else nullcontext()
+            ):
                assert len(contiguous_inputs) == len(
                    self.input_names
                ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}."
@@ -364,32 +360,44 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
                    self.context.set_tensor_address(
                        input_name, contiguous_inputs[i].data_ptr()
                    )
-                if shape_changed:
-                    # Check if input shapes can be inferred.
-                    uninferred_input_names = self.context.infer_shapes()
-                    if uninferred_input_names:
-                        logger.warning(
-                            f"The shapes of the inputs: {uninferred_input_names} cannot be inferred and could lead to undefined behavior. \
-                            This could happen if the input tensor addresses/shapes haven't been configured correctly"
-                        )

-            with nvtx.annotate("ProcessOutputs:1", color="red"):
-                if not self.use_pre_allocated_outputs or shape_changed:
-                    self.output_shapes = [
-                        tuple(self.context.get_tensor_shape(output_name))
-                        for output_name in self.output_names
-                    ]
-                    if DYNAMIC_DIM in self.output_shapes:
+                # Check if input shapes can be inferred.
+                uninferred_input_names = self.context.infer_shapes()
+                if uninferred_input_names:
+                    logger.warning(
+                        f"The shapes of the inputs: {uninferred_input_names} cannot be inferred and could lead to undefined behavior. \
+                        This could happen if the input tensor addresses/shapes haven't been configured correctly"
+                    )
+
+            with (
+                torch.autograd.profiler.record_function(
+                    "PythonTorchTensorRTModule:ProcessOutputs"
+                )
+                if self.profiling_enabled
+                else nullcontext()
+            ):
+                # create output tensors
+                outputs: List[torch.Tensor] = []
+
+                for o, output_name in enumerate(self.output_names):
+                    shape = tuple(self.context.get_tensor_shape(output_name))
+
+                    if DYNAMIC_DIM in shape:
                        raise ValueError(
                            "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                        )
-                    outputs = self.create_output_tensors()
-                else:
-                    outputs = self.pre_allocated_outputs

-                for o, output_name in enumerate(self.output_names):
+                    output = torch.empty(
+                        size=shape,
+                        dtype=self.output_dtypes[o],
+                        device=torch.cuda.current_device(),
+                    )
+
+                    outputs.append(output)
+
                    if need_cudagraphs_record:
                        self._output_buffers[o] = outputs[o].clone()
+
                    if cudagraphs_enabled:
                        self.context.set_tensor_address(
                            output_name, self._output_buffers[o].data_ptr()
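The output loop above allocates each output from the shape TensorRT reports and binds its address into the execution context. A self-contained sketch of that binding step (`bind_outputs` is a hypothetical helper; `context` is a `tensorrt.IExecutionContext`, and the dtype list follows this module's convention):

```python
from typing import List, Sequence

import torch


def bind_outputs(context, output_names: Sequence[str], output_dtypes) -> List[torch.Tensor]:
    outputs = []
    for o, name in enumerate(output_names):
        # The shape is only fully resolved once the input shapes are set.
        shape = tuple(context.get_tensor_shape(name))
        out = torch.empty(size=shape, dtype=output_dtypes[o], device="cuda")
        # TensorRT writes results to whatever address is bound for this tensor.
        context.set_tensor_address(name, out.data_ptr())
        outputs.append(out)
    return outputs
```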
@@ -399,35 +407,37 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
                            output_name, outputs[o].data_ptr()
                        )

-            with nvtx.annotate("TensorRTRuntime", color="red"):
+            with (
+                torch.autograd.profiler.record_function(
+                    "PythonTorchTensorRTModule:TensorRTRuntime"
+                )
+                if self.profiling_enabled
+                else nullcontext()
+            ):
                self._caller_stream = torch.cuda.current_stream()
                if (
                    self._engine_stream == torch.cuda.default_stream()
                    or self._engine_stream is None
                ):
                    self._engine_stream = torch.cuda.Stream()

-                with nvtx.annotate("wait_stream", color="green"):
-                    self._engine_stream.wait_stream(self._caller_stream)
+                self._engine_stream.wait_stream(self._caller_stream)

                with torch.cuda.stream(self._engine_stream):
+
                    if cudagraphs_enabled:
                        if need_cudagraphs_record:
-                            with nvtx.annotate("CUDAGraph", color="green"):
-                                self.cudagraph = torch.cuda.CUDAGraph()
+                            self.cudagraph = torch.cuda.CUDAGraph()

                            if self.profiling_enabled:
                                self.cudagraph.enable_debug_mode()
-                            with nvtx.annotate("torch.cuda.graph", color="green"):
-                                with torch.cuda.graph(
-                                    self.cudagraph, stream=self._engine_stream
-                                ):
-                                    with nvtx.annotate(
-                                        "execute_async_v3", color="green"
-                                    ):
-                                        self.context.execute_async_v3(
-                                            self._engine_stream.cuda_stream
-                                        )
+
+                            with torch.cuda.graph(
+                                self.cudagraph, stream=self._engine_stream
+                            ):
+                                self.context.execute_async_v3(
+                                    self._engine_stream.cuda_stream
+                                )

                        if self.profiling_enabled:
                            import tempfile
@@ -436,18 +446,14 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
                                self.cudagraph.debug_dump(
                                    f"{tempdir}/{self.name}_cudagraph.dot"
                                )
-                        with nvtx.annotate("replay", color="green"):
-                            self.cudagraph.replay()  # type: ignore
+
+                        self.cudagraph.replay()  # type: ignore

                    else:
                        self.context.execute_async_v3(self._engine_stream.cuda_stream)

                self._caller_stream.wait_stream(self._engine_stream)

-            if self.use_pre_allocated_outputs:
-                with nvtx.annotate("ProcessOutputs:2", color="red"):
-                    self.pre_allocated_outputs = self.create_output_tensors()
-
            if cudagraphs_enabled:
                for idx, o in enumerate(outputs):
                    o.copy_(self._output_buffers[idx])
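For context on the capture path retained above: `torch.cuda.CUDAGraph` records the kernels launched inside `torch.cuda.graph(...)` and later replays them against the same buffer addresses, which is why inputs and outputs are staged through `_input_buffers`/`_output_buffers` and copied out after replay. A generic capture/replay sketch, independent of TensorRT (`work` is any callable that launches CUDA kernels on tensors with stable addresses):

```python
import torch


def capture_and_replay(work) -> None:
    side_stream = torch.cuda.Stream()
    side_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side_stream):
        work()  # warm-up run off the default stream, as capture requires
    torch.cuda.current_stream().wait_stream(side_stream)

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph, stream=side_stream):
        work()  # recorded into the graph, not executed eagerly
    graph.replay()  # re-launches the recorded kernels
```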
@@ -489,9 +495,10 @@ def get_layer_info(self) -> str:
        )
        return engine_json

-    def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
+    def cudagraphs_validate_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
        """
-        Validates the input shapes of the forward function has changed
+        Validates the input shapes of the forward function
+        versus the version currently active for the CUDAGraph
        """
        # Representation of input shapes to a given model
        # Shapes are concatenated as so:
@@ -501,8 +508,10 @@ def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
        # If the new shape key differs from the existing one,
        # invalidate the old shape key and remove the CUDAGraph
        if new_shape_key != self.shape_key:
-            logger.debug(f"Input shape changed {self.shape_key} -> {new_shape_key}")
+            logger.debug(f"Resetting Cudagraph on new shape key {new_shape_key}")
            self.shape_key = new_shape_key
-            return True
+            if self.cudagraph:
+                self.cudagraph.reset()
+            return False

-        return False
+        return True
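Net effect of this last hunk: the renamed `cudagraphs_validate_shapes` now resets the graph itself on a mismatch and inverts the return convention, returning `True` when the cached shapes are still valid. The shape key it compares against is a string concatenation of the input shapes; a sketch of one plausible construction, assumed here since the hunk does not show it:

```python
from typing import Sequence

import torch


def make_shape_key(inputs: Sequence[torch.Tensor]) -> str:
    # e.g. two inputs of shapes (1, 3, 224, 224) and (1,) yield
    # "(1,3,224,224)(1,)" -- any shape change produces a new key.
    return "".join(str(tuple(t.shape)).replace(" ", "") for t in inputs)
```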