pytorch
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
Lines changed: 9 additions & 0 deletions
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
Lines changed: 23 additions & 41 deletions
@@ -34,6 +34,9 @@
     post_lowering,
     pre_export_lowering,
 )
+from torch_tensorrt.dynamo.runtime._WrapperTorchTensorRTModule import (
+    WrapperTorchTensorRTModule,
+)
 from torch_tensorrt.dynamo.utils import (
     get_flat_args_with_check,
     parse_graph_io,
@@ -516,6 +519,12 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
 
     dryrun_stats_display(dryrun_tracker, settings.dryrun)
 
+    if len(trt_modules) > 1:
+        # Capture/replay a series of CUDA operations in subgraphs in a wrapped runtime module.
+        partitioned_module = WrapperTorchTensorRTModule(
+            partitioned_module, dryrun_tracker.output_dtypes
+        )
+
     return partitioned_module
 
 
 
@@ -5,6 +5,7 @@
 from tempfile import tempdir
 from typing import Any, Dict, List, Optional, Sequence, Tuple
 
+import nvtx
 import tensorrt as trt
 import torch
 import torch_tensorrt
@@ -78,7 +79,6 @@ def __init__(
         self.cudagraph: Optional[torch.cuda.CUDAGraph] = None
         self._caller_stream: Optional[torch.cuda.Stream] = None
         self._engine_stream: Optional[torch.cuda.Stream] = None
-
         # TODO: Make the below a Dictionary {shape: cudagraph}
         self.shape_key: Optional[str] = None
 
@@ -107,6 +107,7 @@ def __init__(
         self.engine = None
         self.weight_name_map = weight_name_map
         self.target_platform = Platform.current_platform()
+        self.cudagraphs_disabled = False
 
         if self.serialized_engine is not None and not self.settings.lazy_engine_init:
             self.setup_engine()
@@ -238,15 +239,10 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
             (i.contiguous() if isinstance(i, torch.Tensor) else torch.tensor(i).cuda())
             for i in inputs
         ]
-
-        with (
-            torch.autograd.profiler.record_function("PythonTorchTensorRTModule:Forward")
-            if self.profiling_enabled
-            else nullcontext()
-        ):
+        with nvtx.annotate(f"Forward", color="red"):
             self._check_initialized()
+            cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode() and not self.cudagraphs_disabled
 
-            cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
             need_cudagraphs_record = (
                 cudagraphs_enabled and not self.cudagraphs_validate_shapes(inputs)
             )
@@ -291,13 +287,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
                     ]
                     logger.warning(f"Moved all input Tensors to cuda:{device_id}")
 
-            with (
-                torch.autograd.profiler.record_function(
-                    "PythonTorchTensorRTModule:ProcessInputs"
-                )
-                if self.profiling_enabled
-                else nullcontext()
-            ):
+            with nvtx.annotate(f"ProcessInputs", color="red"):
                 assert len(contiguous_inputs) == len(
                     self.input_names
                 ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}."
@@ -359,13 +349,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
                                 This could happen if the input tensor addresses/shapes haven't been configured correctly"
                     )
 
-            with (
-                torch.autograd.profiler.record_function(
-                    "PythonTorchTensorRTModule:ProcessOutputs"
-                )
-                if self.profiling_enabled
-                else nullcontext()
-            ):
+            with nvtx.annotate(f"ProcessOutputs", color="red"):
                 # create output tensors
                 outputs: List[torch.Tensor] = []
 
@@ -397,37 +381,35 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
                             output_name, outputs[o].data_ptr()
                         )
 
-            with (
-                torch.autograd.profiler.record_function(
-                    "PythonTorchTensorRTModule:TensorRTRuntime"
-                )
-                if self.profiling_enabled
-                else nullcontext()
-            ):
+            with nvtx.annotate(f"TensorRTRuntime", color="red"):
                 self._caller_stream = torch.cuda.current_stream()
                 if (
                     self._engine_stream == torch.cuda.default_stream()
                     or self._engine_stream is None
                 ):
                     self._engine_stream = torch.cuda.Stream()
 
-                self._engine_stream.wait_stream(self._caller_stream)
+                with nvtx.annotate(f"wait_stream", color="green"):
+                    self._engine_stream.wait_stream(self._caller_stream)
 
                 with torch.cuda.stream(self._engine_stream):
-
                     if cudagraphs_enabled:
                         if need_cudagraphs_record:
-                            self.cudagraph = torch.cuda.CUDAGraph()
+                            with nvtx.annotate(f"CUDAGraph", color="green"):
+                                self.cudagraph = torch.cuda.CUDAGraph()
 
                             if self.profiling_enabled:
                                 self.cudagraph.enable_debug_mode()
-
-                            with torch.cuda.graph(
-                                self.cudagraph, stream=self._engine_stream
-                            ):
-                                self.context.execute_async_v3(
-                                    self._engine_stream.cuda_stream
-                                )
+                            with nvtx.annotate(f"torch.cuda.graph", color="green"):
+                                with torch.cuda.graph(
+                                    self.cudagraph, stream=self._engine_stream
+                                ):
+                                    with nvtx.annotate(
+                                        f"execute_async_v3", color="green"
+                                    ):
+                                        self.context.execute_async_v3(
+                                            self._engine_stream.cuda_stream
+                                        )
 
                             if self.profiling_enabled:
                                 import tempfile
@@ -436,8 +418,8 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
                                     self.cudagraph.debug_dump(
                                         f"{tempdir}/{self.name}_cudagraph.dot"
                                     )
-
-                        self.cudagraph.replay()  # type: ignore
+                        with nvtx.annotate(f"replay", color="green"):
+                            self.cudagraph.replay()  # type: ignore
 
                     else:
                         self.context.execute_async_v3(self._engine_stream.cuda_stream)