chore: infer output shape from compiled module

keehyuna · keehyuna · commit 7e22f61aabb8 · 2024-11-21T16:16:18.000+09:00
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -838,7 +838,6 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
     if len(dryrun_tracker.to_run_in_torch) > 0:
         # Capture/replay a series of CUDA operations in subgraphs in a wrapped runtime module.
         partitioned_module = WrapperTorchTensorRTModule(
-            gm,
             partitioned_module,
             dryrun_tracker.output_shapes,
             dryrun_tracker.output_dtypes,
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -152,6 +152,11 @@ def set_default_device_memory_budget(self) -> int:
         return self._set_device_memory_budget(budget_bytes)
 
     def set_whole_cudagraphs(self, enable: bool) -> None:
+        """
+        When the global CUDA graphs mode is enabled, the parent wrapper module handles all
+        CUDA graph recording and replay. Therefore, any child modules must disable their
+        own CUDA graph functionality to avoid conflicts.
+        """
         self.whole_cudagraphs = enable
 
     def setup_engine(self) -> None:
@@ -245,7 +250,8 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
             (i.contiguous() if isinstance(i, torch.Tensor) else torch.tensor(i).cuda())
             for i in inputs
         ]
-
+        # TODO: calculate the output shapes under FakeTensorMode,
+        # e.g. by obtaining the active mode via detect_fake_mode(*inputs)
         with (
             torch.autograd.profiler.record_function("PythonTorchTensorRTModule:Forward")
             if self.profiling_enabled
diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
@@ -131,7 +131,6 @@ def __init__(
         self.weight_name_map = weight_name_map
         self.serialized_engine = serialized_engine
         self.engine = None
-        self.cudagraphs_enabled_parent_module = False
 
         if (
             serialized_engine
@@ -197,6 +196,11 @@ def set_device_memory_budget(self, budget_bytes: int) -> int:
         return budget_bytes
 
     def set_whole_cudagraphs(self, enable: bool) -> None:
+        """
+        When the global CUDA graphs mode is enabled, the parent wrapper module handles all
+        CUDA graph recording and replay. Therefore, any child modules must disable their
+        own CUDA graph functionality to avoid conflicts.
+        """
         self.engine.set_whole_cudagraphs(enable)
 
     def setup_engine(self) -> None:
diff --git a/py/torch_tensorrt/dynamo/runtime/_WrapperTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_WrapperTorchTensorRTModule.py
@@ -31,13 +31,11 @@ class WrapperTorchTensorRTModule(torch.nn.Module):  # type: ignore[misc]
 
     def __init__(
         self,
-        original_module: torch.nn.Module,
         compiled_module: torch.nn.Module,
         output_shapes: List[torch.Size],
         output_dtypes: List[torch.dtype],
     ):
         super(WrapperTorchTensorRTModule, self).__init__()
-        self.original_module = original_module
         self.compiled_module = compiled_module
         self.inputs = partitioning.construct_submodule_inputs(compiled_module)
         self.output_shapes = output_shapes
@@ -48,7 +46,7 @@ def __init__(
         self.cudagraph: Optional[torch.cuda.CUDAGraph] = None
         self.shape_key: Optional[str] = None
         self.profiling_enabled = False
-        self.cudagraphs_enabled = False
+        self.prev_cudagraphs_enabled = False
         self._caller_stream: Optional[torch.cuda.Stream] = None
         self._engine_stream: Optional[torch.cuda.Stream] = None
         self.input_is_dynamic = input_is_dynamic(self.inputs)
@@ -57,20 +55,27 @@ def __init__(
         for name, rt_mod in self.compiled_module.named_children():
             if "_run_on_acc" in name:
                 rt_mod.set_whole_cudagraphs(True)
+        self.warm_up()
 
-        # Warm up is necessary to ensure that memory allocations and initializations are not recorded in cuda graphs
-        with unset_fake_temporarily():
-            inputs_tensor = [spec.torch_tensor.cuda() for spec in self.inputs]
-            s = torch.cuda.Stream()
-            s.wait_stream(torch.cuda.current_stream())
-            with torch.cuda.stream(s):
-                for _ in range(3):
-                    self.compiled_module(*inputs_tensor)
-            torch.cuda.current_stream().wait_stream(s)
+    def warm_up(self) -> None:
+        """
+        Run the compiled module three times on a side CUDA stream so that memory
+        allocations and initializations are not recorded in CUDA graphs.
+        """
+        with torch_tensorrt.logging.errors():
+            with unset_fake_temporarily():
+                inputs_tensor = [spec.torch_tensor.cuda() for spec in self.inputs]
+                s = torch.cuda.Stream()
+                s.wait_stream(torch.cuda.current_stream())
+                with torch.cuda.stream(s):
+                    for _ in range(3):
+                        self.compiled_module(*inputs_tensor)
+                torch.cuda.current_stream().wait_stream(s)
 
     def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
         """
         Validates the input shapes of the forward function has changed
+        Also re-infers the output shapes when a dynamic input shape has changed.
         """
         # Representation of input shapes to a given model
         # Shapes are concatenated as so:
@@ -83,13 +88,12 @@ def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
             self.shape_key = new_shape_key

             if self.input_is_dynamic:
-                with FakeTensorMode() as mode:
-                    fake_inputs = [mode.from_tensor(input) for input in inputs]
-                    tmp_outputs = self.original_module(*fake_inputs)
+                with FakeTensorMode(allow_non_fake_inputs=True):
+                    tmp_outputs = self.compiled_module(*inputs)
                 if not isinstance(tmp_outputs, (list, tuple)):
                     tmp_outputs = [tmp_outputs]
                 self.output_shapes = [tuple(output.shape) for output in tmp_outputs]

             return True

         return False
@@ -114,11 +118,10 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
             shape_changed = self.validate_input_shapes(inputs)
             cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
             # Cudagraphs record is required if cudagraphs_enabled is toggled to True regardless of shape change
-            if not self.cudagraphs_enabled and cudagraphs_enabled:
-                need_cudagraphs_record = True
-            else:
-                need_cudagraphs_record = cudagraphs_enabled and shape_changed
-            self.cudagraphs_enabled = cudagraphs_enabled
+            need_cudagraphs_record = cudagraphs_enabled and (
+                (not self.prev_cudagraphs_enabled) or shape_changed
+            )
+            self.prev_cudagraphs_enabled = cudagraphs_enabled
 
             if need_cudagraphs_record:
                 if self.cudagraph:
@@ -282,4 +285,5 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
 
                 return outputs
             else:
+
                 return outputs
diff --git a/py/torch_tensorrt/dynamo/runtime/register_fake_class.py b/py/torch_tensorrt/dynamo/runtime/register_fake_class.py
@@ -3,6 +3,7 @@
 from typing import Any, List
 
 import torch
+from torch._library.fake_class_registry import FakeScriptObject
 from torch_tensorrt.dynamo.utils import input_is_dynamic, unwrap_tensor_shape
 
 
@@ -26,7 +27,12 @@ def fake_tensorrt_execute_engine(
         modes = ["opt"]
 
     # Get the TRTEngine class and infer output shapes based on input shapes
-    trt_engine = fake_trt_engine.wrapped_obj.engine
+    # If fake_trt_engine is not a FakeScriptObject, assume it is the real engine object
+    if isinstance(fake_trt_engine, FakeScriptObject):
+        trt_engine = fake_trt_engine.wrapped_obj.engine
+    else:
+        trt_engine = fake_trt_engine
+
     outputs_mode_dict = defaultdict(list)
     for mode in modes:
         input_shapes = [unwrap_tensor_shape(input, mode=mode) for input in inputs]
@@ -125,5 +131,8 @@ def automatic_device_memory_budget_getter(self) -> Any:
     def infer_outputs(self, input_shapes: List[Any]) -> Any:
         pass
 
+    def set_whole_cudagraphs(self, enable: bool) -> Any:
+        pass
+
     def __setstate__(self, serialized_state: List[str]) -> Any:
         pass