Skip to content

Commit 7585fc8

Browse files
committed
chore: added missing test
1 parent ce063a2 commit 7585fc8

File tree

11 files changed

+251
-35
lines changed

11 files changed

+251
-35
lines changed

core/runtime/TRTEngine.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -213,8 +213,8 @@ TRTEngine::TRTEngine(
213213
LOG_DEBUG(*this);
214214
}
215215

216-
void TRTEngine::set_cudagraphs_enabled_parent_module(bool enable) {
217-
cudagraphs_enabled_parent_module = enable;
216+
void TRTEngine::set_whole_cudagraphs(bool enable) {
217+
whole_cudagraphs = enable;
218218
}
219219

220220
TRTEngine::~TRTEngine() {

core/runtime/TRTEngine.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ struct TRTEngine : torch::CustomClassHolder {
8787
bool set_device_memory_budget(int64_t budget);
8888
int64_t get_streamable_device_memory_budget();
8989
int64_t get_automatic_device_memory_budget();
90-
void set_cudagraphs_enabled_parent_module(bool enable);
90+
void set_whole_cudagraphs(bool enable);
9191
std::vector<at::Tensor> infer_outputs(std::vector<std::vector<int64_t>> input_shapes);
9292
friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
9393
static const char BINDING_DELIM = '%';
@@ -103,8 +103,8 @@ struct TRTEngine : torch::CustomClassHolder {
103103
std::vector<at::Tensor> input_buffers = {};
104104
std::vector<at::Tensor> output_buffers = {};
105105
std::string shape_key;
106-
bool cudagraphs_enabled = false;
107-
bool cudagraphs_enabled_parent_module = false;
106+
bool prev_cudagraphs_enabled = false;
107+
bool whole_cudagraphs = false;
108108
// TODO: Implement a call method
109109
// c10::List<at::Tensor> Run(c10::List<at::Tensor> inputs);
110110

core/runtime/execute_engine.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,14 +113,14 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
113113
LOG_INFO("" << log_info);
114114
compiled_engine->cudagraph.enable_debug_mode();
115115
}
116-
bool cudagraphs_enabled = (!compiled_engine->cudagraphs_enabled_parent_module && CUDAGRAPHS_MODE);
116+
bool cudagraphs_enabled = (!compiled_engine->whole_cudagraphs && CUDAGRAPHS_MODE);
117117

118118
// Whether cudagraphs needs to record the graph on this pass
119119
// Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change
120-
bool need_cudagraphs_record =
121-
(((!compiled_engine->cudagraphs_enabled) && cudagraphs_enabled) ||
122-
(cudagraphs_enabled && (!_cudagraphs_validate_shapes(inputs, compiled_engine))));
123-
compiled_engine->cudagraphs_enabled = cudagraphs_enabled;
120+
bool need_cudagraphs_record = cudagraphs_enabled &&
121+
((!compiled_engine->prev_cudagraphs_enabled) || (!_cudagraphs_validate_shapes(inputs, compiled_engine)));
122+
123+
compiled_engine->prev_cudagraphs_enabled = cudagraphs_enabled;
124124

125125
if (!cudagraphs_enabled) {
126126
compiled_engine->cudagraph.reset();

core/runtime/register_jit_hooks.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
8787
.def("dump_engine_layer_info_to_file", &TRTEngine::dump_engine_layer_info_to_file)
8888
.def("dump_engine_layer_info", &TRTEngine::dump_engine_layer_info)
8989
.def("get_engine_layer_info", &TRTEngine::get_engine_layer_info)
90-
.def("set_cudagraphs_enabled_parent_module", &TRTEngine::set_cudagraphs_enabled_parent_module)
90+
.def("set_whole_cudagraphs", &TRTEngine::set_whole_cudagraphs)
9191
.def("infer_outputs", &TRTEngine::infer_outputs)
9292
.def_property(
9393
"device_memory_budget",

py/torch_tensorrt/_compile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -598,7 +598,7 @@ def save(
598598
This flag is experimental for now.
599599
"""
600600
if isinstance(module, WrapperTorchTensorRTModule):
601-
module = module.original_module
601+
module = module.compiled_module
602602
module_type = _parse_module_type(module)
603603
accepted_formats = {"exported_program", "torchscript"}
604604
if arg_inputs is not None and not all(

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -838,6 +838,7 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
838838
if len(dryrun_tracker.to_run_in_torch) > 0:
839839
# Capture/replay a series of CUDA operations in subgraphs in a wrapped runtime module.
840840
partitioned_module = WrapperTorchTensorRTModule(
841+
gm,
841842
partitioned_module,
842843
dryrun_tracker.output_shapes,
843844
dryrun_tracker.output_dtypes,

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,9 @@ def __init__(
108108
self.weight_name_map = weight_name_map
109109
self.target_platform = Platform.current_platform()
110110
# Check if CUDA graph capture is enabled in the parent node
111-
self.cudagraphs_enabled_parent_module = False
112-
self.cudagraphs_enabled = False
111+
self.whole_cudagraphs = False
112+
# Previous cuda graphs state
113+
self.prev_cudagraphs_enabled = False
113114

114115
if self.serialized_engine is not None and not self.settings.lazy_engine_init:
115116
self.setup_engine()
@@ -150,8 +151,8 @@ def set_default_device_memory_budget(self) -> int:
150151
logger.debug(f"Weight streaming budget set to {budget_bytes}B")
151152
return self._set_device_memory_budget(budget_bytes)
152153

153-
def set_cudagraphs_enabled_parent_module(self, enable: bool) -> None:
154-
self.cudagraphs_enabled_parent_module = enable
154+
def set_whole_cudagraphs(self, enable: bool) -> None:
155+
self.whole_cudagraphs = enable
155156

156157
def setup_engine(self) -> None:
157158
assert (
@@ -254,16 +255,14 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
254255

255256
cudagraphs_enabled = (
256257
torch_tensorrt.runtime.get_cudagraphs_mode()
257-
and not self.cudagraphs_enabled_parent_module
258+
and not self.whole_cudagraphs
258259
)
259260
# Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change
260-
if not self.cudagraphs_enabled and cudagraphs_enabled:
261-
need_cudagraphs_record = True
262-
else:
263-
need_cudagraphs_record = (
264-
cudagraphs_enabled and not self.cudagraphs_validate_shapes(inputs)
265-
)
266-
self.cudagraphs_enabled = cudagraphs_enabled
261+
need_cudagraphs_record = cudagraphs_enabled and (
262+
(not self.prev_cudagraphs_enabled)
263+
or (not self.cudagraphs_validate_shapes(inputs))
264+
)
265+
self.prev_cudagraphs_enabled = cudagraphs_enabled
267266

268267
if need_cudagraphs_record:
269268
self._input_buffers = [None] * len(self.input_names)

py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -196,8 +196,8 @@ def set_device_memory_budget(self, budget_bytes: int) -> int:
196196

197197
return budget_bytes
198198

199-
def set_cudagraphs_enabled_parent_module(self, enable: bool) -> None:
200-
self.engine.set_cudagraphs_enabled_parent_module(enable)
199+
def set_whole_cudagraphs(self, enable: bool) -> None:
200+
self.engine.set_whole_cudagraphs(enable)
201201

202202
def setup_engine(self) -> None:
203203
"""

py/torch_tensorrt/dynamo/runtime/_WrapperTorchTensorRTModule.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import torch
99
import torch_tensorrt
10+
from torch._subclasses.fake_tensor import FakeTensorMode
1011
from torch.fx.experimental.proxy_tensor import unset_fake_temporarily
1112
from torch_tensorrt.dynamo import partitioning
1213
from torch_tensorrt.dynamo.conversion import DYNAMIC_DIM
@@ -17,17 +18,28 @@
1718

1819

1920
class WrapperTorchTensorRTModule(torch.nn.Module): # type: ignore[misc]
20-
"""This Wrapper runtime module to record/replay cuda graph in sub modules"""
21+
"""This Wrapper runtime module is to record/replay whole cuda graph in sub modules
22+
23+
Args:
24+
original_module: Unmodified FX GraphModule
25+
compiled_module: Compiled FX GraphModule that will be wrapped
26+
output_shapes: Shapes of output Tensors of the graph
27+
output_dtypes: Output data types of the graph
28+
Returns:
29+
Output tensor or tensor list
30+
"""
2131

2232
def __init__(
2333
self,
2434
original_module: torch.nn.Module,
35+
compiled_module: torch.nn.Module,
2536
output_shapes: List[torch.Size],
2637
output_dtypes: List[torch.dtype],
2738
):
2839
super(WrapperTorchTensorRTModule, self).__init__()
2940
self.original_module = original_module
30-
self.inputs = partitioning.construct_submodule_inputs(original_module)
41+
self.compiled_module = compiled_module
42+
self.inputs = partitioning.construct_submodule_inputs(compiled_module)
3143
self.output_shapes = output_shapes
3244
self.output_dtypes = output_dtypes
3345

@@ -42,9 +54,9 @@ def __init__(
4254
self.input_is_dynamic = input_is_dynamic(self.inputs)
4355

4456
# Disable cudagraphs in submodules as it will be enabled in wrapper
45-
for name, rt_mod in self.original_module.named_children():
57+
for name, rt_mod in self.compiled_module.named_children():
4658
if "_run_on_acc" in name:
47-
rt_mod.set_cudagraphs_enabled_parent_module(True)
59+
rt_mod.set_whole_cudagraphs(True)
4860

4961
# Warm up is necessary to ensure that memory allocations and initializations are not recorded in cuda graphs
5062
with unset_fake_temporarily():
@@ -53,7 +65,7 @@ def __init__(
5365
s.wait_stream(torch.cuda.current_stream())
5466
with torch.cuda.stream(s):
5567
for _ in range(3):
56-
self.original_module(*inputs_tensor)
68+
self.compiled_module(*inputs_tensor)
5769
torch.cuda.current_stream().wait_stream(s)
5870

5971
def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
@@ -71,7 +83,9 @@ def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
7183
self.shape_key = new_shape_key
7284

7385
if self.input_is_dynamic:
74-
tmp_outputs = self.original_module(*inputs)
86+
with FakeTensorMode() as mode:
87+
fake_inputs = [mode.from_tensor(input) for input in inputs]
88+
tmp_outputs = self.original_module(*fake_inputs)
7589
if not isinstance(tmp_outputs, (list, tuple)):
7690
tmp_outputs = [tmp_outputs]
7791
self.output_shapes = [tuple(output.shape) for output in tmp_outputs]
@@ -237,7 +251,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
237251
with torch.cuda.graph(
238252
self.cudagraph, stream=self._engine_stream
239253
):
240-
self._output_buffers = self.original_module(
254+
self._output_buffers = self.compiled_module(
241255
*self._input_buffers
242256
)
243257

@@ -251,7 +265,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
251265
self.cudagraph.replay() # type: ignore
252266

253267
else:
254-
outputs = self.original_module(*inputs)
268+
outputs = self.compiled_module(*inputs)
255269

256270
self._caller_stream.wait_stream(self._engine_stream)
257271

py/torch_tensorrt/runtime/_weight_streaming.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def __init__(
2222
self.current_device_budget = 0
2323

2424
if isinstance(module, WrapperTorchTensorRTModule):
25-
module = module.original_module
25+
module = module.compiled_module
2626
for name, rt_mod in module.named_children():
2727
if "_run_on_acc" in name and isinstance(
2828
rt_mod, (PythonTorchTensorRTModule, TorchTensorRTModule)

0 commit comments

Comments
 (0)