Skip to content

Commit ecee5a6

Browse files
committed
chore: update for TorchTensorRTModule
1 parent 6af0886 commit ecee5a6

File tree

7 files changed

+28
-9
lines changed

7 files changed

+28
-9
lines changed

core/runtime/TRTEngine.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,10 @@ TRTEngine::TRTEngine(
212212
LOG_DEBUG(*this);
213213
}
214214

215+
void TRTEngine::set_cudagraphs_enabled_parent_module(bool enable) {
216+
cudagraphs_enabled_parent_module = enable;
217+
}
218+
215219
TRTEngine::~TRTEngine() {
216220
trt_engine_profiler.reset();
217221
exec_ctx.reset();

core/runtime/TRTEngine.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ struct TRTEngine : torch::CustomClassHolder {
7575
bool set_device_memory_budget(int64_t budget);
7676
int64_t get_streamable_device_memory_budget();
7777
int64_t get_automatic_device_memory_budget();
78+
void set_cudagraphs_enabled_parent_module(bool enable);
7879
friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
7980
static const char BINDING_DELIM = '%';
8081

@@ -85,7 +86,8 @@ struct TRTEngine : torch::CustomClassHolder {
8586
std::vector<at::Tensor> input_buffers = {};
8687
std::vector<at::Tensor> output_buffers = {};
8788
std::string shape_key;
88-
89+
bool cudagraphs_enabled = false;
90+
bool cudagraphs_enabled_parent_module = false;
8991
// TODO: Implement a call method
9092
// c10::List<at::Tensor> Run(c10::List<at::Tensor> inputs);
9193

core/runtime/execute_engine.cpp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -113,11 +113,16 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
113113
LOG_INFO("" << log_info);
114114
compiled_engine->cudagraph.enable_debug_mode();
115115
}
116+
bool cudagraphs_enabled = (!compiled_engine->cudagraphs_enabled_parent_module && CUDAGRAPHS_MODE);
116117

117118
// Whether cudagraphs needs to record the graph on this pass
118-
bool need_cudagraphs_record = (CUDAGRAPHS_MODE && (!_cudagraphs_validate_shapes(inputs, compiled_engine)));
119+
// Cudagraphs record is required if cudagraphs_enabled is switched to True regardless of shape change
120+
bool need_cudagraphs_record =
121+
(((!compiled_engine->cudagraphs_enabled) && cudagraphs_enabled) ||
122+
(cudagraphs_enabled && (!_cudagraphs_validate_shapes(inputs, compiled_engine))));
123+
compiled_engine->cudagraphs_enabled = cudagraphs_enabled;
119124

120-
if (!CUDAGRAPHS_MODE) {
125+
if (!cudagraphs_enabled) {
121126
compiled_engine->cudagraph.reset();
122127
}
123128

@@ -211,7 +216,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
211216
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
212217
"Error while setting the tensor address for shape inputs");
213218

214-
if (CUDAGRAPHS_MODE) {
219+
if (cudagraphs_enabled) {
215220
// @peri044 I don't know if this makes sense since they are supposed to be GPU buffers
216221
compiled_engine->input_buffers[i] = input_cpu;
217222
}
@@ -231,7 +236,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
231236
TORCHTRT_CHECK(
232237
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
233238

234-
if (CUDAGRAPHS_MODE) {
239+
if (cudagraphs_enabled) {
235240
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
236241
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
237242
TORCHTRT_CHECK(
@@ -281,7 +286,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
281286
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
282287
}
283288

284-
if (CUDAGRAPHS_MODE) {
289+
if (cudagraphs_enabled) {
285290
TORCHTRT_CHECK(
286291
compiled_engine->exec_ctx->setTensorAddress(
287292
name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
@@ -324,7 +329,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
324329
caller_exec_complete.record(compiled_engine->caller_stream);
325330
caller_exec_complete.block(compiled_engine->engine_stream);
326331

327-
if (!CUDAGRAPHS_MODE) {
332+
if (!cudagraphs_enabled) {
328333
// Direct execution uses the caller buffers directly
329334
compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
330335
} else {
@@ -350,7 +355,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
350355
trt_exec_complete.record(compiled_engine->engine_stream);
351356
trt_exec_complete.block(compiled_engine->caller_stream);
352357

353-
if (CUDAGRAPHS_MODE) {
358+
if (cudagraphs_enabled) {
354359
// If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
355360
for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
356361
outputs[o].copy_(compiled_engine->output_buffers[o], false);

core/runtime/register_jit_hooks.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
8686
.def("dump_engine_layer_info_to_file", &TRTEngine::dump_engine_layer_info_to_file)
8787
.def("dump_engine_layer_info", &TRTEngine::dump_engine_layer_info)
8888
.def("get_engine_layer_info", &TRTEngine::get_engine_layer_info)
89+
.def("set_cudagraphs_enabled_parent_module", &TRTEngine::set_cudagraphs_enabled_parent_module)
8990
.def_property(
9091
"device_memory_budget",
9192
&TRTEngine::get_device_memory_budget,

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,9 @@ def set_default_device_memory_budget(self) -> int:
149149
logger.debug(f"Weight streaming budget set to {budget_bytes}B")
150150
return self._set_device_memory_budget(budget_bytes)
151151

152+
def set_cudagraphs_enabled_parent_module(self, enable: bool) -> None:
153+
self.cudagraphs_enabled_parent_module = enable
154+
152155
def setup_engine(self) -> None:
153156
assert (
154157
self.target_platform == Platform.current_platform()

py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ def __init__(
131131
self.weight_name_map = weight_name_map
132132
self.serialized_engine = serialized_engine
133133
self.engine = None
134+
self.cudagraphs_enabled_parent_module = False
134135

135136
if serialized_engine and not self.settings.lazy_engine_init:
136137
self.setup_engine()
@@ -191,6 +192,9 @@ def set_device_memory_budget(self, budget_bytes: int) -> int:
191192

192193
return budget_bytes
193194

195+
def set_cudagraphs_enabled_parent_module(self, enable: bool) -> None:
196+
self.engine.set_cudagraphs_enabled_parent_module(enable)
197+
194198
def setup_engine(self) -> None:
195199
"""
196200
Setup engine for a module which has deferred engine setup.

py/torch_tensorrt/dynamo/runtime/_WrapperTorchTensorRTModule.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def __init__(
4141
# Disable cudagraphs in submodules as it will be enabled in wrapper
4242
for name, rt_mod in self.original_module.named_children():
4343
if "_run_on_acc" in name:
44-
rt_mod.cudagraphs_enabled_parent_module = True
44+
rt_mod.set_cudagraphs_enabled_parent_module(True)
4545

4646
# TODO: check if only torch needs warm up.
4747
with unset_fake_temporarily():

0 commit comments

Comments
 (0)