Check engine output shapes to decide whether the network has data-dependent shapes (DDS)

zewenli98 · zewenli98 · commit 420af2319b87 · 2025-02-13T13:32:57.000-08:00
diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -33,7 +33,9 @@
 from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
     DYNAMO_CONVERTERS as CONVERTERS,
 )
-from torch_tensorrt.dynamo.conversion._ConverterRegistry import CallingConvention
+from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
+    CallingConvention,
+)
 from torch_tensorrt.dynamo.conversion._TRTBuilderMonitor import TRTBulderMonitor
 from torch_tensorrt.dynamo.conversion.converter_utils import (
     get_node_io,
@@ -62,6 +64,7 @@ class TRTInterpreterResult(NamedTuple):
     input_names: Sequence[str]
     output_names: Sequence[str]
     weight_name_map: Optional[dict[Any, Any]]
+    engine_is_dds: bool
 
 
 class TRTInterpreter(torch.fx.Interpreter):  # type: ignore[misc]
@@ -136,6 +139,9 @@ def __init__(
         # Engine cache for storing and reusing TRT engines
         self.engine_cache = engine_cache
 
+        # Whether the engine is data-dependent shape (dds)
+        self.engine_is_dds: bool = False
+
     def validate_conversion(self) -> Set[str]:
         missing_converters: Set[str] = set()
 
@@ -575,6 +581,7 @@ def _insert_engine_to_cache(self, hash_val: str, serialized_engine: bytes) -> No
                 self.input_specs,
                 self.compilation_settings,
                 self.weight_name_map,
+                self.engine_is_dds,
             ),
         )
 
@@ -589,6 +596,7 @@ def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]:
                 cached_engine_input_specs,
                 engine_compilation_settings,
                 self.weight_name_map,
+                self.engine_is_dds,
             ) = cached_data
 
             setting_compatiblity, incompattible_settings = settings_are_compatible(
@@ -650,9 +658,20 @@ def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]:
                 self._input_names,
                 self._output_names,
                 self.weight_name_map,
+                self.engine_is_dds,
             )
         return None
 
+    def check_dds(self, serialized_engine: bytes, output_names: List[str]) -> bool:
+        """Return True if any of the named engine outputs has a -1 in its shape.
+
+        The serialized engine is deserialized with a new ``trt.Runtime`` so that
+        ``get_tensor_shape`` can be queried per output name; no names yields False.
+        """
+        runtime = trt.Runtime(TRT_LOGGER)
+        engine = runtime.deserialize_cuda_engine(serialized_engine)
+        return any(-1 in engine.get_tensor_shape(name) for name in output_names)
+
     def run(
         self,
         strict_type_constraints: bool = False,
@@ -709,6 +728,8 @@ def run(
         )
         assert serialized_engine
 
+        self.engine_is_dds = self.check_dds(serialized_engine, self._output_names)
+
         _LOGGER.info(
             f"Build TRT engine elapsed time: {datetime.now() - build_engine_start_time}"
         )
@@ -735,6 +756,7 @@ def run(
             self._input_names,
             self._output_names,
             self.weight_name_map,
+            self.engine_is_dds,
         )
 
     def run_node(self, n: torch.fx.Node) -> torch.fx.Node:
diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py
@@ -30,7 +30,7 @@ def infer_module_output_dtypes(
     """
     outputs = [node for node in module.graph.nodes if node.op == "output"]
     outputs = outputs[0].args
-    return get_output_dtypes(outputs, truncate_double)
+    return get_output_dtypes(outputs, truncate_double)  # type: ignore[no-any-return]
 
 
 def interpret_module_to_result(
@@ -112,4 +112,5 @@ def convert_module(
         name=name,
         settings=settings,
         weight_name_map=interpreter_result.weight_name_map,
+        engine_is_dds=interpreter_result.engine_is_dds,
     )
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -127,6 +127,7 @@ def __init__(
         name: str = "",
         settings: CompilationSettings = CompilationSettings(),
         weight_name_map: Optional[dict[Any, Any]] = None,
+        engine_is_dds: bool = False,
     ):
         """Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs
         a PyTorch ``torch.nn.Module`` around it. Uses TensorRT Python APIs to run the engine
@@ -140,6 +141,7 @@ def __init__(
             name (str): Name for module
             settings (torch_tensorrt.dynamo.CompilationSettings): Settings used to compile engine, assumes engine was built with default compilation settings if object not passed
             weight_name_map (dict): Mapping of engine weight name to state_dict weight name
+            engine_is_dds (bool): Whether the engine is Data Dependent Shape
 
         Example:
 
@@ -200,7 +202,7 @@ def __init__(
             torch_tensorrt.runtime.get_cudagraphs_mode()
         )
 
-        self.contains_dds_layer = False
+        self.engine_is_dds = engine_is_dds
         self.pre_allocated_outputs: List[torch.Tensor] = []
         self.use_pre_allocated_outputs = False
         self.output_allocator: Optional[DynamicOutputAllocator] = None
@@ -276,19 +278,12 @@ def setup_engine(self) -> None:
             for output_name in self.output_names
         ]
 
-        self.contains_dds_layer = self._check_dds_layer()
-        if self.contains_dds_layer:
-            self.setup_output_allocator()
+        if self.engine_is_dds:
+            self.create_output_allocator()
 
         if torch_tensorrt.runtime.get_cudagraphs_mode():
             self.cudagraph = torch.cuda.CUDAGraph()
 
-    def _check_dds_layer(self) -> bool:
-        layer_info = self.get_layer_info()
-        if "trainStation" in layer_info:  # contains dds layer
-            return True
-        return False
-
     def _check_initialized(self) -> None:
         if not self.initialized:
             raise RuntimeError("PythonTorchTensorRTModule is not initialized.")
@@ -406,19 +401,13 @@ def create_output_tensors(self) -> List[torch.Tensor]:
     def set_pre_allocated_outputs(self, enable: bool) -> None:
         self.use_pre_allocated_outputs = enable
 
-    def setup_output_allocator(self) -> None:
+    def create_output_allocator(self) -> None:
         if self.output_allocator is None:
             output_dtypes_dict = {}
             for o, output_name in enumerate(self.output_names):
                 output_dtypes_dict[output_name] = self.output_dtypes[o]
             self.output_allocator = DynamicOutputAllocator(output_dtypes_dict)
 
-        for output_name in self.output_names:
-            if not self.context.set_output_allocator(
-                output_name, self.output_allocator
-            ):
-                raise RuntimeError(f"Failed to set output allocator for {output_name}")
-
     def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
 
         def run_cuda_graph() -> torch.Tensor | Tuple[torch.Tensor, ...]:
@@ -569,6 +558,23 @@ def run_output_allocator() -> torch.Tensor | Tuple[torch.Tensor, ...]:
 
                 self.setup_input_tensors(contiguous_inputs, False, False)
 
+            with (
+                torch.autograd.profiler.record_function(
+                    "PythonTorchTensorRTModule:SetupOutputAllocator"
+                )
+                if self.profiling_enabled
+                else nullcontext()
+            ):
+                self.create_output_allocator()
+                # need to set output allocator every run
+                for output_name in self.output_names:
+                    if not self.context.set_output_allocator(
+                        output_name, self.output_allocator
+                    ):
+                        raise RuntimeError(
+                            f"Failed to set output allocator for {output_name}"
+                        )
+
             with (
                 torch.autograd.profiler.record_function(
                     "PythonTorchTensorRTModule:TensorRTRuntime"
@@ -662,7 +668,7 @@ def run_output_allocator() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                     ]
                     logger.warning(f"Moved all input Tensors to cuda:{device_id}")
 
-            if self.contains_dds_layer:
+            if self.engine_is_dds:
                 return run_output_allocator()
             else:
                 return run_cuda_graph()
diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
@@ -79,6 +79,7 @@ def __init__(
         name: str = "",
         settings: CompilationSettings = CompilationSettings(),  # Assumes engine was built with default compilation settings if object not passed
         weight_name_map: Optional[dict[Any, Any]] = None,
+        engine_is_dds: bool = False,
     ):
         """Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs
         a PyTorch ``torch.nn.Module`` around it. Uses the Torch-TensorRT runtime extension to run the engines
@@ -97,6 +98,7 @@ def __init__(
             name (str): Name for module
             settings (torch_tensorrt.dynamo.CompilationSettings): Settings used to compile engine, assumes engine was built with default compilation settings if object not passed
             weight_name_map (dict): Mapping of engine weight name to state_dict weight name
+            engine_is_dds (bool): Whether the engine is Data Dependent Shape
 
         Example:
 
@@ -132,6 +134,7 @@ def __init__(
         self.weight_name_map = weight_name_map
         self.serialized_engine = serialized_engine
         self.engine = None
+        self.engine_is_dds = engine_is_dds
 
         if (
             serialized_engine
@@ -146,7 +149,11 @@ def _pack_engine_info(self) -> List[str | bytes]:
             if self.settings.device is not None
             else Device._current_device()
         )
-        metadata = {"settings": self.settings, "weight_name_map": self.weight_name_map}
+        metadata = {
+            "settings": self.settings,
+            "weight_name_map": self.weight_name_map,
+            "engine_is_dds": self.engine_is_dds,
+        }
         target_platform = (
             Platform.current_platform()
             if not self.settings.enable_cross_compile_for_windows
@@ -263,6 +270,7 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None:
             metadata = TorchTensorRTModule.decode_metadata(serialized_metadata)
             self.settings = metadata["settings"]
             self.weight_name_map = metadata["weight_name_map"]
+            self.engine_is_dds = metadata.get("engine_is_dds", False)  # absent in older metadata
 
         else:
             self.engine = None
diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py
@@ -207,6 +207,7 @@ def run_test(
                 input_binding_names=list(interpreter_result.input_names),
                 output_binding_names=list(interpreter_result.output_names),
                 name="test_engine",
+                engine_is_dds=interpreter_result.engine_is_dds,
             )
             mod = mod.cuda()
             if pyt_inputs is not None:
@@ -289,6 +290,7 @@ def run_test_custom_compare_results(
                 input_binding_names=list(interpreter_result.input_names),
                 output_binding_names=list(interpreter_result.output_names),
                 name="test_engine",
+                engine_is_dds=interpreter_result.engine_is_dds,
             )
             res_trt = trt_mod(*cuda_inputs).cpu()
             res_cpu = mod(*cuda_inputs).cpu()
diff --git a/tests/py/dynamo/conversion/test_nonzero_aten.py b/tests/py/dynamo/conversion/test_nonzero_aten.py
@@ -19,8 +19,33 @@ class TestNonZeroConverter(DispatchTestCase):
     )
     def test_non_zero(self, input_shape, dtype):
         class NonZero(nn.Module):
+            # This is a DDS network
             def forward(self, input):
-                return torch.ops.aten.nonzero.default(input)
+                out = torch.ops.aten.nonzero.default(input)
+                return out
+
+        inputs = [torch.randint(low=0, high=3, size=input_shape, dtype=dtype)]
+        self.run_test(
+            NonZero(),
+            inputs,
+        )
+
+    @parameterized.expand(
+        [
+            ((10,), torch.int),
+            ((1, 20), torch.int32),
+            ((2, 3), torch.int64),
+            ((2, 3, 4), torch.float),
+            ((2, 3, 4, 5), torch.float),
+        ]
+    )
+    def test_non_zero_static(self, input_shape, dtype):
+        class NonZero(nn.Module):
+            # This is a static network
+            def forward(self, input):
+                out = torch.ops.aten.nonzero.default(input)
+                out = torch.ops.aten.sum.dim_IntList(out, 0)
+                return out
 
         inputs = [torch.randint(low=0, high=3, size=input_shape, dtype=dtype)]
         self.run_test(