feat: cherry-pick of Selectively enable different frontends (#2693) (#2761)

peri044 · narendasan · zewenli98 · commit 30f50949fecb · 2024-04-25T18:57:42.000-07:00
Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
Co-authored-by: Naren Dasan <1790613+narendasan@users.noreply.github.com>
diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
@@ -264,7 +264,6 @@ jobs:
       pre-script: ${{ matrix.pre-script }}
       script: |
         export USE_HOST_DEPS=1
-        export LD_LIBRARY_PATH=/opt/torch-tensorrt-builds/TensorRT-10.0.0.6/lib:$LD_LIBRARY_PATH
         pushd .
         cd tests/py/core
         ${CONDA_RUN} python -m pip install --pre pytest-xdist timm transformers parameterized expecttest==0.1.6 --use-deprecated=legacy-resolver
diff --git a/py/torch_tensorrt/_Device.py b/py/torch_tensorrt/_Device.py
@@ -9,12 +9,11 @@
 else:
     from typing_extensions import Self
 
+import tensorrt as trt
 import torch
 from torch_tensorrt._enums import DeviceType
 from torch_tensorrt._features import ENABLED_FEATURES
 
-import tensorrt as trt
-
 
 class Device(object):
     """
diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py
@@ -7,8 +7,6 @@
 
 import torch
 import torch.fx
-import torch_tensorrt.dynamo
-import torch_tensorrt.ts
 from torch_tensorrt._enums import dtype
 from torch_tensorrt._features import ENABLED_FEATURES
 from torch_tensorrt._Input import Input
@@ -343,18 +341,8 @@ def convert_method_to_trt_engine(
             "convert_method_to_trt_engine call is not supported for ir=fx"
         )
     elif target_ir == _IRType.dynamo:
-        # Prepare torch and torchtrt inputs
-        from torch_tensorrt.dynamo.utils import prepare_inputs
-
-        if not isinstance(inputs, collections.abc.Sequence):
-            inputs = [inputs]
-
-        # Export the module
-        torchtrt_inputs = prepare_inputs(inputs)
-        exp_program = torch_tensorrt.dynamo.trace(module, torchtrt_inputs, **kwargs)
-
         return dynamo_convert_module_to_trt_engine(  # type: ignore[no-any-return]
-            exp_program,
+            module,
             inputs=inputs,
             enabled_precisions=enabled_precisions_set,
             **kwargs,
diff --git a/py/torch_tensorrt/_enums.py b/py/torch_tensorrt/_enums.py
@@ -107,7 +107,7 @@ def _from(
                 return dtype.f16
             elif t == trt.float32:
                 return dtype.f32
-            elif t == trt.bool:
+            elif trt.__version__ >= "7.0" and t == trt.bool:
                 return dtype.b
             else:
                 raise TypeError(
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -10,33 +10,7 @@
 from torch_tensorrt._Device import Device
 from torch_tensorrt._enums import EngineCapability, dtype
 from torch_tensorrt._Input import Input
-from torch_tensorrt.dynamo import partitioning
-from torch_tensorrt.dynamo._defaults import (
-    DEBUG,
-    DEVICE,
-    DISABLE_TF32,
-    DLA_GLOBAL_DRAM_SIZE,
-    DLA_LOCAL_DRAM_SIZE,
-    DLA_SRAM_SIZE,
-    DRYRUN,
-    ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
-    ENGINE_CAPABILITY,
-    HARDWARE_COMPATIBLE,
-    MAX_AUX_STREAMS,
-    MIN_BLOCK_SIZE,
-    NUM_AVG_TIMING_ITERS,
-    OPTIMIZATION_LEVEL,
-    PASS_THROUGH_BUILD_FAILURES,
-    PRECISION,
-    REFIT,
-    REQUIRE_FULL_COMPILATION,
-    SPARSE_WEIGHTS,
-    TRUNCATE_LONG_AND_DOUBLE,
-    USE_FAST_PARTITIONER,
-    USE_PYTHON_RUNTIME,
-    VERSION_COMPATIBLE,
-    WORKSPACE_SIZE,
-)
+from torch_tensorrt.dynamo import _defaults, partitioning
 from torch_tensorrt.dynamo._DryRunTracker import (
     DryRunTracker,
     PerSubgraphData,
@@ -89,15 +63,15 @@ def compile(
     min_block_size: int = _defaults.MIN_BLOCK_SIZE,
     torch_executed_ops: Optional[Collection[Target]] = None,
     torch_executed_modules: Optional[List[str]] = None,
-    pass_through_build_failures: bool = PASS_THROUGH_BUILD_FAILURES,
-    max_aux_streams: Optional[int] = MAX_AUX_STREAMS,
-    version_compatible: bool = VERSION_COMPATIBLE,
-    optimization_level: Optional[int] = OPTIMIZATION_LEVEL,
-    use_python_runtime: bool = USE_PYTHON_RUNTIME,
-    use_fast_partitioner: bool = USE_FAST_PARTITIONER,
-    enable_experimental_decompositions: bool = ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
-    dryrun: bool = DRYRUN,
-    hardware_compatible: bool = HARDWARE_COMPATIBLE,
+    pass_through_build_failures: bool = _defaults.PASS_THROUGH_BUILD_FAILURES,
+    max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS,
+    version_compatible: bool = _defaults.VERSION_COMPATIBLE,
+    optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL,
+    use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME,
+    use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER,
+    enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
+    dryrun: bool = _defaults.DRYRUN,
+    hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile a TorchScript module for NVIDIA GPUs using TensorRT
diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py
@@ -26,6 +26,7 @@
 REQUIRE_FULL_COMPILATION = False
 DRYRUN = False
 HARDWARE_COMPATIBLE = False
+SUPPORTED_KERNEL_PRECISIONS = {dtype.f32, dtype.f16, dtype.i8}
 
 
 def default_device() -> Device:
diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -313,7 +313,7 @@ def run(
         )
         timing_cache = self._create_timing_cache(builder_config, existing_cache)
 
-        engine = self.builder.build_serialized_network(self.ctx.net, builder_config)
+        engine = self.builder.build_engine(self.ctx.net, builder_config)
         assert engine
 
         serialized_cache = (
diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py
@@ -38,6 +38,8 @@ def infer_module_output_dtypes(
     # such as aten.sum - such outputs can be truncated
     output_dtypes = []
     for output in module_outputs:
+        if not isinstance(output, torch.Tensor):
+            output = torch.tensor(output)
         if truncate_long_and_double and output.dtype == dtype.float64:
             output_dtypes.append(dtype.float32)
         elif truncate_long_and_double and output.dtype == dtype.int64:
diff --git a/py/torch_tensorrt/dynamo/conversion/impl/cast.py b/py/torch_tensorrt/dynamo/conversion/impl/cast.py
@@ -2,6 +2,7 @@
 from typing import Optional, Union
 
 import numpy as np
+import tensorrt as trt
 import torch
 from torch.fx.node import Target
 from torch_tensorrt import _enums
diff --git a/py/torch_tensorrt/dynamo/conversion/impl/matmul.py b/py/torch_tensorrt/dynamo/conversion/impl/matmul.py
@@ -1,5 +1,6 @@
 from typing import Optional
 
+import tensorrt as trt
 import torch
 from torch.fx.node import Target
 from torch_tensorrt import _enums
@@ -9,8 +10,6 @@
 from torch_tensorrt.fx.converters.converter_utils import broadcast, set_layer_name
 from torch_tensorrt.fx.types import TRTTensor
 
-import tensorrt as trt
-
 
 def matrix_multiply(
     ctx: ConversionContext,
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -15,7 +15,6 @@
     _select_rt_device,
     multi_gpu_device_check,
 )
-from torch_tensorrt.logging import TRT_LOGGER
 
 logger = logging.getLogger(__name__)
 
@@ -65,19 +64,35 @@ def _initialize(self) -> None:
         ) == (len(self.input_names) + len(self.output_names))
 
         self.input_dtypes = [
-            dtype._from(self.engine.get_tensor_dtype(input_name))
-            for input_name in self.input_names
+            dtype._from(self.engine.get_binding_dtype(idx))
+            for idx in self.input_binding_indices_in_order
         ]
         self.input_shapes = [
             self.engine.get_tensor_shape(input_name) for input_name in self.input_names
         ]
         self.output_dtypes = [
-            dtype._from(self.engine.get_tensor_dtype(output_name))
-            for output_name in self.output_names
+            dtype._from(self.engine.get_binding_dtype(idx))
+            for idx in self.output_binding_indices_in_order
         ]
         self.output_shapes = [
-            self.engine.get_tensor_shape(output_name)
-            for output_name in self.output_names
+            (
+                tuple(self.engine.get_binding_shape(idx))
+                if self.engine.has_implicit_batch_dimension
+                else tuple()
+            )
+            for idx in self.output_binding_indices_in_order
+        ]
+        self.hidden_output_dtypes = [
+            dtype._from(self.engine.get_binding_dtype(idx))
+            for idx in self.hidden_output_binding_indices_in_order
+        ]
+        self.hidden_output_shapes = [
+            (
+                tuple(self.engine.get_binding_shape(idx))
+                if self.engine.has_implicit_batch_dimension
+                else tuple()
+            )
+            for idx in self.hidden_output_binding_indices_in_order
         ]
 
     def _check_initialized(self) -> None:
@@ -219,11 +234,15 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
                     bindings.append(output.data_ptr())
                     outputs.append(output)
 
-            # Assign tensor address appropriately
-            for idx in range(self.engine.num_io_tensors):
-                self.context.set_tensor_address(
-                    self.engine.get_tensor_name(idx), bindings[idx]
-                )
+                for i, idx in enumerate(self.hidden_output_binding_indices_in_order):
+                    shape = tuple(self.context.get_binding_shape(idx))
+
+                    output = torch.empty(
+                        size=shape,
+                        dtype=self.hidden_output_dtypes[i].to(torch.dtype),
+                        device=torch.cuda.current_device(),
+                    )
+                    bindings[idx] = output.data_ptr()
 
             with (
                 torch.autograd.profiler.record_function(
diff --git a/py/torch_tensorrt/logging.py b/py/torch_tensorrt/logging.py
@@ -1,9 +1,8 @@
 import logging
 from typing import Any
 
-from torch_tensorrt._features import ENABLED_FEATURES
-
 import tensorrt as trt
+from torch_tensorrt._features import ENABLED_FEATURES
 
 logging.captureWarnings(True)
 _LOGGER = logging.getLogger("torch_tensorrt [TensorRT Conversion Context]")
diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py
@@ -13,8 +13,6 @@
 from torch_tensorrt.ts._Input import TorchScriptInput
 from torch_tensorrt.ts.logging import Level, log
 
-import tensorrt as trt
-
 
 def _internal_input_to_torch_class_input(i: _C.Input) -> torch.classes.tensorrt._Input:
     clone = torch.classes.tensorrt._Input()
diff --git a/py/torch_tensorrt/ts/_enums.py b/py/torch_tensorrt/ts/_enums.py
@@ -1,3 +1,2 @@
-from torch_tensorrt._C import EngineCapability, TensorFormat, dtype  # noqa: F401
-
 from tensorrt import DeviceType  # noqa: F401
+from torch_tensorrt._C import EngineCapability, TensorFormat, dtype  # noqa: F401
diff --git a/tests/py/core/test_classes.py b/tests/py/core/test_classes.py
@@ -2,13 +2,12 @@
 import unittest
 from typing import Dict
 
+import tensorrt as trt
 import torch
 import torch_tensorrt as torchtrt
 import torchvision.models as models
 from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import TorchTensorRTModule
 
-import tensorrt as trt
-
 
 class TestDevice(unittest.TestCase):
     def test_from_string_constructor(self):
diff --git a/tests/py/dynamo/lowering/test_aten_lowering_passes.py b/tests/py/dynamo/lowering/test_aten_lowering_passes.py
@@ -1,3 +1,5 @@
+import unittest
+
 import torch
 import torch_tensorrt
 from torch.testing._internal.common_utils import TestCase, run_tests
diff --git a/tests/py/dynamo/runtime/test_hw_compat.py b/tests/py/dynamo/runtime/test_hw_compat.py
@@ -74,9 +74,6 @@ def forward(self, x):
         not torch.cuda.get_device_properties(torch.cuda.current_device()).major >= 8,
         "HW Compatibility is not supported on cards older than Ampere",
     )
-    @unittest.skip(
-        "Skipping this test because the hw_compat.ts can't be generated using torch nightly"
-    )
     def test_hw_compat_3080_build(self):
         inputs = [torch.randn(1, 3, 224, 224).cuda()]
 
diff --git a/tests/py/ts/ptq/test_ptq_trt_calibrator.py b/tests/py/ts/ptq/test_ptq_trt_calibrator.py
@@ -1,6 +1,7 @@
 import os
 import unittest
 
+import tensorrt as trt
 import torch
 import torch.nn as nn
 import torch_tensorrt as torchtrt
@@ -9,8 +10,6 @@
 from torch.nn import functional as F
 from torch_tensorrt.ts.logging import *
 
-import tensorrt as trt
-
 
 def find_repo_root(max_depth=10):
     dir_path = os.path.dirname(os.path.realpath(__file__))

Original file line number	Diff line number	Diff line change
`@@ -313,7 +313,7 @@ def run(`
`313`	`313`	`)`
`314`	`314`	`timing_cache = self._create_timing_cache(builder_config, existing_cache)`
`315`	`315`
`316`		`- engine = self.builder.build_serialized_network(self.ctx.net, builder_config)`
	`316`	`+ engine = self.builder.build_engine(self.ctx.net, builder_config)`
`317`	`317`	`assert engine`
`318`	`318`
`319`	`319`	`serialized_cache = (`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+import unittest`
	`2`	`+`
`1`	`3`	`import torch`
`2`	`4`	`import torch_tensorrt`
`3`	`5`	`from torch.testing._internal.common_utils import TestCase, run_tests`
Original file line number	Diff line number	Diff line change
`@@ -74,9 +74,6 @@ def forward(self, x):`
`74`	`74`	`not torch.cuda.get_device_properties(torch.cuda.current_device()).major >= 8,`
`75`	`75`	`"HW Compatibility is not supported on cards older than Ampere",`
`76`	`76`	`)`
`77`		`- @unittest.skip(`
`78`		`- "Skipping this test because the hw_compat.ts can't be generated using torch nightly"`
`79`		`- )`
`80`	`77`	`def test_hw_compat_3080_build(self):`
`81`	`78`	`inputs = [torch.randn(1, 3, 224, 224).cuda()]`
`82`	`79`