3
3
import traceback
4
4
from functools import partial
5
5
import torch ._dynamo as td
6
- from torch_tensorrt import EngineCapability , Device
7
- from torch_tensorrt .dynamo import compile
8
6
7
+ from torch_tensorrt .dynamo ._defaults import (
8
+ PRECISION ,
9
+ DEBUG ,
10
+ MAX_WORKSPACE_SIZE ,
11
+ MAX_NUM_TRT_ENGINES ,
12
+ )
13
+ from torch_tensorrt .dynamo ._settings import CompilationSettings
9
14
from torch_tensorrt .dynamo .lowering ._decompositions import get_decompositions
10
15
from torch_tensorrt .dynamo .lowering ._partition import partition , get_submod_inputs
11
16
from torch_tensorrt .dynamo .conversion import convert_module
14
19
15
20
from torch ._functorch .aot_autograd import aot_module_simplified , make_boxed_compiler
16
21
17
- from torch_tensorrt .fx .fx2trt import (
18
- InputTensorSpec ,
19
- TRTInterpreter ,
20
- )
21
- import tensorrt as trt
22
-
23
- from torch_tensorrt .fx .trt_module import TRTModule
24
22
from torch_tensorrt .fx .utils import LowerPrecision
25
23
26
24
# Module-level logger, namespaced to this module (standard library logging convention).
logger = logging .getLogger (__name__ )
27
25
28
26
29
27
def create_backend(
    precision: LowerPrecision = PRECISION,
    debug: bool = DEBUG,
    workspace_size: int = MAX_WORKSPACE_SIZE,
    max_num_trt_engines: int = MAX_NUM_TRT_ENGINES,
    **kwargs,
):
    """Create a torch.compile backend configured with the given arguments.

    Args:
        precision: Model layer precision (a ``LowerPrecision`` value)
        debug: Whether to print out verbose debugging information
        workspace_size: Maximum workspace TRT is allowed to use for the module
        max_num_trt_engines: Cap on the number of TRT engines produced during
            partitioning (forwarded to the partitioner via CompilationSettings)
        **kwargs: Additional arguments are accepted but currently ignored
            # NOTE(review): silently dropping kwargs may hide caller typos — confirm intended
    Returns:
        Backend callable suitable for torch.compile
    """
    # Bundle the user-facing options into a single CompilationSettings object so
    # the backend chain threads one argument instead of a growing parameter list.
    settings = CompilationSettings(
        debug=debug,
        precision=precision,
        workspace_size=workspace_size,
        max_num_trt_engines=max_num_trt_engines,
    )

    # Bind the settings onto the real backend entry point; torch.compile will
    # invoke the returned partial with (gm, sample_inputs).
    return partial(
        tensorrt_backend,
        settings=settings,
    )
67
55
68
56
@@ -71,19 +59,12 @@ def create_backend(
71
59
def tensorrt_backend (
72
60
gm : torch .Module ,
73
61
sample_inputs ,
74
- * ,
75
- debug = False ,
76
- enabled_precisions = set (),
77
- device = Device ._current_device (),
78
- workspace_size = 20 << 30 ,
62
+ settings : CompilationSettings = CompilationSettings (),
79
63
):
80
64
81
65
custom_backend = partial (
82
66
fx_dynamo_backend ,
83
- debug = debug ,
84
- enabled_precisions = enabled_precisions ,
85
- device = device ,
86
- workspace_size = workspace_size ,
67
+ settings = settings ,
87
68
)
88
69
89
70
# Invoke AOTAutograd to translate operators to aten
@@ -100,15 +81,15 @@ def tensorrt_backend(
100
81
def fx_dynamo_backend (
101
82
gm : torch .fx .GraphModule ,
102
83
example_inputs ,
103
- * ,
104
- debug = False ,
105
- enabled_precisions = set (),
106
- device = Device ._current_device (),
107
- workspace_size = 20 << 30 ,
84
+ settings : CompilationSettings = CompilationSettings (),
108
85
):
109
86
"""Helper function to manage translation of FX module to TRT engines"""
110
87
try :
111
- trt_compiled = compile_module (gm , example_inputs )
88
+ trt_compiled = compile_module (
89
+ gm ,
90
+ example_inputs ,
91
+ settings = settings ,
92
+ )
112
93
return trt_compiled
113
94
except :
114
95
traceback .print_exc ()
@@ -122,22 +103,23 @@ def fx_dynamo_backend(
122
103
def compile_module (
123
104
gm : torch .fx .GraphModule ,
124
105
example_inputs ,
125
- debug : bool = False ,
126
- workspace_size : int = 20 << 30 ,
127
- precision : LowerPrecision = LowerPrecision .FP32 ,
106
+ settings : CompilationSettings = CompilationSettings (),
128
107
) -> torch .fx .GraphModule :
129
- """Convert an FX module to a TRT module
108
+ """Compile an FX module
109
+
110
+ Includes: Partitioning + Conversion Phases
111
+
130
112
Args:
131
113
module: FX GraphModule to convert
132
114
inputs: Inputs to the module
133
- debug: Whether to print out verbose debugging information
134
- workspace_size: Maximum workspace TRT is allowed to use for the module
135
- precision: Model Layer precision
115
+ settings: Compilation settings
136
116
Returns:
137
- TRTModule or TRTModuleNext
117
+ Compiled FX GraphModule
138
118
"""
139
119
# Partition module into components that can be TRT-accelerated
140
- partitioned_module = partition (gm )
120
+ partitioned_module = partition (
121
+ gm , verbose = settings .debug , max_num_trt_engines = settings .max_num_trt_engines
122
+ )
141
123
142
124
# Iterate over all components that can be accelerated
143
125
# Generate the corresponding TRT Module for those
@@ -153,9 +135,9 @@ def compile_module(
153
135
trt_mod = convert_module (
154
136
submodule ,
155
137
submodule_inputs ,
156
- debug = debug ,
157
- workspace_size = workspace_size ,
158
- precision = precision ,
138
+ debug = settings . debug ,
139
+ workspace_size = settings . workspace_size ,
140
+ precision = settings . precision ,
159
141
)
160
142
161
143
# Replace FX Module with TRT Module
0 commit comments