Commit 12e12cc

Mengchi Zhang authored and Wei Wei committed
[WIP][fx2trt] Replacing fp16 and int8 mode with enum type (#74338)
Summary:
X-link: pytorch/pytorch#74338

Pull Request resolved: https://github.com/pytorch/fx2trt/pull/24

X-link: pytorch/benchmark#805

Reviewed By: jasonjk-park

Differential Revision: D34929680

fbshipit-source-id: 8d693ffdfc28d12f5b88aba170da87b192c5e5a2
1 parent 4addf3d commit 12e12cc
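
The change in a nutshell: the mutually exclusive fp16_mode/int8_mode booleans are replaced by a single LowerPrecision enum. Below is a minimal sketch (not part of the diff) of how a call site changes; the module and input are toy placeholders made up for illustration:

import torch
from fx2trt_oss.fx import lower_to_trt
from fx2trt_oss.fx.utils import LowerPrecision

# Toy module and input, assumed for illustration only.
module = torch.nn.Linear(8, 8).cuda().eval()
inputs = [torch.randn(4, 8).cuda()]

# Before this commit (two boolean flags, at most one allowed to be True):
#   lowered = lower_to_trt(module, inputs, fp16_mode=True, int8_mode=False)

# After this commit (one enum value):
lowered = lower_to_trt(module, inputs, lower_precision=LowerPrecision.FP16)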

File tree

6 files changed: +31 −30 lines changed

fx/example/lower_example.py
fx/example/quantized_resnet_test.py
fx/fx2trt.py
fx/lower.py
fx/utils.py
test/quant/test_quant_trt.py

fx/example/lower_example.py

Lines changed: 2 additions & 1 deletion

@@ -5,6 +5,7 @@
 import torch
 import torchvision
 from fx2trt_oss.fx import lower_to_trt
+from fx2trt_oss.fx.utils import LowerPrecision


 """
@@ -167,7 +168,7 @@ def run_configuration_benchmark(
         time = benchmark_torch_function(conf.batch_iter, lambda: module(*input))
     elif not conf.jit:
         # Run lowering eager mode benchmark
-        lowered_module = lower_to_trt(module, input, max_batch_size=conf.batch_size, fp16_mode=conf.fp16)
+        lowered_module = lower_to_trt(module, input, max_batch_size=conf.batch_size, lower_precision=LowerPrecision.FP16 if conf.fp16 else LowerPrecision.FP32)
         time = benchmark_torch_function(conf.batch_iter, lambda: lowered_module(*input))
     else:
         print("Lowering with JIT is not available!", "red")

fx/example/quantized_resnet_test.py

Lines changed: 4 additions & 3 deletions

@@ -1,6 +1,7 @@
 import torch.fx
 import torchvision.models as models
 from fx2trt_oss.fx import TRTInterpreter, InputTensorSpec, TRTModule
+from fx2trt_oss.fx.utils import LowerPrecision
 from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
 import fx2trt_oss.tracer.acc_tracer.acc_tracer as acc_tracer
 import copy
@@ -16,7 +17,7 @@ def build_fp16_trt(rn18):
     rn18 = acc_tracer.trace(rn18, [torch.randn(1, 3, 224, 224)])
     interp = TRTInterpreter(
         rn18, [InputTensorSpec(torch.Size([3, 224, 224]), torch.float, has_batch_dim=False)])
-    interpreter_result = interp.run(fp16_mode=True)
+    interpreter_result = interp.run(lower_precision=LowerPrecision.FP16)
     return TRTModule(interpreter_result.engine, interpreter_result.input_names, interpreter_result.output_names)

 @torch.no_grad()
@@ -47,7 +48,7 @@ def build_int8_trt(rn18):
         [InputTensorSpec(torch.Size([-1, *data.shape[1:]]), torch.float,
             shape_ranges=[((1, 3, 224, 224), (5, 3, 224, 224), (10, 3, 224, 224))], has_batch_dim=True)],
         explicit_batch_dimension=True, explicit_precision=True, logger_level=trt.Logger.VERBOSE)
-    interpreter_result = interp.run(fp16_mode=False, int8_mode=True)
+    interpreter_result = interp.run(lower_precision=LowerPrecision.INT8)
     trt_mod = TRTModule(interpreter_result.engine, interpreter_result.input_names, interpreter_result.output_names)
     trt_res = trt_mod(data.cuda())
     print("explicit quant result diff max", torch.max(ref_res - trt_res.cpu()))
@@ -75,7 +76,7 @@ def build_int8_trt_implicit_quant(rn18):
     shape_prop.ShapeProp(traced_rn18).propagate(data)
     traced_rn18 = NormalizeArgs(traced_rn18).transform()
     interp = TRTInterpreter(traced_rn18, InputTensorSpec.from_tensors([data]), logger_level=trt.Logger.VERBOSE)
-    interpreter_result = interp.run(fp16_mode=False, int8_mode=True, strict_type_constraints=True)
+    interpreter_result = interp.run(lower_precision=LowerPrecision.INT8, strict_type_constraints=True)
     trt_mod = TRTModule(interpreter_result.engine, interpreter_result.input_names, interpreter_result.output_names)
     trt_res = trt_mod(data.cuda())
     print("implicit quant result diff max", torch.max(ref_res - trt_res.cpu()))

fx/fx2trt.py

Lines changed: 8 additions & 11 deletions

@@ -13,7 +13,7 @@

 from .converter_registry import CONVERTERS
 from .input_tensor_spec import InputTensorSpec
-from .utils import torch_dtype_to_trt, get_dynamic_dims
+from .utils import torch_dtype_to_trt, get_dynamic_dims, LowerPrecision

 TRT_INTERPRETER_CALL_PRE_OBSERVER: Observer[Callable[[torch.fx.GraphModule], None]] = Observer("TRT_INTERPRETER_CALL_PRE_OBSERVER")

@@ -146,27 +146,24 @@ def run(
         self,
         max_batch_size=64,
         max_workspace_size=1 << 25,
-        fp16_mode=True,
-        int8_mode=False,
+        lower_precision=LowerPrecision.FP16,
         sparse_weights=False,
         force_fp32_output=False,
         strict_type_constraints=False,
         algorithm_selector=None,
         timing_cache=None,
         profiling_verbosity=None,
     ) -> TRTInterpreterResult:
-        assert not (fp16_mode and int8_mode), "We cannot enable both fp16 and int8 mode."
-
         TRT_INTERPRETER_CALL_PRE_OBSERVER.observe(self.module)

-        # For float outputs, we set their dtype to fp16 only if fp16_mode=True and
+        # For float outputs, we set their dtype to fp16 only if LowerPrecision.FP16 and
         # force_fp32_output=False.
-        self.output_fp16 = not force_fp32_output and fp16_mode
+        self.output_fp16 = not force_fp32_output and lower_precision == LowerPrecision.FP16

-        if int8_mode and not self.builder.platform_has_fast_int8:
+        if lower_precision == LowerPrecision.INT8 and not self.builder.platform_has_fast_int8:
             raise RuntimeError("Current platform doesn't support fast native int8!")

-        if fp16_mode and not self.builder.platform_has_fast_fp16:
+        if lower_precision == LowerPrecision.FP16 and not self.builder.platform_has_fast_fp16:
             warnings.warn("Current platform doesn't support fast native fp16!")

         self.input_specs_iter = 0
@@ -188,10 +185,10 @@ def run(
         builder_config.profiling_verbosity = profiling_verbosity \
             if profiling_verbosity else \
             trt.ProfilingVerbosity.LAYER_NAMES_ONLY
-        if fp16_mode:
+        if lower_precision == LowerPrecision.FP16:
             builder_config.set_flag(trt.BuilderFlag.FP16)

-        if int8_mode:
+        if lower_precision == LowerPrecision.INT8:
             builder_config.set_flag(trt.BuilderFlag.INT8)

         if sparse_weights:
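
For context, a self-contained sketch (not part of the diff) of the reasoning behind this signature change: with two booleans the invalid fp16+int8 combination was representable and had to be rejected at runtime, whereas an enum value can only name one precision, so run() drops its assert and tests equality instead:

from enum import Enum

class LowerPrecision(Enum):  # mirrors the definition added in fx/utils.py
    FP32 = "fp32"
    FP16 = "fp16"
    INT8 = "int8"

# Old API: (fp16_mode=True, int8_mode=True) was representable and had to be
# caught by an assert at the top of run().
# New API: exactly one precision is representable, so the checks reduce to
# simple equality tests, as in the hunks above.
lower_precision = LowerPrecision.INT8
if lower_precision == LowerPrecision.INT8:
    print("would set trt.BuilderFlag.INT8")
elif lower_precision == LowerPrecision.FP16:
    print("would set trt.BuilderFlag.FP16")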

fx/lower.py

Lines changed: 8 additions & 14 deletions

@@ -33,6 +33,7 @@
 from .trt_module import (
     TRTModule,
 )
+from .utils import LowerPrecision


 logger = logging.getLogger(__name__)
@@ -79,8 +80,7 @@ def lower_to_trt(
     max_batch_size: int = 2048,
     max_workspace_size=1 << 25,
     explicit_batch_dimension=False,
-    fp16_mode=True,
-    int8_mode=False,
+    lower_precision=LowerPrecision.FP16,
     verbose_log=False,
     timing_cache_prefix="",
     save_timing_cache=False,
@@ -96,8 +96,7 @@ def lower_to_trt(
         max_batch_size: Maximum batch size (must be >= 1 to be set, 0 means not set)
         max_workspace_size: Maximum size of workspace given to TensorRT.
         explicit_batch_dimension: Use explicit batch dimension in TensorRT if set True, otherwise use implicit batch dimension.
-        fp16_mode: fp16 config given to TRTModule.
-        int8_mode: int8 config given to TRTModule.
+        lower_precision: Lower precision config given to TRTModule. Can select between fp32, fp16 and int8.
         verbose_log: Enable verbose log for TensorRT if set True.
         timing_cache_prefix: Timing cache file name for timing cache used by fx2trt.
         save_timing_cache: Update timing cache with current timing cache data if set to True.
@@ -110,8 +109,7 @@ def lower_to_trt(
        max_batch_size=max_batch_size,
        max_workspace_size=max_workspace_size,
        explicit_batch_dimension=explicit_batch_dimension,
-       fp16_mode=fp16_mode,
-       int8_mode=int8_mode,
+       lower_precision=lower_precision,
        verbose_log=verbose_log,
        timing_cache_prefix=timing_cache_prefix,
        save_timing_cache=save_timing_cache,
@@ -137,9 +135,7 @@ class LowerSetting:

         explicit_precision: Use explicit precision during lowering.

-        fp16_mode: Enable FP16 dtype during lowering.
-
-        int8_mode: Enable Int8 dtype during lowering.
+        lower_precision: Precision to use during lowering. Can select between fp32, fp16 and int8.

         max_workspace_size: The maximum workspace size. The maximum GPU temporary
         memory which the TensorRT engine can use at execution time.
@@ -179,8 +175,7 @@ class LowerSetting:
     input_specs: List[InputTensorSpec] = dc.field(default_factory=list)
     explicit_batch_dimension: bool = True
     explicit_precision: bool = False
-    fp16_mode: bool = False
-    int8_mode: bool = False
+    lower_precision: LowerPrecision = LowerPrecision.FP32
     max_workspace_size: int = 1 << 30
     strict_type_constraints: bool = False
     customized_fuse_pass: Sequence = ()
@@ -271,8 +266,7 @@ def __call__(self, mod, input, split_name) -> TRTInterpreterResult:
         interp_result: TRTInterpreterResult = interpreter.run(
             max_batch_size=self.lower_setting.max_batch_size,
             max_workspace_size=self.lower_setting.max_workspace_size,
-            fp16_mode=self.lower_setting.fp16_mode,
-            int8_mode=self.lower_setting.int8_mode,
+            lower_precision=self.lower_setting.lower_precision,
             strict_type_constraints=self.lower_setting.strict_type_constraints,
             algorithm_selector=algo_selector,
             timing_cache=cache_data,
@@ -350,7 +344,7 @@ def __call__(
     ) -> nn.Module:
         module.eval()

-        if self.lower_setting.fp16_mode:
+        if self.lower_setting.lower_precision == LowerPrecision.FP16:
             module.half()
             inputs = tuple(x.half() if x.dtype == torch.float32 else x for x in inputs)
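
A hedged usage sketch for the new LowerSetting field (not part of the diff); this assumes the remaining dataclass fields keep the defaults shown above:

from fx2trt_oss.fx.lower import LowerSetting
from fx2trt_oss.fx.utils import LowerPrecision

# The default moves from fp16_mode=False/int8_mode=False to
# LowerPrecision.FP32, so fp32 remains the default lowering behavior.
setting = LowerSetting(lower_precision=LowerPrecision.FP16)
assert setting.lower_precision == LowerPrecision.FP16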

fx/utils.py

Lines changed: 7 additions & 0 deletions

@@ -5,6 +5,13 @@
 import torch

 from .types import Shape, TRTDataType
+from enum import Enum
+
+
+class LowerPrecision(Enum):
+    FP32 = "fp32"
+    FP16 = "fp16"
+    INT8 = "int8"


 def torch_dtype_to_trt(dtype: torch.dtype) -> TRTDataType:
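
Because the members carry string values, standard Enum lookup by value works; a small self-contained sketch mirroring the definition added above:

from enum import Enum

class LowerPrecision(Enum):  # mirrors fx/utils.py
    FP32 = "fp32"
    FP16 = "fp16"
    INT8 = "int8"

# Members round-trip through their string values, which is convenient when
# the precision comes from a config file or a CLI flag.
assert LowerPrecision("fp16") is LowerPrecision.FP16
assert LowerPrecision.INT8.value == "int8"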

test/quant/test_quant_trt.py

Lines changed: 2 additions & 1 deletion

@@ -16,6 +16,7 @@
     TRTModule,
 )
 from fx2trt_oss.fx.lower import run_const_fold
+from fx2trt_oss.fx.utils import LowerPrecision
 from fx2trt_oss.tracer.acc_tracer import acc_ops
 from torch.ao.quantization import default_qconfig
 from torch.ao.quantization._quantize_fx_do_not_use import (
@@ -53,7 +54,7 @@ def lower_to_trt(model, inputs, shape_ranges):
         model,
         input_specs,
         explicit_batch_dimension=True, explicit_precision=True)
-    result = interp.run(fp16_mode=False, int8_mode=True)
+    result = interp.run(lower_precision=LowerPrecision.INT8)
     trt_mod = TRTModule(result.engine, result.input_names, result.output_names)
     return trt_mod
