
Commit 18f74b5

Dark Knight authored and Wei Wei committed
Revert D34929680: Multisect successfully blamed D34929680 for test failures (#74381)
Summary:
X-link: pytorch/pytorch#74381
X-link: pytorch/benchmark#808
Pull Request resolved: https://github.com/pytorch/fx2trt/pull/25

Reviewed By: brad-mengchi

Differential Revision: D34966585

fbshipit-source-id: a1eea214ba6c9a7c04dd9d327f339bc1c739b0ae
1 parent 9507c6d commit 18f74b5

6 files changed: +30 additions, -31 deletions


fx/example/lower_example.py

Lines changed: 1 addition & 2 deletions
@@ -5,7 +5,6 @@
 import torch
 import torchvision
 from fx2trt_oss.fx import lower_to_trt
-from fx2trt_oss.fx.utils import LowerPrecision


 """
@@ -168,7 +167,7 @@ def run_configuration_benchmark(
         time = benchmark_torch_function(conf.batch_iter, lambda: module(*input))
     elif not conf.jit:
         # Run lowering eager mode benchmark
-        lowered_module = lower_to_trt(module, input, max_batch_size=conf.batch_size, lower_precision=LowerPrecision.FP16 if conf.fp16 else LowerPrecision.FP32)
+        lowered_module = lower_to_trt(module, input, max_batch_size=conf.batch_size, fp16_mode=conf.fp16)
         time = benchmark_torch_function(conf.batch_iter, lambda: lowered_module(*input))
     else:
         print("Lowering with JIT is not available!", "red")

fx/example/quantized_resnet_test.py

Lines changed: 3 additions & 4 deletions
@@ -1,7 +1,6 @@
 import torch.fx
 import torchvision.models as models
 from fx2trt_oss.fx import TRTInterpreter, InputTensorSpec, TRTModule
-from fx2trt_oss.fx.utils import LowerPrecision
 from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
 import fx2trt_oss.tracer.acc_tracer.acc_tracer as acc_tracer
 import copy
@@ -17,7 +16,7 @@ def build_fp16_trt(rn18):
     rn18 = acc_tracer.trace(rn18, [torch.randn(1, 3, 224, 224)])
     interp = TRTInterpreter(
         rn18, [InputTensorSpec(torch.Size([3, 224, 224]), torch.float, has_batch_dim=False)])
-    interpreter_result = interp.run(lower_precision=LowerPrecision.FP16)
+    interpreter_result = interp.run(fp16_mode=True)
     return TRTModule(interpreter_result.engine, interpreter_result.input_names, interpreter_result.output_names)

 @torch.no_grad()
@@ -48,7 +47,7 @@ def build_int8_trt(rn18):
         [InputTensorSpec(torch.Size([-1, *data.shape[1:]]), torch.float,
             shape_ranges=[((1, 3, 224, 224), (5, 3, 224, 224), (10, 3, 224, 224))], has_batch_dim=True)],
         explicit_batch_dimension=True, explicit_precision=True, logger_level=trt.Logger.VERBOSE)
-    interpreter_result = interp.run(lower_precision=LowerPrecision.INT8)
+    interpreter_result = interp.run(fp16_mode=False, int8_mode=True)
     trt_mod = TRTModule(interpreter_result.engine, interpreter_result.input_names, interpreter_result.output_names)
     trt_res = trt_mod(data.cuda())
     print("explicit quant result diff max", torch.max(ref_res - trt_res.cpu()))
@@ -76,7 +75,7 @@ def build_int8_trt_implicit_quant(rn18):
     shape_prop.ShapeProp(traced_rn18).propagate(data)
     traced_rn18 = NormalizeArgs(traced_rn18).transform()
     interp = TRTInterpreter(traced_rn18, InputTensorSpec.from_tensors([data]), logger_level=trt.Logger.VERBOSE)
-    interpreter_result = interp.run(lower_precision=LowerPrecision.INT8, strict_type_constraints=True)
+    interpreter_result = interp.run(fp16_mode=False, int8_mode=True, strict_type_constraints=True)
     trt_mod = TRTModule(interpreter_result.engine, interpreter_result.input_names, interpreter_result.output_names)
     trt_res = trt_mod(data.cuda())
     print("implicit quant result diff max", torch.max(ref_res - trt_res.cpu()))

fx/fx2trt.py

Lines changed: 11 additions & 8 deletions
@@ -13,7 +13,7 @@

 from .converter_registry import CONVERTERS
 from .input_tensor_spec import InputTensorSpec
-from .utils import torch_dtype_to_trt, get_dynamic_dims, LowerPrecision
+from .utils import torch_dtype_to_trt, get_dynamic_dims

 TRT_INTERPRETER_CALL_PRE_OBSERVER: Observer[Callable[[torch.fx.GraphModule], None]] = Observer("TRT_INTERPRETER_CALL_PRE_OBSERVER")

@@ -146,24 +146,27 @@ def run(
         self,
         max_batch_size=64,
         max_workspace_size=1 << 25,
-        lower_precision=LowerPrecision.FP16,
+        fp16_mode=True,
+        int8_mode=False,
         sparse_weights=False,
         force_fp32_output=False,
         strict_type_constraints=False,
         algorithm_selector=None,
         timing_cache=None,
         profiling_verbosity=None,
     ) -> TRTInterpreterResult:
+        assert not (fp16_mode and int8_mode), "We cannot enable both fp16 and int8 mode."
+
         TRT_INTERPRETER_CALL_PRE_OBSERVER.observe(self.module)

-        # For float outputs, we set their dtype to fp16 only if LowerPrecision.FP16 and
+        # For float outputs, we set their dtype to fp16 only if fp16_mode=True and
         # force_fp32_output=False.
-        self.output_fp16 = not force_fp32_output and lower_precision == LowerPrecision.FP16
+        self.output_fp16 = not force_fp32_output and fp16_mode

-        if lower_precision == LowerPrecision.INT8 and not self.builder.platform_has_fast_int8:
+        if int8_mode and not self.builder.platform_has_fast_int8:
             raise RuntimeError("Current platform doesn't support fast native int8!")

-        if lower_precision == LowerPrecision.FP16 and not self.builder.platform_has_fast_fp16:
+        if fp16_mode and not self.builder.platform_has_fast_fp16:
             warnings.warn("Current platform doesn't support fast native fp16!")

         self.input_specs_iter = 0
@@ -185,10 +188,10 @@ def run(
         builder_config.profiling_verbosity = profiling_verbosity \
             if profiling_verbosity else \
             trt.ProfilingVerbosity.LAYER_NAMES_ONLY
-        if lower_precision == LowerPrecision.FP16:
+        if fp16_mode:
             builder_config.set_flag(trt.BuilderFlag.FP16)

-        if lower_precision == LowerPrecision.INT8:
+        if int8_mode:
             builder_config.set_flag(trt.BuilderFlag.INT8)

         if sparse_weights:
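In caller terms, the reverted run() contract is: the two flags are mutually exclusive, and each simply toggles the corresponding TensorRT builder flag. A quick illustration (interp is a TRTInterpreter; this is an editorial sketch, not repository code):

interp.run()                                 # default fp16_mode=True: FP16 engine
interp.run(fp16_mode=False)                  # pure FP32 build
interp.run(fp16_mode=False, int8_mode=True)  # INT8 build; platform must support fast int8
interp.run(fp16_mode=True, int8_mode=True)   # AssertionError: cannot enable both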

fx/lower.py

Lines changed: 14 additions & 8 deletions
@@ -33,7 +33,6 @@
 from .trt_module import (
     TRTModule,
 )
-from .utils import LowerPrecision


 logger = logging.getLogger(__name__)
@@ -80,7 +79,8 @@ def lower_to_trt(
     max_batch_size: int = 2048,
     max_workspace_size=1 << 25,
     explicit_batch_dimension=False,
-    lower_precision=LowerPrecision.FP16,
+    fp16_mode=True,
+    int8_mode=False,
     verbose_log=False,
     timing_cache_prefix="",
     save_timing_cache=False,
@@ -96,7 +96,8 @@ def lower_to_trt(
         max_batch_size: Maximum batch size (must be >= 1 to be set, 0 means not set)
         max_workspace_size: Maximum size of workspace given to TensorRT.
         explicit_batch_dimension: Use explicit batch dimension in TensorRT if set True, otherwise use implicit batch dimension.
-        lower_precision: lower precision config given to TRTModule. Can select between fp32, fp16 and int8.
+        fp16_mode: fp16 config given to TRTModule.
+        int8_mode: int8 config given to TRTModule.
         verbose_log: Enable verbose log for TensorRT if set True.
         timing_cache_prefix: Timing cache file name for timing cache used by fx2trt.
         save_timing_cache: Update timing cache with current timing cache data if set to True.
@@ -109,7 +110,8 @@ def lower_to_trt(
         max_batch_size=max_batch_size,
         max_workspace_size=max_workspace_size,
         explicit_batch_dimension=explicit_batch_dimension,
-        lower_precision=lower_precision,
+        fp16_mode=fp16_mode,
+        int8_mode=int8_mode,
         verbose_log=verbose_log,
         timing_cache_prefix=timing_cache_prefix,
         save_timing_cache=save_timing_cache,
@@ -135,7 +137,9 @@ class LowerSetting:

     explicit_precision: Use explicit precision during lowering.

-    lower_precision: lower_precision during lowering. Can select between fp32, fp16 and int8.
+    fp16_mode: Enable FP16 dtype during lowering.
+
+    int8_mode: Enable Int8 dtype during lowering.

     max_workspace_size: The maximum workspace size. The maximum GPU temporary
         memory which the TensorRT engine can use at execution time.
@@ -175,7 +179,8 @@ class LowerSetting:
     input_specs: List[InputTensorSpec] = dc.field(default_factory=list)
     explicit_batch_dimension: bool = True
     explicit_precision: bool = False
-    lower_precision: LowerPrecision = LowerPrecision.FP32
+    fp16_mode: bool = False
+    int8_mode: bool = False
     max_workspace_size: int = 1 << 30
     strict_type_constraints: bool = False
     customized_fuse_pass: Sequence = ()
@@ -266,7 +271,8 @@ def __call__(self, mod, input, split_name) -> TRTInterpreterResult:
         interp_result: TRTInterpreterResult = interpreter.run(
             max_batch_size=self.lower_setting.max_batch_size,
             max_workspace_size=self.lower_setting.max_workspace_size,
-            lower_precision=self.lower_setting.lower_precision,
+            fp16_mode=self.lower_setting.fp16_mode,
+            int8_mode=self.lower_setting.int8_mode,
             strict_type_constraints=self.lower_setting.strict_type_constraints,
             algorithm_selector=algo_selector,
             timing_cache=cache_data,
@@ -344,7 +350,7 @@ def __call__(
     ) -> nn.Module:
         module.eval()

-        if self.lower_setting.lower_precision == LowerPrecision.FP16:
+        if self.lower_setting.fp16_mode:
             module.half()
             inputs = tuple(x.half() if x.dtype == torch.float32 else x for x in inputs)
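For callers that configure lowering through LowerSetting rather than the lower_to_trt wrapper, the reverted dataclass exposes the same two booleans. A hedged configuration sketch (import path assumed from this file's location):

from fx2trt_oss.fx.lower import LowerSetting

# FP16 lowering; note the dataclass default is fp16_mode=False (FP32),
# while the lower_to_trt() convenience wrapper defaults to fp16_mode=True.
setting = LowerSetting(max_batch_size=256, fp16_mode=True, int8_mode=False)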

fx/utils.py

Lines changed: 0 additions & 7 deletions
@@ -5,13 +5,6 @@
 import torch

 from .types import Shape, TRTDataType
-from enum import Enum
-
-
-class LowerPrecision(Enum):
-    FP32 = "fp32"
-    FP16 = "fp16"
-    INT8 = "int8"


 def torch_dtype_to_trt(dtype: torch.dtype) -> TRTDataType:
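With the enum removed, the three precision states it encoded correspond to flag combinations as follows (an editorial mapping for readers migrating call sites, not code in the repository):

# LowerPrecision.FP32 -> fp16_mode=False, int8_mode=False
# LowerPrecision.FP16 -> fp16_mode=True,  int8_mode=False
# LowerPrecision.INT8 -> fp16_mode=False, int8_mode=True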

test/quant/test_quant_trt.py

Lines changed: 1 addition & 2 deletions
@@ -16,7 +16,6 @@
     TRTModule,
 )
 from fx2trt_oss.fx.lower import run_const_fold
-from fx2trt_oss.fx.utils import LowerPrecision
 from fx2trt_oss.tracer.acc_tracer import acc_ops
 from torch.ao.quantization import default_qconfig
 from torch.ao.quantization.quantize_fx import (
@@ -54,7 +53,7 @@ def lower_to_trt(model, inputs, shape_ranges):
         model,
         input_specs,
         explicit_batch_dimension=True, explicit_precision=True)
-    result = interp.run(lower_precision=LowerPrecision.INT8)
+    result = interp.run(fp16_mode=False, int8_mode=True)
     trt_mod = TRTModule(result.engine, result.input_names, result.output_names)
     return trt_mod
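The test helper above takes explicit shape ranges for the dynamic batch dimension. A hypothetical invocation (the quantized model and shapes are illustrative, not from the test file):

import torch

shape_ranges = [((1, 3, 5, 5), (5, 3, 5, 5), (10, 3, 5, 5))]
trt_mod = lower_to_trt(quantized_model, [torch.randn(5, 3, 5, 5)], shape_ranges)
out = trt_mod(torch.randn(5, 3, 5, 5).cuda())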
