
Commit 6a7f4db

Shirong Wu authored and Wei Wei committed
First step of refactor lower passes (#74219)
Summary:
X-link: pytorch/pytorch#74219
Pull Request resolved: https://github.com/pytorch/fx2trt/pull/18

This is the first diff in a series that refactors and cleans up the lowering process and pass management.

Reviewed By: yinghai

Differential Revision: D34764123

fbshipit-source-id: 57fa930abefae56654219225167b52d7dd79e03a
1 parent 6110041 commit 6a7f4db

File tree

7 files changed: +132 -33 lines changed

fx/lower.py

Lines changed: 16 additions & 25 deletions
@@ -21,10 +21,8 @@
 from .input_tensor_spec import (
     InputTensorSpec,
 )
-from .passes.fuse_pass import (
-    fuse_permute_linear,
-    fuse_permute_matmul,
-)
+from .passes.pass_utils import chain_passes, PassFunc
+from .passes.lower_basic_pass import fuse_permute_matmul, fuse_permute_linear
 from .passes.remove_duplicate_output_args import (
     remove_duplicate_output_args,
 )
@@ -74,9 +72,6 @@ class PassContext(NamedTuple):
     lower_setting: "LowerSetting"
     module_name: str = ""
 
-# Function signature for a graph module pass
-PassFunc = Callable[[nn.Module, PassContext], Tuple[nn.Module, PassContext]]
-
 
 def lower_to_trt(
     module: nn.Module,
@@ -85,7 +80,6 @@ def lower_to_trt(
     max_workspace_size=1 << 25,
     explicit_batch_dimension=False,
     fp16_mode=True,
-    enable_fuse=True,
     verbose_log=False,
     timing_cache_prefix="",
     save_timing_cache=False,
@@ -102,8 +96,6 @@ def lower_to_trt(
         max_workspace_size: Maximum size of workspace given to TensorRT.
         explicit_batch_dimension: Use explicit batch dimension in TensorRT if set True, otherwise use implicit batch dimension.
         fp16_mode: fp16 config given to TRTModule.
-        enable_fuse: Enable pass fusion during lowering if set to true. l=Lowering will try to find pattern defined
-        in fx2trt_oss.fx.passes from original module, and replace with optimized pass before apply lowering.
         verbose_log: Enable verbose log for TensorRT if set True.
         timing_cache_prefix: Timing cache file name for timing cache used by fx2trt.
         save_timing_cache: Update timing cache with current timing cache data if set to True.
@@ -117,7 +109,6 @@ def lower_to_trt(
         max_workspace_size=max_workspace_size,
         explicit_batch_dimension=explicit_batch_dimension,
         fp16_mode=fp16_mode,
-        enable_fuse=enable_fuse,
         verbose_log=verbose_log,
         timing_cache_prefix=timing_cache_prefix,
         save_timing_cache=save_timing_cache,
@@ -153,14 +144,12 @@ class LowerSetting:
     strict_type_constraints: Require TensorRT engine to strictly follow data type
     setting at execution time.
 
-    enable_fuse: Enable pass fuse duirng lowering, i.e. fuse multiple operations
-    as (a->b->c->d)=>(e). Current available fuse source patterns are:
-    sparse->matmul->add
+    customized_fuse_pass: List of customized passes to apply during the lowering process.
+
+    lower_basic_fuse_pass: Enable basic pass fusion during lowering, i.e. fuse multiple operations
+    as (a->b->c->d)=>(e). Current basic fuse patterns are:
     permute->linear
     permute->matmul
-    unsqueeze->cat->sum
-
-    enable_fuse_for_sparsity: Enable pass fuse for sparsity.
 
     verbose_log: Enable TensorRT engine verbose log mode.
@@ -191,8 +180,8 @@ class LowerSetting:
     int8_mode: bool = False
     max_workspace_size: int = 1 << 30
     strict_type_constraints: bool = False
-    enable_fuse: bool = True
-    enable_fuse_for_sparsity = False
+    customized_fuse_pass: Sequence = ()
+    lower_basic_fuse_pass: Sequence = (fuse_permute_matmul, fuse_permute_linear)
     verbose_log: bool = False
     algo_selector = None
     timing_cache_prefix: str = ""
@@ -249,10 +238,10 @@ def __call__(self, mod, input, split_name) -> TRTInterpreterResult:
             if self.lower_setting.input_specs
             else InputTensorSpec.from_tensors(input)
         )
-        if self.lower_setting.enable_fuse:
-            mod = fuse_permute_matmul(mod)
-            mod = fuse_permute_linear(mod)
-            FUSE_PASSES_POST_OBSERVER.observe(mod, input)
+
+        if self.lower_setting.lower_basic_fuse_pass:
+            lower_pass = chain_passes(*self.lower_setting.lower_basic_fuse_pass)
+            lower_pass(mod, input)
 
         # Prepare algorithm selector and timing_cache for TRTInterpreter
         algo_selector = None
@@ -363,14 +352,16 @@ def __call__(
         inputs = tuple(x.half() if x.dtype == torch.float32 else x for x in inputs)
 
         # Ensure ast_rewrite is done for input module before const_fold.
-        traced_mod = self.trace_func(module, inputs)  # type: ignore[misc]
+        tracer = chain_passes(self.trace_func, *self.lower_setting.customized_fuse_pass)
+        traced_mod = tracer(module, inputs)  # type: ignore[misc]
 
         # Run const folding.
         traced_mod = run_const_fold(traced_mod)
 
         # Retrace here to eliminate no-op introduced by const folding and map new introduced
         # nodes to acc op nodes.
-        traced_mod = self.trace_func(traced_mod, inputs)  # type: ignore[misc]
+        traced_mod = tracer(traced_mod, inputs)  # type: ignore[misc]
+        FUSE_PASSES_POST_OBSERVER.observe(traced_mod, inputs)
 
         # Run split.
         split_result = self.split_func(traced_mod, inputs, self.lower_setting)  # type: ignore[misc,operator]
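To illustrate the new configuration surface, here is a minimal usage sketch (not part of this diff). It assumes fx2trt_oss is importable with a working CUDA/TensorRT environment; my_extra_pass is a hypothetical placeholder.

import torch
from torch import fx
from fx2trt_oss.fx import lower

def my_extra_pass(module: fx.GraphModule, input) -> fx.GraphModule:
    # A no-op pass that only illustrates the new (module, input) -> module
    # signature that chain_passes expects of customized passes.
    return module

# The old boolean enable_fuse flag is gone: the basic fusions
# (fuse_permute_matmul, fuse_permute_linear) now run by default via
# lower_basic_fuse_pass, and extra passes go in customized_fuse_pass.
setting = lower.LowerSetting(customized_fuse_pass=(my_extra_pass,))
lowerer = lower.Lowerer.create(lower_setting=setting)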

fx/passes/fuse_pass.py renamed to fx/passes/lower_basic_pass.py

Lines changed: 14 additions & 4 deletions
@@ -7,9 +7,16 @@
     get_attr,
 )
 from fx2trt_oss.fx.observer import observable
+from fx2trt_oss.fx.passes.pass_utils import log_before_after, validate_inference
+from typing import Any
 
+# Create an alias for module input type to avoid littering pyre-ignore for Any
+# throughout the file.
+Input = Any
 
-def fuse_sparse_matmul_add(gm: torch.fx.GraphModule):
+@log_before_after
+@validate_inference(atol=1e-3, rtol=1e-2)
+def fuse_sparse_matmul_add(gm: torch.fx.GraphModule, input: Input):
     """
     Replace acc_ops.matmul + acc_ops.add with acc_ops.linear
     TRT8.2 can take advantage of structured sparsity (2:4), but the graph needs contain a single FC layer.
@@ -100,7 +107,9 @@ def check_permute(node: torch.fx.Node):
 
 
 @observable()
-def fuse_permute_linear(gm: torch.fx.GraphModule):
+@log_before_after
+@validate_inference(atol=1e-3, rtol=1e-2)
+def fuse_permute_linear(gm: torch.fx.GraphModule, input: Input):
     """
     Fuse pattern like permute + linear if permute is transposing the last two dimension.
     """
@@ -122,7 +131,9 @@ def fuse_permute_linear(gm: torch.fx.GraphModule):
 
 
 @observable()
-def fuse_permute_matmul(gm: torch.fx.GraphModule):
+@log_before_after
+@validate_inference(atol=1e-3, rtol=1e-2)
+def fuse_permute_matmul(gm: torch.fx.GraphModule, input: Input):
     """
     Fuse pattern like permute + matmul if permute is transposing the last two dimension.
     """
@@ -150,7 +161,6 @@ def fuse_permute_matmul(gm: torch.fx.GraphModule):
     gm.recompile()
     return gm
 
-
 try:
     # @manual=//deeplearning/trt/python:py_tensorrt
     import tensorrt as trt
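The decorators change the pass signature: every basic pass now receives the sample input alongside the module, so it can be logged and validated by running inference before and after. A sketch of a custom pass written against this new shape; remove_dropout is a hypothetical example, not part of this diff, and assumes the module is in eval mode so outputs are unchanged.

import torch
from torch import fx
from fx2trt_oss.fx.passes.pass_utils import log_before_after, validate_inference

@log_before_after
@validate_inference(atol=1e-3, rtol=1e-2)
def remove_dropout(gm: fx.GraphModule, input) -> fx.GraphModule:
    # Replace eval-mode dropout calls with their input tensor;
    # validate_inference then asserts outputs still match within tolerance.
    for node in gm.graph.nodes:
        if node.op == "call_function" and node.target is torch.nn.functional.dropout:
            node.replace_all_uses_with(node.args[0])
    gm.graph.eliminate_dead_code()
    gm.recompile()
    return gm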

fx/passes/pass_utils.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+from typing import List, Any, Callable
+from torch import fx
+import logging
+import torch
+import tempfile
+from functools import wraps
+from torch.fx.passes.shape_prop import ShapeProp
+
+# Create an alias for module input type to avoid littering pyre-ignore for Any
+# throughout the file.
+Input = Any
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+
+PassFunc = Callable[[fx.GraphModule, Input], fx.GraphModule]
+
+def chain_passes(*passes: PassFunc) -> PassFunc:
+    """
+    Chains a sequence of pass functions to form a single pass function
+    """
+
+    def parent_pass(module: fx.GraphModule, input: Input) -> fx.GraphModule:
+        for pass_ in passes:
+            if isinstance(module, torch.fx.GraphModule):
+                ShapeProp(module).propagate(*input)
+            module = pass_(module, input)
+        return module
+
+    return parent_pass
+
+
+def validate_inference(rtol=None, atol=None):
+    def _validate_inference(pass_: PassFunc) -> PassFunc:
+        """
+        Wraps a pass function to validate that its inference results before and
+        after the pass run should be `allclose`.
+        """
+
+        @wraps(pass_)
+        def pass_with_validation(
+            module: fx.GraphModule, input: Input
+        ) -> fx.GraphModule:
+            res0 = module(*input)
+            module = pass_(module, input)
+            res1 = module(*input)
+
+            tensor_res_0 = _collect_tensors(res0)
+            tensor_res_1 = _collect_tensors(res1)
+
+            for kk, (x, y) in enumerate(zip(tensor_res_0, tensor_res_1)):
+                kwargs = {}
+                if rtol:
+                    kwargs["rtol"] = rtol
+                if atol:
+                    kwargs["atol"] = atol
+                assert torch.allclose(
+                    x, y, **kwargs
+                ), f"pass {pass_} failed correctness check due to output {kk}"
+            return module
+
+        return pass_with_validation
+
+    return _validate_inference
+
+
+def log_before_after(pass_: PassFunc) -> PassFunc:
+    """
+    Wraps a pass function to log the module graph before and after the pass
+    """
+
+    @wraps(pass_)
+    def pass_with_before_after_log(
+        module: fx.GraphModule, input: Input
+    ) -> fx.GraphModule:
+        with tempfile.NamedTemporaryFile(
+            mode="w",
+            encoding="utf-8",
+            delete=False,
+        ) as f:
+            print(f"== Log pass {pass_} before/after graph to {f.name}")
+            print(f"[{pass_}] Before:\n{module.graph}", file=f)
+            module = pass_(module, input)
+            print(f"[{pass_}] After:\n{module.graph}", file=f)
+        return module
+
+    return pass_with_before_after_log
+
+
+def _collect_tensors(arg: fx.node.Argument) -> List[torch.Tensor]:
+    """Collects all the tensors found in a nested container object"""
+    res: List[torch.Tensor] = []
+
+    def collect(x: fx.node.Argument) -> fx.node.Argument:
+        if isinstance(x, torch.Tensor):
+            res.append(x)
+        return x
+
+    fx.node.map_aggregate(arg, collect)
+    return res
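A standalone usage sketch for these utilities; the two passes and the traced module are toy examples, not part of this diff. chain_passes re-runs ShapeProp before each pass so shape metadata stays fresh, and validate_inference compares outputs before and after the wrapped pass.

import torch
from torch import fx, nn
from fx2trt_oss.fx.passes.pass_utils import chain_passes, validate_inference

@validate_inference(atol=1e-3, rtol=1e-2)
def noop_pass(gm: fx.GraphModule, input) -> fx.GraphModule:
    # Does nothing; exists only to show the PassFunc shape.
    return gm

def recompile_pass(gm: fx.GraphModule, input) -> fx.GraphModule:
    gm.recompile()  # regenerate forward() from the (unchanged) graph
    return gm

mod = fx.symbolic_trace(nn.Sequential(nn.Linear(4, 4), nn.ReLU()))
pipeline = chain_passes(noop_pass, recompile_pass)
mod = pipeline(mod, (torch.randn(2, 4),))  # input is unpacked into ShapeProp.propagate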

test/passes/test_fuse_permute_linear_trt.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 import torch
 import fx2trt_oss.tracer.acc_tracer.acc_ops as acc_ops
 from torch.testing._internal.common_fx2trt import AccTestCase
-from fx2trt_oss.fx.passes.fuse_pass import (
+from fx2trt_oss.fx.passes.lower_basic_pass import (
     fuse_permute_linear,
     trt_transposed_linear,
 )

test/passes/test_fuse_permute_matmul_trt.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 import fx2trt_oss.tracer.acc_tracer.acc_ops as acc_ops
 from torch.testing._internal.common_fx2trt import AccTestCase
 from parameterized import parameterized, param
-from fx2trt_oss.fx.passes.fuse_pass import (
+from fx2trt_oss.fx.passes.lower_basic_pass import (
     fuse_permute_matmul,
     trt_transposed_matmul,
 )

test/passes/test_multi_fuse_trt.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 import fx2trt_oss.tracer.acc_tracer.acc_ops as acc_ops
 from torch.testing._internal.common_fx2trt import AccTestCase
 from parameterized import parameterized
-from fx2trt_oss.fx.passes.fuse_pass import (
+from fx2trt_oss.fx.passes.lower_basic_pass import (
     fuse_permute_linear,
     trt_transposed_linear,
     fuse_permute_matmul,

test/trt_lower/test_observer_gpu.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ def forward(self, x, y):
 
         with execution_verifier() as verify_execution:
 
-            lowerer = lower.Lowerer.create(lower_setting=lower.LowerSetting(enable_fuse=True))
+            lowerer = lower.Lowerer.create(lower_setting=lower.LowerSetting())
             # Update `lowerer.split_func` to make sure the test model is split
             # onto the trt partition:
             lowerer = replace(
