
Commit 68d29f7

tarun292 authored and facebook-github-bot committed

Add fuse_dq_q_pass in exir/passes and also add it to HTP backend (#2295)

Summary: Passes such as https://fburl.com/code/vs6n4vcv remove no-ops from the graph. The problem is that after such a pass runs, it leaves the surrounding dq -> q nodes in the graph, and these can then get delegated to the backend, causing perf regressions. This pass removes a dq -> q pair when the qparams of the two nodes have the same values; otherwise it leaves them untouched.

Differential Revision: D54543323
1 parent 341d2d9 commit 68d29f7
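Why is a dq -> q pair with identical qparams a no-op? Dequantize maps a quantized value q to (q - zero_point) * scale, and quantizing that result with the same scale, zero_point, range, and dtype rounds straight back to q. A minimal numeric sketch, not part of this commit; it assumes the decomposed quant ops are registered by importing torch.ao.quantization.fx._decomposed:

```python
import torch

# Assumption: importing this module registers the torch.ops.quantized_decomposed
# ops used below; it is not part of the commit itself.
import torch.ao.quantization.fx._decomposed  # noqa: F401

scale, zero_point, qmin, qmax = 0.05, 0, -128, 127
q = torch.ops.quantized_decomposed.quantize_per_tensor.default(
    torch.randn(4), scale, zero_point, qmin, qmax, torch.int8
)
# Dequantize computes (q - zero_point) * scale ...
dq = torch.ops.quantized_decomposed.dequantize_per_tensor.default(
    q, scale, zero_point, qmin, qmax, torch.int8
)
# ... and re-quantizing with the *same* qparams reproduces q exactly,
# so the dq -> q pair is an identity and can be fused away.
requant = torch.ops.quantized_decomposed.quantize_per_tensor.default(
    dq, scale, zero_point, qmin, qmax, torch.int8
)
assert torch.equal(q, requant)
```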

File tree

exir/passes/remove_noop_pass.py
exir/tests/test_passes.py

2 files changed: +140 -27 lines

exir/passes/remove_noop_pass.py

Lines changed: 70 additions & 27 deletions
```diff
@@ -6,38 +6,81 @@
 
 # pyre-strict
 
+from typing import Tuple
+
 import torch
-from executorch.exir.pass_base import ExportPass, ProxyValue
-from torch.utils import _pytree as pytree
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch.fx import GraphModule
+
+_DEQUANT_OPS: Tuple[torch._ops.OpOverload] = (
+    torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+    torch.ops.quantized_decomposed.dequantize_per_channel.default,
+)
+_QUANT_OPS: Tuple[torch._ops.OpOverload] = (
+    torch.ops.quantized_decomposed.quantize_per_tensor.default,
+    torch.ops.quantized_decomposed.quantize_per_channel.default,
+)
 
 
 class RemoveNoopPass(ExportPass):
     """
     Removes noops that pass through arguments.
     """
 
-    # pyre-ignore
-    def call_operator(self, op, args, kwargs, meta):
-        if op not in (
-            torch.ops.aten.to.dtype,
-            torch.ops.aten.dropout.default,
-            torch.ops.aten.slice_copy.Tensor,
-        ):
-            return super().call_operator(op, args, kwargs, meta)
-
-        args_data, kwargs_data = pytree.tree_map_only(
-            ProxyValue, lambda x: x.data, (args, kwargs)
-        )
-        orig_tensor = (
-            args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
-        )
-
-        if orig_tensor is op(*args_data, **kwargs_data):
-            return args[0]
-
-        if op == torch.ops.aten.slice_copy.Tensor:
-            result = op(*args_data, **kwargs_data)
-            if orig_tensor.size() == result.size():
-                return args[0]
-
-        return super().call_operator(op, args, kwargs, meta)
+    def remove_quantized_op(
+        self, graph_module: GraphModule, node: torch.fx.Node
+    ) -> None:
+        node_input = list(node.args)[0]
+
+        if not isinstance(node_input, torch.fx.Node):
+            return
+
+        # Let's assume that when entering this section of code the graph pattern is as follows:
+        # Node A -> DQ -> slice_copy -> Q -> Node B. If the qparams of the DQ and Q are the same,
+        # then after this the graph will look like this:
+        # Node A -> Node B.
+        if node_input.target in _DEQUANT_OPS:
+            for user in list(node.users):
+                if user.target in _QUANT_OPS:
+                    # Drop the input arg and check that the qparams are the same.
+                    qparams_dq = list(node_input.args)[1:]
+                    qparams_q = list(user.args)[1:]
+                    if qparams_dq != qparams_q:
+                        return
+                    user.replace_all_uses_with(node_input.args[0])
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        for node in graph_module.graph.nodes:
+            if node.op != "call_function":
+                continue
+
+            if node.target not in (
+                torch.ops.aten.to.dtype,
+                torch.ops.aten.dropout.default,
+                torch.ops.aten.slice_copy.Tensor,
+            ):
+                continue
+
+            orig_tensor = node.args[0].meta["val"]
+
+            if orig_tensor is node.meta["val"]:
+                # If the graph is quantized, we must remove the entire pattern consisting of dq->op->q.
+                # Otherwise, removing only the op will suffice.
+                if node.args[0].target in _DEQUANT_OPS:
+                    self.remove_quantized_op(graph_module, node)
+                else:
+                    node.replace_all_uses_with(node.args[0])
+                continue
+
+            if node.target == torch.ops.aten.slice_copy.Tensor:
+                if orig_tensor.size() == node.meta["val"].size():
+                    # If the graph is quantized, we must remove the entire pattern consisting of dq->op->q.
+                    # Otherwise, removing only the op will suffice.
+                    if node.args[0].target in _DEQUANT_OPS:
+                        self.remove_quantized_op(graph_module, node)
+                    else:
+                        node.replace_all_uses_with(node.args[0])
+
+        graph_module.graph.lint()
+        graph_module.graph.eliminate_dead_code()
+        return PassResult(graph_module, True)
```
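Two implementation notes. First, the pass moved from `call_operator`, which visits one operator at a time, to a graph-level `call`, because deciding whether a dq -> op -> q pattern can be dropped requires inspecting the no-op node's producer (the dq) and its users (the q), not just the node itself. Second, the qparams comparison relies on the argument layout of the decomposed quant ops: for `quantize_per_tensor`/`dequantize_per_tensor` the args are `(input, scale, zero_point, quant_min, quant_max, dtype)` (the per-channel variants are analogous), so slicing off the input with `args[1:]` leaves exactly the qparams. A standalone sketch of that check, with an illustrative helper name that is not part of the commit:

```python
from torch.fx import Node

def same_qparams(dq: Node, q: Node) -> bool:
    # Illustrative helper, not from the commit. For the decomposed quant ops
    # the node args are (input, scale, zero_point, quant_min, quant_max, dtype),
    # so everything after the input arg must match for the fusion to be safe.
    return list(dq.args)[1:] == list(q.args)[1:]
```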

exir/tests/test_passes.py

Lines changed: 70 additions & 0 deletions
```diff
@@ -50,6 +50,12 @@
 from functorch.experimental import control_flow
 
 from torch import nn
+
+from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
+from torch.ao.quantization.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+    XNNPACKQuantizer,
+)
 from torch.export import export
 from torch.fx import GraphModule, subgraph_rewriter
 from torch.fx.experimental.proxy_tensor import make_fx
@@ -1244,3 +1250,67 @@ def forward(self, x):
         # %copy__default : [num_users=1] = call_function[target=torch.ops.aten.copy_.default](args = (%arg0_1, %aten_add_tensor_1), kwargs = {})
         # return (copy__default, aten_add_tensor)
         self.assertEqual(count_copies(gm), 1)
+
+    def test_remove_quantized_op_noop_pass(self) -> None:
+        class TestAddSlice(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                x = x + x
+                x = x + x[:]
+                return x
+
+        def count_dq_nodes(gm: torch.fx.GraphModule) -> int:
+            return sum(
+                (
+                    node.target
+                    in (
+                        torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+                        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+                    )
+                )
+                for node in gm.graph.nodes
+            )
+
+        def count_q_nodes(gm: torch.fx.GraphModule) -> int:
+            return sum(
+                (
+                    node.target
+                    in (
+                        torch.ops.quantized_decomposed.quantize_per_tensor.default,
+                        exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+                    )
+                )
+                for node in gm.graph.nodes
+            )
+
+        example_inputs = (torch.randn(9, 8),)
+        model = TestAddSlice()
+        m_eager = model.eval()
+
+        # program capture
+        m = torch._export.capture_pre_autograd_graph(
+            m_eager,
+            example_inputs,
+        )
+
+        quantizer = XNNPACKQuantizer()
+        quantization_config = get_symmetric_quantization_config()
+        quantizer.set_global(quantization_config)
+        m = prepare_pt2e(m, quantizer)
+        m = convert_pt2e(m, fold_quantize=True)
+        ep = torch.export.export(m, example_inputs)
+        dq_nodes_pre = count_dq_nodes(ep.graph_module)
+        q_nodes_pre = count_q_nodes(ep.graph_module)
+        edge = to_edge(ep, compile_config=EdgeCompileConfig(_check_ir_validity=False))
+
+        dq_nodes_post = count_dq_nodes(edge.exported_program().graph_module)
+        q_nodes_post = count_q_nodes(edge.exported_program().graph_module)
+        # One dq and one q node around the slice copy should have been removed.
+        self.assertEqual(dq_nodes_pre - dq_nodes_post, 1)
+        self.assertEqual(q_nodes_pre - q_nodes_post, 1)
+
+        # Check that the slice_copy is removed by the RemoveNoopPass.
+        for node in edge.exported_program().graph_module.graph.nodes:
+            self.assertFalse("slice" in str(node.target))
```
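Beyond the end-to-end test above, a smaller way to exercise the pass is to call it directly on a graph module, since `ExportPass` subclasses are callable and return a `PassResult`. A hedged sketch, assuming that `run_decompositions()` lowers the full slice to `aten.slice_copy.Tensor`, the op the pass matches:

```python
import torch
from executorch.exir.passes.remove_noop_pass import RemoveNoopPass

class FullSlice(torch.nn.Module):
    def forward(self, x):
        # x[:] is a full-size slice, i.e. a no-op once exported.
        return x[:] + x

ep = torch.export.export(FullSlice(), (torch.randn(4, 4),))
# Assumption: run_decompositions() functionalizes view ops into their
# *_copy variants, turning the slice into aten.slice_copy.Tensor.
ep = ep.run_decompositions()
result = RemoveNoopPass()(ep.graph_module)
assert result is not None
# The no-op slice should be gone after the pass runs.
assert not any(
    "slice" in str(n.target) for n in result.graph_module.graph.nodes
)
```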
