
Commit f004294

tarun292 authored and facebook-github-bot committed
Add fuse_dq_q_pass in exir/passes and also add it to HTP backend (#2295)
Summary: There are passes such as https://fburl.com/code/vs6n4vcv that remove no-ops from the graph. The problem is that after such a pass runs, the dq->q node pairs are left behind in the graph and can then get delegated to the backend, causing perf regressions. This pass removes a dq->q pair when the two nodes' qparams have the same values; otherwise it leaves them untouched. Differential Revision: D54543323
1 parent 47b837b commit f004294
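
To make the motivation concrete, here is a small illustrative sketch (not part of this commit) of why a dequantize -> quantize pair with identical qparams is a no-op. The scale and zero-point values are made up for the example, and the _decomposed import is only there to register the quantized_decomposed ops.

# Illustration only: a dq -> q round trip with matching qparams returns the
# original quantized tensor, which is why FuseDQandQPass can drop the pair.
import torch
from torch.ao.quantization.fx import _decomposed  # noqa: F401  (registers quantized_decomposed ops)

scale, zero_point, qmin, qmax, dtype = 0.05, 0, -128, 127, torch.int8
x_int8 = torch.randint(qmin, qmax + 1, (4,), dtype=dtype)

x_fp = torch.ops.quantized_decomposed.dequantize_per_tensor(
    x_int8, scale, zero_point, qmin, qmax, dtype
)
x_requant = torch.ops.quantized_decomposed.quantize_per_tensor(
    x_fp, scale, zero_point, qmin, qmax, dtype
)
assert torch.equal(x_requant, x_int8)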

File tree: 4 files changed, +100 −0 lines changed

exir/passes/TARGETS

13 additions, 0 deletions

@@ -10,6 +10,7 @@ python_library(
     deps = [
         ":const_prop_pass",
         ":debug_handle_generator_pass",
+        ":fuse_dq_q_pass",
         ":insert_write_back_for_buffers_pass",
         ":memory_format_ops_pass",
         ":memory_planning_pass",
@@ -299,3 +300,15 @@ python_library(
         "//executorch/exir/dialects/edge:lib",
     ],
 )
+
+python_library(
+    name = "fuse_dq_q_pass",
+    srcs = [
+        "fuse_dq_q_pass.py",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/exir:pass_base",
+        "//executorch/exir/dialects:lib",
+    ],
+)

exir/passes/fuse_dq_q_pass.py

37 additions, 0 deletions (new file)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.pass_base import ExportPass
from torch.fx import GraphModule
from torch.fx.passes.infra.pass_base import PassResult


class FuseDQandQPass(ExportPass):
    def call(self, graph_module: GraphModule) -> PassResult:
        for node in graph_module.graph.nodes:
            if node.op != "call_function":
                continue
            if (
                node.target
                == exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
            ):
                if all(
                    user.target
                    == exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
                    for user in list(node.users)
                ):
                    for user in list(node.users):
                        # Drop the input arg and check that the qparams are the same.
                        qparams_dq = list(node.args)[1:]
                        qparams_q = list(user.args)[1:]
                        if qparams_dq != qparams_q:
                            continue
                        user.replace_all_uses_with(node.args[0])

        graph_module.graph.lint()
        graph_module.graph.eliminate_dead_code()
        return PassResult(graph_module, True)

exir/tests/TARGETS

1 addition, 0 deletions

@@ -213,6 +213,7 @@ python_unittest(
         "//executorch/exir/emit:lib",
         "//executorch/exir/passes:constant_prop_pass",
         "//executorch/exir/passes:debug_handle_generator_pass",
+        "//executorch/exir/passes:fuse_dq_q_pass",
         "//executorch/exir/passes:insert_write_back_for_buffers_pass",
         "//executorch/exir/passes:lib",
         "//executorch/exir/passes:remove_graph_asserts_pass",

exir/tests/test_passes.py

49 additions, 0 deletions

@@ -33,6 +33,7 @@
 )
 from executorch.exir.passes.constant_prop_pass import constant_prop_pass
 from executorch.exir.passes.debug_handle_generator_pass import DebugHandleGeneratorPass
+from executorch.exir.passes.fuse_dq_q_pass import FuseDQandQPass
 from executorch.exir.passes.insert_write_back_for_buffers_pass import (
     insert_write_back_for_buffers_pass,
 )
@@ -50,6 +51,12 @@
 from functorch.experimental import control_flow

 from torch import nn
+
+from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
+from torch.ao.quantization.quantizer.xnnpack_quantizer import (
+    get_symmetric_quantization_config,
+    XNNPACKQuantizer,
+)
 from torch.export import export
 from torch.fx import GraphModule, subgraph_rewriter
 from torch.fx.experimental.proxy_tensor import make_fx
@@ -1244,3 +1251,45 @@ def forward(self, x):
         # %copy__default : [num_users=1] = call_function[target=torch.ops.aten.copy_.default](args = (%arg0_1, %aten_add_tensor_1), kwargs = {})
         # return (copy__default, aten_add_tensor)
         self.assertEqual(count_copies(gm), 1)
+
+    def test_dq_q_fusion_pass(self) -> None:
+        class TestLinearAdd(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear1 = torch.nn.Linear(8, 16)
+
+            def forward(self, x):
+                x1 = self.linear1(x)
+                x1 = x1 + x1
+                return x1
+
+        example_inputs = (torch.randn(9, 8),)
+        model = TestLinearAdd()
+        m_eager = model.eval()
+
+        # program capture
+        m = torch._export.capture_pre_autograd_graph(
+            m_eager,
+            example_inputs,
+        )
+
+        quantizer = XNNPACKQuantizer()
+        quantization_config = get_symmetric_quantization_config()
+        quantizer.set_global(quantization_config)
+        m = prepare_pt2e(m, quantizer)
+        m = convert_pt2e(m, fold_quantize=True)
+        ep = torch.export.export(m, example_inputs)
+
+        edge = to_edge(ep, compile_config=EdgeCompileConfig(_check_ir_validity=False))
+        for node in edge.exported_program().graph_module.graph.nodes:
+            # Remove add node so that we can test the transform pass which should
+            # remove the dq and q nodes.
+            if "add" in node.name:
+                node.replace_all_uses_with(node.args[0])
+        edge.exported_program().graph_module.graph.eliminate_dead_code()
+
+        len_pre_transform = len(edge.exported_program().graph_module.graph.nodes)
+        edge.transform([FuseDQandQPass()])
+        len_post_transform = len(edge.exported_program().graph_module.graph.nodes)
+        # As one dq and one q node are removed, the number of nodes should be reduced by 2.
+        self.assertEqual(len_pre_transform - len_post_transform, 2)
