@@ -862,6 +862,73 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        result = super().call(graph_module)
        return result

+@register_cadence_pass(CadencePassAttribute(opt_level=1))
+class FuseMulTensorIntoQuantPass(ExportPass):
+    """
+    Looks for the pattern where aten.mul.Tensor is followed by a quant node.
+    If found, updates the quant scale to reflect the multiplication and
+    removes the mul node.
+    """
+    def attempt_fusion(
+        self, graph_module: torch.fx.GraphModule, mul_node: torch.fx.Node
+    ) -> None:
+        if mul_node.target != exir_ops.edge.aten.mul.Tensor:
+            return
+
+        full_nodes = [
+            arg
+            for arg in mul_node.args
+            if isinstance(arg, torch.fx.Node)
+            and arg.target == exir_ops.edge.aten.full.default
+        ]
+
+        if len(full_nodes) != 1 or len(mul_node.users) != 1:
+            return
+
+        full_node = full_nodes[0]
+        mul_user = list(mul_node.users.keys())[0]
+
+        if mul_user.target not in {
+            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            exir_ops.edge.cadence.quantize_per_tensor.default,
+        }:
+            return
+
+        quant_node = mul_user
+
+        # First create a copy of the current args
+        new_quant_args = list(quant_node.args)
+        assert isinstance(quant_node.args[1], Number)
+        assert isinstance(full_node.args[1], Number)
+        # Folding mul(x, c) into quantize(x, scale) requires dividing the
+        # scale by c, since quantize computes round(x / scale) + zero_point.
+        # pyre-ignore[58]: Unsupported operand /
+        new_scale = quant_node.args[1] / full_node.args[1]
+
+        logging.debug(
+            f"Fused {mul_node} and {full_node} into {quant_node}. Updated scale from {quant_node.args[1]} to {new_scale}"
+        )
+
+        # Replace the input first
+        quant_node.replace_input_with(cast(torch.fx.Node, quant_node.args[0]), cast(torch.fx.Node, mul_node.args[0]))
+
+        # Now update the scale in the args
+        new_quant_args = list(quant_node.args)
+        new_quant_args[1] = new_scale
+        quant_node.args = tuple(new_quant_args)
+
+        # Clean up the mul_node
+        mul_node.args = tuple()
+        mul_node.users = {}
+
+        graph_module.graph.erase_node(mul_node)
+        graph_module.graph.erase_node(full_node)
+        graph_module.recompile()
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        for node in graph_module.graph.nodes:
+            self.attempt_fusion(graph_module, node)
+        result = super().call(graph_module)
+        return result
+

@register_cadence_pass(CadencePassAttribute(opt_level=1))
class FuseMulTensorIntoDequantPass(ExportPass):
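For reference, the fusion above relies on the identity quantize(x * c, scale) == quantize(x, scale / c) under the standard quantized_decomposed convention q = clamp(round(x / scale) + zero_point, qmin, qmax). The snippet below is a minimal sketch, not part of this change, that checks the identity numerically; the helper `quantize_ref` and all constants are made up for illustration, and power-of-two values are chosen so the floating-point comparison is exact.

```python
import torch


def quantize_ref(
    x: torch.Tensor, scale: float, zero_point: int, qmin: int, qmax: int
) -> torch.Tensor:
    # Hypothetical reference helper (not from the PR): the standard
    # quantize formula round(x / scale) + zero_point, clamped to [qmin, qmax].
    return torch.clamp(torch.round(x / scale) + zero_point, qmin, qmax).to(torch.int8)


x = torch.randn(16)
c = 0.5            # constant produced by the aten.full.default node
scale, zp = 0.125, 0
qmin, qmax = -128, 127

# Original graph: full -> mul -> quantize(scale)
q_before = quantize_ref(x * c, scale, zp, qmin, qmax)
# After fusion: quantize(scale / c) applied directly to the mul's input
q_after = quantize_ref(x, scale / c, zp, qmin, qmax)

assert torch.equal(q_before, q_after)
```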