pytorch
diff --git a/.lintrunner.toml b/.lintrunner.toml
Lines changed: 1 addition & 0 deletions
diff --git a/backends/apple/coreml/test/test_coreml_quantizer.py b/backends/apple/coreml/test/test_coreml_quantizer.py
Lines changed: 2 additions & 2 deletions
diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py
Lines changed: 2 additions & 2 deletions
diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py
Lines changed: 1 addition & 0 deletions
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/backends/arm/operators/op_reciprocal.py b/backends/arm/operators/op_reciprocal.py
Lines changed: 79 additions & 0 deletions
diff --git a/backends/arm/passes/arm_pass_manager.py b/backends/arm/passes/arm_pass_manager.py
Lines changed: 10 additions & 0 deletions
diff --git a/backends/arm/passes/arm_pass_utils.py b/backends/arm/passes/arm_pass_utils.py
Lines changed: 66 additions & 0 deletions
diff --git a/backends/arm/passes/convert_expand_copy_to_repeat.py b/backends/arm/passes/convert_expand_copy_to_repeat.py
Lines changed: 22 additions & 40 deletions
diff --git a/backends/arm/passes/convert_split_to_slice.py b/backends/arm/passes/convert_split_to_slice.py
Lines changed: 5 additions & 4 deletions
diff --git a/backends/arm/passes/decompose_div_pass.py b/backends/arm/passes/decompose_div_pass.py
Lines changed: 45 additions & 0 deletions
@@ -180,6 +180,7 @@ exclude_patterns = [
     '**/*.bat',
     '**/*.jpg',
     '**/*.jar',
+    '**/*.gif',
     # File contains @generated
     'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
     'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h',
 
@@ -15,12 +15,12 @@
 )
 
 from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
-from torch._export import capture_pre_autograd_graph
 from torch.ao.quantization.quantize_pt2e import (
     convert_pt2e,
     prepare_pt2e,
     prepare_qat_pt2e,
 )
+from torch.export import export_for_training
 
 
 class TestCoreMLQuantizer:
@@ -32,7 +32,7 @@ def quantize_and_compare(
     ) -> None:
         assert quantization_type in {"PTQ", "QAT"}
 
-        pre_autograd_aten_dialect = capture_pre_autograd_graph(model, example_inputs)
+        pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()
 
         quantization_config = LinearQuantizerConfig.from_dict(
             {
 
@@ -209,9 +209,9 @@ def lower_module_and_test_output(
 
         expected_output = model(*sample_inputs)
 
-        model = torch._export.capture_pre_autograd_graph(
+        model = torch.export.export_for_training(
             model, sample_inputs, dynamic_shapes=dynamic_shapes
-        )
+        ).module()
 
         edge_program = export_to_edge(
             model,
 
@@ -57,6 +57,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             exir_ops.edge.aten.sigmoid.default,
             exir_ops.edge.aten.mm.default,
             exir_ops.edge.aten.repeat.default,
+            exir_ops.edge.aten.reciprocal.default,
             exir_ops.edge.aten.relu.default,
             exir_ops.edge.aten.rsqrt.default,
             exir_ops.edge.aten._softmax.default,
 
@@ -15,7 +15,6 @@
     op_cat,
     op_conv2d,
     op_dequant,
-    op_div,
     op_exp,
     op_full,
     op_get_item,
@@ -26,6 +25,7 @@
     op_mul,
     op_permute,
     op_quant,
+    op_reciprocal,
     op_relu,
     op_repeat,
     op_rsqrt,
 
@@ -0,0 +1,79 @@
+# Copyright 2023-2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import List
+
+import numpy as np
+
+import serializer.tosa_serializer as ts
+import torch
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.arm.tosa_mapping import TosaArg
+from executorch.backends.arm.tosa_quant_utils import (
+    dequantize_value,
+    get_quant_node_args,
+    QuantArgs,
+    quantize_value,
+)
+from serializer.tosa_serializer import TosaOp
+
+
+@register_node_visitor
+class DivVisitor(NodeVisitor):  # NOTE(review): handles reciprocal, not div
+    target = "aten.reciprocal.default"
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        tosa_graph: ts.TosaSerializer,
+        inputs: List[TosaArg],
+        output: TosaArg,
+        is_quant_node: bool,
+    ) -> None:
+        # Emit 1/x: a TABLE lookup when quantized, the RECIPROCAL op otherwise.
+        # In the quantized case the output quant args come from the first user of 'node'.
+        if is_quant_node:
+            input = inputs[0]
+            input_qargs = get_quant_node_args(node.all_input_nodes[0])
+            output_qargs = get_quant_node_args(list(node.users)[0])
+            # Precompute quantized 1/x as a table and pass it as the TABLE attribute.
+            div_table = div_table_8bit(input_qargs, output_qargs)
+
+            table_attr = ts.TosaSerializerAttribute()
+            table_attr.TableAttribute(div_table)
+            tosa_graph.addOperator(
+                TosaOp.Op().TABLE, [input.name], [output.name], table_attr
+            )
+
+        else:
+            tosa_graph.addOperator(
+                TosaOp.Op().RECIPROCAL, [inputs[0].name], [output.name]
+            )
+
+
+def div_table_8bit(in_quantargs: QuantArgs, out_quantargs: QuantArgs):
+    """
+    Returns a 256-entry table of quantized 1/x for inputs spanning [qmin, qmax].
+    """
+
+    def div(x):
+        # Dequantize the table input with the input quantization args.
+        v1 = dequantize_value(x, in_quantargs)
+        # Compute 1/x. NOTE(review): v1 may be 0 - confirm this cannot raise.
+        v2 = 1.0 / v1
+        # Requantize the result with the output quantization args.
+        v3 = quantize_value(v2, out_quantargs)
+
+        return v3
+    # One entry per sample of np.linspace(qmin, qmax, 256), taken as int8.
+    return [
+        div(x)
+        for x in np.linspace(in_quantargs.qmin, in_quantargs.qmax, 256, dtype=np.int8)
+    ]
@@ -17,10 +17,14 @@
 from executorch.backends.arm.passes.convert_split_to_slice import (
     ConvertSplitToSlicePass,
 )
+from executorch.backends.arm.passes.decompose_div_pass import DecomposeDivPass
 from executorch.backends.arm.passes.meandim_to_averagepool_pass import (
     ConvertMeanDimToAveragePool,
 )
 from executorch.backends.arm.passes.remove_clone_pass import RemoveClonePass
+from executorch.backends.arm.passes.scalars_to_attribute_pass import (
+    ScalarsToAttributePass,
+)
 from executorch.backends.arm.passes.size_adjust_conv2d_pass import SizeAdjustConv2DPass
 from executorch.exir import ExportedProgram
 from executorch.exir.backend.compile_spec_schema import CompileSpec
@@ -40,6 +44,7 @@ def transform_to_backend_pipeline(
         self.add_pass(RemoveClonePass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
         self.add_pass(ConvertMeanDimToAveragePool())
+        self.add_pass(DecomposeDivPass())
         self.add_pass(ConvertSplitToSlicePass())
         for spec in compile_spec:
             if spec.key == "permute_memory_format":
@@ -48,3 +53,8 @@ def transform_to_backend_pipeline(
                     self.add_pass(AnnotateChannelsLastDimOrder())
 
         return self._transform(exported_program.graph_module)
+
+    def transform_for_annotation_pipeline(self, graph_module: torch.fx.GraphModule):
+        self.add_pass(DecomposeDivPass())  # div(a, b) -> mul(a, reciprocal(b))
+        self.add_pass(ScalarsToAttributePass())
+        return self._transform(graph_module)
@@ -0,0 +1,66 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional
+
+import torch
+
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch._ops import OpOverload
+
+
+def create_node(
+    graph: torch.fx.Graph,
+    op_target: OpOverload,
+    args: tuple = (),
+    kwargs: Optional[dict] = None,
+    quantize: bool = False,
+    q_params: Optional[tuple] = None,
+):
+    """
+    Adds a call_function node to 'graph'. graph.inserting_before/after() should be used before the call to decide where to insert the node.
+    If quantize is true and q_params is non-empty, a q dq pair is inserted after the new node and the dq node is returned instead of it.
+    """
+
+    node = graph.create_node(
+        "call_function",
+        op_target,
+        args=args,
+        kwargs=kwargs or {},
+    )
+    if quantize and q_params:
+        return insert_q_dq_pair(graph, node, q_params)
+    return node
+
+
+def insert_q_dq_pair(
+    graph: torch.fx.Graph,
+    anchor: torch.fx.Node,
+    q_params: tuple,
+) -> torch.fx.Node:
+    """
+    Inserts a q dq node pair after the node 'anchor' and returns the dq node.
+    All users of 'anchor' are rerouted to dq; 'q_params' follow the tensor in the q/dq args.
+    """
+    with graph.inserting_after(anchor):
+        q = create_node(
+            graph=graph,
+            op_target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(),  # We add the argument last
+        )
+        q.meta = anchor.meta.copy()  # do not alias anchor's meta dict
+    with graph.inserting_after(q):
+        dq = create_node(
+            graph=graph,
+            op_target=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(q,) + q_params,
+        )
+        dq.meta = q.meta.copy()  # do not alias q's meta dict
+    anchor.replace_all_uses_with(dq)
+    # We add this last so the replace all uses above does not replace the quantized
+    # node's first use
+    q.args = (anchor,) + q_params
+    return dq
@@ -8,11 +8,9 @@
 
 from typing import cast
 
-import torch.fx
 from executorch.backends.arm.tosa_mapping import extract_tensor_meta
 from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.pass_base import ExportPass, PassResult
-from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
+from executorch.exir.pass_base import ExportPass
 
 
 class ConvertExpandCopyToRepeatPass(ExportPass):
@@ -22,42 +20,26 @@ class ConvertExpandCopyToRepeatPass(ExportPass):
 
     expand_copy = exir_ops.edge.aten.expand_copy.default
     repeat = exir_ops.edge.aten.repeat.default
-    patterns = [{expand_copy: 1}]
 
-    def call(self, graph_module: torch.fx.GraphModule):
-        graph = graph_module.graph
-        partitions = get_source_partitions(
-            graph, [torch.expand_copy, torch.Tensor.expand, "expand"]
+    def call_operator(self, op, args, kwargs, meta):
+        if op != self.expand_copy:
+            return super().call_operator(op, args, kwargs, meta)
+
+        _, shape, _ = extract_tensor_meta(meta.data)
+        multiples = cast(list[int], args[1])
+        expanded_rank = len(multiples)
+
+        # Expanded shape is 'shape' front-padded with ones.
+        padding = expanded_rank - len(shape)
+        extended_shape = [
+            shape[i] if i >= 0 else 1 for i in range(-padding, len(shape))
+        ]
+
+        # To convert expand arg to repeat arg, non-repeated dims should have
+        # multiples[dim] = 1.
+        multiples = [
+            multiples[i] if extended_shape[i] == 1 else 1 for i in range(expanded_rank)
+        ]
+        return super().call_operator(
+            op=self.repeat, args=(args[0], multiples), kwargs=kwargs, meta=meta
         )
-        for _, src_partitions in partitions.items():
-            for src_partition in src_partitions:
-                assert len(src_partition.nodes) == 1
-
-                expand_node = src_partition.nodes[0]
-                _, shape, _ = extract_tensor_meta(expand_node.all_input_nodes[0].meta)
-                multiples = cast(tuple[int], expand_node.args[1])
-                expanded_rank = len(multiples)
-
-                # Expanded shape is 'shape' front-padded with ones.
-                padding = expanded_rank - len(shape)
-                extended_shape = [
-                    shape[i] if i >= 0 else 1 for i in range(-padding, len(shape))
-                ]
-
-                # To convert expand arg to repeat arg, non-repeated dims should have
-                # multiples[dim] = 1.
-                multiples = [
-                    multiples[i] if extended_shape[i] == 1 else 1
-                    for i in range(expanded_rank)
-                ]
-                args = (expand_node.args[0], multiples)
-
-                with graph_module.graph.inserting_before(expand_node):
-                    repeat_node = graph.create_node("call_function", self.repeat, args)
-                    repeat_node.meta = expand_node.meta
-                    for user in expand_node.users.copy():
-                        user.replace_input_with(expand_node, repeat_node)
-
-        graph.eliminate_dead_code()
-        graph_module.recompile()
-        return PassResult(graph_module, True)
@@ -7,6 +7,7 @@
 # pyre-unsafe
 
 import torch.fx
+from executorch.backends.arm.passes.arm_pass_utils import create_node
 from executorch.backends.arm.tosa_mapping import extract_tensor_meta
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
@@ -55,18 +56,18 @@ def call(self, graph_module: torch.fx.GraphModule):
                 start = end
 
             # Output nodes are of type getitem
-            # Create one slice node for each output node with matching argumetns.
+            # Replace them with one slice node for each output node.
             with graph_module.graph.inserting_before(split_node):
                 for output_node in output_nodes:
                     index = output_node.args[1]
-                    slice_node = graph.create_node(
-                        "call_function",
+                    slice_node = create_node(
+                        graph,
                         self.slice,
                         (input_node, dim, starts[index], ends[index]),
                     )
                     slice_node.meta = split_node.meta.copy()
                     slice_node.meta["val"] = slice_node.meta["val"][index]
-                    output_node.replace_input_with(split_node, slice_node)
+                    output_node.replace_all_uses_with(slice_node)
         graph.eliminate_dead_code()
         graph_module.recompile()
         return PassResult(graph_module, True)
@@ -0,0 +1,45 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+
+def get_div_decomposition(op) -> tuple:
+    """
+    Returns the (reciprocal_op, mul_op) pair matching the dialect of 'op': edge ops for
+    exir_ops.edge.aten.div.Tensor, aten ops for torch.ops.aten.div.Tensor.
+    """
+    if op == exir_ops.edge.aten.div.Tensor:
+        return (exir_ops.edge.aten.reciprocal.default, exir_ops.edge.aten.mul.Tensor)
+    if op == torch.ops.aten.div.Tensor:
+        return (torch.ops.aten.reciprocal.default, torch.ops.aten.mul.Tensor)
+    raise RuntimeError(f"Can't get div decomposition for op {op}")
+
+
+class DecomposeDivPass(ExportPass):
+    """
+    This pass decomposes div.Tensor (edge or aten dialect) into a reciprocal and a mul node.
+
+    Example:
+        y = div(a,b)
+    Becomes:
+        x = reciprocal(b)
+        y = mul(a,x)
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in (exir_ops.edge.aten.div.Tensor, torch.ops.aten.div.Tensor):
+            return super().call_operator(op, args, kwargs, meta)
+
+        reciprocal_op, mul_op = get_div_decomposition(op)
+
+        numerator = args[0]
+        denominator = args[1]
+        reciprocal = super().call_operator(reciprocal_op, (denominator,), {}, meta)
+        # Both new nodes are created with the meta of the original div.
+        return super().call_operator(mul_op, (numerator, reciprocal), {}, meta)
Original file line number	Diff line number	Diff line change
`@@ -15,12 +15,12 @@`
`15`	`15`	`)`
`16`	`16`
`17`	`17`	`from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer`
`18`		`-from torch._export import capture_pre_autograd_graph`
`19`	`18`	`from torch.ao.quantization.quantize_pt2e import (`
`20`	`19`	`convert_pt2e,`
`21`	`20`	`prepare_pt2e,`
`22`	`21`	`prepare_qat_pt2e,`
`23`	`22`	`)`
	`23`	`+from torch.export import export_for_training`
`24`	`24`
`25`	`25`
`26`	`26`	`class TestCoreMLQuantizer:`
`@@ -32,7 +32,7 @@ def quantize_and_compare(`
`32`	`32`	`) -> None:`
`33`	`33`	`assert quantization_type in {"PTQ", "QAT"}`
`34`	`34`
`35`		`- pre_autograd_aten_dialect = capture_pre_autograd_graph(model, example_inputs)`
	`35`	`+ pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()`
`36`	`36`
`37`	`37`	`quantization_config = LinearQuantizerConfig.from_dict(`
`38`	`38`	`{`