Commit 3082691

Merge branch 'main' into vulkan-unused-lib
2 parents: 1dcc9a5 + 319e88d, commit 3082691

23 files changed: +493, -149 lines

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -608,7 +608,7 @@ endif()
 # any backends.
 #
 add_library(executorch ${_executorch__srcs})
-target_link_libraries(executorch PUBLIC executorch_core)
+target_link_libraries(executorch PRIVATE executorch_core)
 target_include_directories(executorch PUBLIC ${_common_include_directories})
 target_compile_definitions(executorch PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
 target_compile_options(executorch PUBLIC ${_common_compile_options})

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -57,4 +57,5 @@
 from .size_adjust_conv2d_pass import SizeAdjustConv2DPass  # noqa
 from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass  # noqa
 from .unsqueeze_scalar_placeholders_pass import UnsqueezeScalarPlaceholdersPass  # noqa
+from .replace_inf_values_pass import ReplaceInfValues  # noqa  # usort: skip
 from .arm_pass_manager import ArmPassManager  # noqa  # usort: skip

backends/arm/_passes/annotate_decomposed_matmul.py

Lines changed: 9 additions & 5 deletions
@@ -70,17 +70,14 @@ def call(self, graph_module: GraphModule) -> PassResult:
             if quantized_input:
                 matmul_args = matmul_node.all_input_nodes
                 for node in matmul_args:
+                    # Find the dq-node connected to this mm/bmm arg
                     input_node = self._match_partition_to_node(
                         node, partition.input_nodes
                     )
-
-                    # Remove partition input dq-node
-                    input_node.replace_all_uses_with(input_node.all_input_nodes[0])
-                    graph_module.graph.erase_node(input_node)
                     input_node_qargs = QuantArgs.from_operator(
                         input_node.target, input_node.args
                     )
-
+                    # Insert new dq-node just before the mm/bmm with input_node's qparams
                     with graph_module.graph.inserting_before(matmul_node):
                         # Create new dq-node before matmul
                         dq_node = create_node(
@@ -90,6 +87,13 @@ def call(self, graph_module: GraphModule) -> PassResult:
                         dq_node.args = (node, *input_node_qargs)
                         matmul_node.replace_input_with(node, dq_node)

+                for partition_input in partition.input_nodes:
+                    # Remove partition input dq-node
+                    partition_input.replace_all_uses_with(
+                        partition_input.all_input_nodes[0]
+                    )
+                    graph_module.graph.erase_node(partition_input)
+
             partition_output = list(partition.output_nodes[0].users)[0]
             quantized_output = partition_output.target == q_op
             if quantized_output:
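
Note: the change above defers erasing the partition-input dq-nodes until every mm/bmm argument has been rewired, so a dq-node shared by more than one argument is not erased while it still has users. A minimal standalone sketch of the same torch.fx idiom (the toy module and the "clone" stand-in node are illustrative, not part of this commit):

import torch
from torch.fx import symbolic_trace

class Toy(torch.nn.Module):
    def forward(self, x):
        y = x.clone()          # stand-in for a partition-input dq-node
        return torch.bmm(y, y)

gm = symbolic_trace(Toy())
for node in list(gm.graph.nodes):
    if node.op == "call_method" and node.target == "clone":
        # Rewire every user to the node's input first, then erase the node.
        node.replace_all_uses_with(node.all_input_nodes[0])
        gm.graph.erase_node(node)
gm.recompile()
print(gm.code)  # forward now calls torch.bmm(x, x)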

backends/arm/_passes/arm_pass_manager.py

Lines changed: 2 additions & 0 deletions
@@ -49,6 +49,7 @@
     MatchWhereSelfDtypePass,
     QuantizeOperatorArguments,
     RemoveClonePass,
+    ReplaceInfValues,
     ReplaceScalarWithTensorArgPassTOSABI,
     ReplaceScalarWithTensorArgPassTOSAMI,
     RetraceFoldedDtypesPass,
@@ -216,4 +217,5 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeSoftmaxPass())

         self.add_pass(ConvertMinMaxPass())
+        self.add_pass(ReplaceInfValues())
         return self._transform(graph_module)
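
ReplaceInfValues is appended at the end of the annotation pipeline so that inf/-inf values are already gone when the quantizer runs. A rough sketch of how such an FX pass slots into a pass pipeline, using the generic torch.fx PassManager as an analogue for ArmPassManager (whose internals are not shown in this diff); ReplaceInfValues is the class added below, and the toy module is illustrative only:

import torch
from torch.fx import symbolic_trace
from torch.fx.passes.infra.pass_manager import PassManager

# Assumes executorch at this commit, where the import is re-exported from _passes.
from executorch.backends.arm._passes import ReplaceInfValues

class Mask(torch.nn.Module):
    def forward(self, scores):
        return scores + float("-inf")  # literal that the pass rewrites to -255

gm = symbolic_trace(Mask())
pm = PassManager(passes=[ReplaceInfValues()])
print(pm(gm).graph_module.code)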

backends/arm/_passes/convert_split_to_slice.py

Lines changed: 6 additions & 5 deletions
@@ -1,14 +1,15 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
-# All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

 # pyre-unsafe

 import torch.fx
-from executorch.backends.arm._passes.arm_pass_utils import create_node
-from executorch.backends.arm.tosa_mapping import extract_tensor_meta
+from executorch.backends.arm._passes.arm_pass_utils import (
+    create_node,
+    get_first_fake_tensor,
+)
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult

@@ -34,7 +35,7 @@ def call(self, graph_module: torch.fx.GraphModule):
             split_node = node
             input_node = split_node.all_input_nodes[0]
             output_nodes = split_node.users.copy()
-            _, shape, _ = extract_tensor_meta(input_node.meta)
+            shape = get_first_fake_tensor(input_node).shape
             rank = len(shape)
             split_lengths = split_node.args[1]
             dim = split_node.args[2] if len(split_node.args) > 2 else 0
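
The split-to-slice pass now reads the shape from the node's FakeTensor metadata (via get_first_fake_tensor) rather than from the TOSA tensor-meta tuple. A small sketch of where that shape lives on an exported graph (module and tensor sizes are illustrative):

import torch
from torch.export import export

class Split(torch.nn.Module):
    def forward(self, x):
        return torch.split(x, 2, dim=0)

ep = export(Split(), (torch.rand(6, 4),))
for node in ep.graph.nodes:
    if node.op == "placeholder":
        # node.meta["val"] is a FakeTensor; its .shape is the static shape the pass needs.
        print(node.name, node.meta["val"].shape)  # e.g. x torch.Size([6, 4])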

backends/arm/_passes/replace_inf_values_pass.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This pass is based on backends/qualcomm/_passes/replace_inf_values.py
+# with some modification to replaced inf values.
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+class ReplaceInfValues(ExportPass):
+    """
+    Due to limitation in Quantizer, we need to change inf/-inf to more quantizable values.
+    """
+
+    def __init__(self):
+        super(ReplaceInfValues, self).__init__()
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        modified = False
+        for buf_name, tensor in graph_module.named_buffers():
+            if tensor.is_floating_point():
+                modified = True
+                # 255 here is mainly for attention_mask in Llama for reasonable quant scale
+                tensor[tensor == float("inf")] = 255
+                tensor[tensor == float("-inf")] = -255
+                setattr(graph_module, buf_name, tensor)
+
+        for node in graph_module.graph.nodes:
+            arg_list = list(node.args)
+            for index, arg in enumerate(arg_list):
+                if arg == float("-inf"):
+                    modified = True
+                    arg_list[index] = -255
+                elif arg == float("inf"):
+                    modified = True
+                    arg_list[index] = +255
+            node.args = tuple(arg_list)
+
+        if modified:
+            graph_module.recompile()
+        return PassResult(graph_module, modified)
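
A quick way to see the pass in action outside the Arm pipeline is to run its call() directly on a traced module. The toy attention-mask module below is illustrative only (not part of this commit); it exercises the buffer path, which is the attention_mask case the 255 comment refers to:

import torch
from torch.fx import symbolic_trace

# Re-exported by this commit via backends/arm/_passes/__init__.py
from executorch.backends.arm._passes import ReplaceInfValues

class Attention(torch.nn.Module):
    def __init__(self):
        super().__init__()
        mask = torch.triu(torch.full((4, 4), float("-inf")), diagonal=1)
        self.register_buffer("mask", mask)

    def forward(self, scores):
        return scores + self.mask

gm = symbolic_trace(Attention())
result = ReplaceInfValues().call(gm)
print(result.modified)  # True
print(gm.mask)          # -inf entries in the buffer are now -255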

backends/arm/operator_support/slice_copy_support.py

Lines changed: 2 additions & 3 deletions
@@ -12,7 +12,6 @@
     SupportedTOSAOperatorCheck,
 )
 from executorch.backends.arm.tosa_specification import TosaSpecification
-from executorch.backends.arm.tosa_utils import getNodeArgs
 from executorch.exir.dialects._ops import ops as exir_ops

 logger = logging.getLogger(__name__)
@@ -33,8 +32,8 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification) ->
         if tosa_spec not in self.tosa_specs:
             return False

-        inputs = getNodeArgs(node)
-        if len(inputs) == 5 and (step := inputs[4].number) != 1:
+        args = node.args
+        if len(args) == 5 and (step := args[4]) != 1:
             logging.warning(f"{node.target} with step size of {step} not supported.")
             return False
         return True
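
The support check now reads the step directly from node.args instead of going through getNodeArgs: for the slice op, the fifth positional argument, when present, is the step, and only a step of 1 is supported. A small sketch of what those args look like on an exported graph (module and shapes are illustrative):

import torch
from torch.export import export

class StridedSlice(torch.nn.Module):
    def forward(self, x):
        return x[:, 1:9:2]  # step of 2, which the check above rejects

ep = export(StridedSlice(), (torch.rand(2, 10),))
for node in ep.graph.nodes:
    if node.op == "call_function" and "slice" in str(node.target):
        print(node.target, node.args)  # last arg is the step when it is explicit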

backends/arm/operators/op_rescale.py

Lines changed: 7 additions & 7 deletions
@@ -13,7 +13,7 @@
     NodeVisitor,
     register_node_visitor,
 )
-from executorch.backends.arm.tosa_mapping import map_dtype, TosaArg
+from executorch.backends.arm.tosa_mapping import TosaArg
 from executorch.backends.arm.tosa_quant_utils import create_const_ops_for_rescale

 from executorch.backends.arm.tosa_specification import TosaSpecification
@@ -35,15 +35,15 @@ def define_node(
     ) -> None:
         import tosa_tools.v0_80.serializer.tosa_serializer as ts  # type: ignore

-        input_dtype = inputs[0].dtype
+        input_dtype = node.all_input_nodes[0].meta["val"].dtype
         output_dtype = cast(torch.dtype, node.args[1])
         scale = cast(float, node.args[2])
         input_zp = cast(int, node.args[3])
         output_zp = cast(int, node.args[4])

-        if input_dtype != map_dtype(torch.int8) and input_zp != 0:
+        if input_dtype != torch.int8 and input_zp != 0:
             raise ValueError(
-                f"If input dtype is not int8, input_zp must be 0. Got input_dtype{ts.DTypeNames[input_dtype]}, {input_zp=}"
+                f"If input dtype is not int8, input_zp must be 0. Got input_dtype{input_dtype=}, {input_zp=}"
             )
         if output_dtype != torch.int8 and output_zp != 0:
             raise ValueError(
@@ -91,15 +91,15 @@ def define_node(
         import serializer.tosa_serializer as ts  # type: ignore
         from tosa.RoundingMode import RoundingMode  # type: ignore

-        input_dtype = inputs[0].dtype
+        input_dtype = node.all_input_nodes[0].meta["val"].dtype
         output_dtype = cast(torch.dtype, node.args[1])
         scale = cast(float, node.args[2])
         input_zp = cast(int, node.args[3])
         output_zp = cast(int, node.args[4])

-        if input_dtype != map_dtype(torch.int8) and input_zp != 0:
+        if input_dtype != torch.int8 and input_zp != 0:
             raise ValueError(
-                f"If input dtype is not int8, input_zp must be 0. Got input_dtype{ts.DTypeNames[input_dtype]}, {input_zp=}"
+                f"If input dtype is not int8, input_zp must be 0. Got input_dtype{input_dtype=}, {input_zp=}"
             )
         if output_dtype != torch.int8 and output_zp != 0:
             raise ValueError(
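
With the input dtype now taken from the node's FakeTensor metadata, the zero-point validation compares plain torch dtypes. A distilled restatement of that constraint as a standalone helper (the function name is illustrative, not part of the commit):

import torch

def check_rescale_zero_points(input_dtype, output_dtype, input_zp, output_zp):
    # Only int8 tensors may carry a non-zero zero-point on a rescale.
    if input_dtype != torch.int8 and input_zp != 0:
        raise ValueError(f"If input dtype is not int8, input_zp must be 0. Got {input_dtype=}, {input_zp=}")
    if output_dtype != torch.int8 and output_zp != 0:
        raise ValueError(f"If output dtype is not int8, output_zp must be 0. Got {output_dtype=}, {output_zp=}")

check_rescale_zero_points(torch.int32, torch.int8, 0, -128)    # OK
# check_rescale_zero_points(torch.int8, torch.int32, -128, 7)  # would raise ValueError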

backends/arm/process_node.py

Lines changed: 12 additions & 8 deletions
@@ -36,11 +36,11 @@ def process_call_function(
     tosa_spec: TosaSpecification,
 ):
     # Unpack arguments and convert
-    inputs = getNodeArgs(node)
+    inputs = getNodeArgs(node, tosa_spec)

     # Convert output (this node itself)
     try:
-        output = TosaArg(node)
+        output = TosaArg(node, tosa_spec)
     except ValueError as e:
         raise ValueError(
             f"Failed processing call_function: {node.name}. "
@@ -78,7 +78,7 @@ def process_inputs(
             f"Expected dim_order: {tuple(range(meta.dim()))}, but got: {meta.dim_order()} for node {node.name}"
         )
     try:
-        tosa_arg = TosaArg(node)
+        tosa_arg = TosaArg(node, tosa_spec)
     except ValueError as e:
         raise ValueError(
             f"Failed processing input placeholder: {node.name}. "
@@ -112,7 +112,7 @@ def process_inputs_to_parameters(
 ):
     """Serialize bias and non-quantized weights"""
     try:
-        tosa_arg = TosaArg(node)
+        tosa_arg = TosaArg(node, tosa_spec)
     except ValueError as e:
         raise ValueError(
             f"Failed processing parameter placeholder: {node.name}. "
@@ -137,10 +137,11 @@ def process_inputs_to_buffers(
     node: torch.fx.Node,
     tosa_graph: Any,
     edge_program: ExportedProgram,
+    tosa_spec: TosaSpecification,
 ):
     """Serialize quantized weights"""
     try:
-        tosa_arg = TosaArg(node)
+        tosa_arg = TosaArg(node, tosa_spec)
     except ValueError as e:
         raise ValueError(
             f"Failed processing buffer placeholder: {node.name}. "
@@ -165,9 +166,10 @@ def process_inputs_to_lifted_tensor_constants(
     node: torch.fx.Node,
     tosa_graph: Any,
     edge_program: ExportedProgram,
+    tosa_spec: TosaSpecification,
 ):
     try:
-        tosa_arg = TosaArg(node)
+        tosa_arg = TosaArg(node, tosa_spec)
     except ValueError as e:
         raise ValueError(
             f"Failed processing lifted tensor constant placeholder: {node.name}. "
@@ -196,9 +198,11 @@ def process_placeholder(
     elif is_param(edge_program, node):
         process_inputs_to_parameters(node, tosa_graph, edge_program, tosa_spec)
     elif is_buffer(edge_program, node):
-        process_inputs_to_buffers(node, tosa_graph, edge_program)
+        process_inputs_to_buffers(node, tosa_graph, edge_program, tosa_spec)
     elif is_lifted_tensor_constant(edge_program, node):
-        process_inputs_to_lifted_tensor_constants(node, tosa_graph, edge_program)
+        process_inputs_to_lifted_tensor_constants(
+            node, tosa_graph, edge_program, tosa_spec
+        )
     elif node.name in edge_program.graph_signature.inputs_to_lifted_custom_objs:
         raise NotImplementedError(
             "Placeholder is of type 'lifted custom object' which is not supported."

backends/arm/quantizer/quantization_annotator.py

Lines changed: 4 additions & 0 deletions
@@ -411,6 +411,9 @@ def any_or_hardtanh_min_zero(n: Node):
         shared_qspec = SharedQuantizationSpec(node.args[0])
         quant_properties.quant_inputs = [_QuantProperty(0, shared_qspec)]  # type: ignore[arg-type]
         quant_properties.quant_output = _QuantProperty(0, shared_qspec)  # type: ignore[arg-type]
+    elif node.target in [torch.ops.aten.scalar_tensor.default]:
+        quant_properties.quant_inputs = []
+        quant_properties.quant_output = _QuantProperty(0, output_act_qspec)
     else:
         return None

@@ -458,5 +461,6 @@ def annotate_graph(  # type: ignore[return]
         if node.target in [
             torch.ops.aten.full_like.default,
             torch.ops.aten.full.default,
+            torch.ops.aten.scalar_tensor.default,
         ]:
             node.kwargs = {}
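
scalar_tensor creates a 0-d tensor from a Python scalar, so there are no tensor inputs to annotate; only its output gets a quantization spec, and, like full/full_like, its kwargs are cleared before annotation. A one-line illustration of the op itself:

import torch

t = torch.ops.aten.scalar_tensor.default(1.0)
print(t, t.shape, t.dtype)  # tensor(1.) torch.Size([]) torch.float32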

backends/arm/test/models/test_llama.py

Lines changed: 2 additions & 3 deletions
@@ -105,7 +105,6 @@ def test_llama_tosa_MI(self):
                 )
             )

-    @pytest.mark.xfail(reason="KeyError: scalar_tensor_1 (MLETORCH-907)")
     def test_llama_tosa_BI(self):
         llama_model, llama_inputs, llama_meta = self.prepare_model()

@@ -126,7 +125,7 @@ def test_llama_tosa_BI(self):
                 .to_executorch()
                 .run_method_and_compare_outputs(
                     inputs=llama_inputs,
-                    atol=4.3,
-                    rtol=1.1,  # TODO: Tolerance needs to be updated after MLETORCH-907
+                    atol=9.9,
+                    rtol=1.5,  # TODO: Tolerance needs to be updated after MLETORCH-907
                 )
             )

backends/arm/test/ops/test_bmm.py

Lines changed: 0 additions & 19 deletions
@@ -32,15 +32,6 @@ class BMM(torch.nn.Module):
         def forward(self, x, y):
             return torch.bmm(x, y)

-    class MatMul(torch.nn.Module):
-        test_data_generators = [
-            lambda: (torch.rand(2, 3, 5), torch.rand(2, 5, 2)),
-            lambda: (torch.rand(1, 2, 3, 5), torch.rand(1, 2, 5, 2)),
-        ]
-
-        def forward(self, x, y):
-            return torch.matmul(x, y)
-
     class BMMSingleInput(torch.nn.Module):
         test_data_generators = [
             lambda: (torch.rand(20, 3, 3),),
@@ -129,16 +120,6 @@ def test_bmm_single_input_tosa_MI(self, test_data_generator: Callable[[], Tuple]
         test_data = test_data_generator()
         self._test_bmm_tosa_MI_pipeline(self.BMMSingleInput(), test_data)

-    @parameterized.expand(MatMul.test_data_generators)
-    def test_matmul_tosa_MI(self, test_data_generator: Callable[[], Tuple]):
-        test_data = test_data_generator()
-        self._test_bmm_tosa_MI_pipeline(self.MatMul(), test_data)
-
-    @parameterized.expand(MatMul.test_data_generators)
-    def test_matmul_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
-        test_data = test_data_generator()
-        self._test_bmm_tosa_BI_pipeline(self.MatMul(), test_data)
-
     @parameterized.expand(BMM.test_data_generators)
     def test_bmm_tosa_BI(self, test_data_generator: Callable[[], Tuple]):
         test_data = test_data_generator()
