Commit be78434

Update
[ghstack-poisoned]

2 parents: ab96fd7 + a01571f

95 files changed: +2951, -469 lines


.ci/scripts/unittest-linux.sh

Lines changed: 1 addition & 1 deletion

@@ -32,7 +32,7 @@ PYTHON_EXECUTABLE=python ./examples/models/llama3_2_vision/install_requirements.
 if [[ "$BUILD_TOOL" == "cmake" ]]; then
   .ci/scripts/unittest-linux-cmake.sh
 elif [[ "$BUILD_TOOL" == "buck2" ]]; then
-  .ci/scripts/unittest-linux-buck2.sh
+  .ci/scripts/unittest-buck2.sh
 else
   echo "Unknown build tool $BUILD_TOOL"
   exit 1

CMakeLists.txt

Lines changed: 8 additions & 0 deletions

@@ -186,6 +186,10 @@ option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension"
        OFF
 )
 
+option(EXECUTORCH_BUILD_EXTENSION_LLM "Build the LLM extension"
+       OFF
+)
+
 option(EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" OFF)
 
 option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension"
@@ -718,6 +722,10 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize)
 endif()
 
+if(EXECUTORCH_BUILD_EXTENSION_LLM)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizer)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_MODULE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()
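Note: the new EXECUTORCH_BUILD_EXTENSION_LLM option defaults to OFF; enabling it at configure time (for example, passing -DEXECUTORCH_BUILD_EXTENSION_LLM=ON to cmake) adds extension/llm/tokenizer to the build.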

backends/apple/coreml/partition/coreml_partitioner.py

Lines changed: 15 additions & 1 deletion

@@ -3,7 +3,7 @@
 # Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
 import logging
-from typing import List, Optional
+from typing import Callable, List, Optional, Tuple
 
 import coremltools as ct
 
@@ -104,3 +104,17 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
         )
+
+    def ops_to_not_decompose(
+        self, ep: ExportedProgram
+    ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+        do_not_decompose = []
+        op_support = OperatorsSupportedForCoreMLBackend()
+        for node in ep.graph.nodes:
+            if (
+                node.op == "call_function"
+                and isinstance(node.target, torch._ops.OpOverload)
+                and op_support.is_node_supported(None, node)
+            ):
+                do_not_decompose.append(node.target)
+        return do_not_decompose, None
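The new ops_to_not_decompose hook is what executorch.exir.to_edge_transform_and_lower consults to keep backend-supported ops intact rather than decomposing them before delegation. A minimal sketch of calling the hook directly (the SDPA module, names, and shapes here are illustrative, not from this commit):

    import torch
    from executorch.backends.apple.coreml.partition import CoreMLPartitioner

    class SDPA(torch.nn.Module):
        def forward(self, q, k, v):
            return torch.nn.functional.scaled_dot_product_attention(q, k, v)

    inputs = tuple(torch.randn(1, 4, 8, 16) for _ in range(3))
    ep = torch.export.export(SDPA(), inputs)
    # Returns the ATen overloads to preserve plus an optional node filter;
    # the filter is None here, meaning "preserve every listed op".
    ops, filter_fn = CoreMLPartitioner().ops_to_not_decompose(ep)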

backends/apple/coreml/test/test_coreml_partitioner.py

Lines changed: 46 additions & 0 deletions

@@ -13,6 +13,7 @@
 
 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 from executorch.backends.apple.coreml.partition import CoreMLPartitioner
+from executorch.exir.backend.utils import format_delegated_graph
 
 
 class TestCoreMLPartitioner(unittest.TestCase):
@@ -79,6 +80,50 @@ def test_vit_skip_conv(self):
             "getitem",
         ]
 
+    def test_ops_to_not_decompose(self):
+        class Model(torch.nn.Module):
+            def forward(self, q, k, v, mask):
+                return torch.ops.aten.scaled_dot_product_attention.default(
+                    q, k, v, attn_mask=mask
+                )
+
+        model = Model()
+        model.eval()
+
+        batch_size = 1
+        n_heads = 12
+        seq_len = 1
+        max_seq_length = 32
+        embedding_dim = 16
+        q = torch.randn(batch_size, n_heads, seq_len, embedding_dim)
+        k = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
+        v = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
+        mask = torch.randn(seq_len, max_seq_length)
+        example_inputs = (q, k, v, mask)
+        ep = torch.export.export(model, example_inputs)
+        coreml_partitioner = CoreMLPartitioner()
+
+        # Using to_edge_transform_and_lower, we expect SDPA will be preserved and show up in delegated graph
+        edge_program_manager = executorch.exir.to_edge_transform_and_lower(
+            ep, partitioner=[coreml_partitioner]
+        )
+        self.assertTrue(
+            "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
+            in format_delegated_graph(
+                edge_program_manager.exported_program().graph_module
+            )
+        )
+
+        # Using to_edge flow, we expect SDPA will be decomposed and not show up in delegated graph
+        edge_program_manager2 = executorch.exir.to_edge(ep)
+        edge_program_manager2.to_backend(coreml_partitioner)
+        self.assertTrue(
+            "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
+            not in format_delegated_graph(
+                edge_program_manager2.exported_program().graph_module
+            )
+        )
+
     def test_buffer(self):
         embedding_dim = 3
         max_seq_len = 2
@@ -129,4 +174,5 @@ def forward(self, q, k_val, input_pos):
 test_runner = TestCoreMLPartitioner()
 test_runner.test_add_sub_skip_mm()
 test_runner.test_vit_skip_conv()
+test_runner.test_ops_to_not_decompose()
 test_runner.test_buffer()

backends/arm/TARGETS

Lines changed: 4 additions & 1 deletion

@@ -4,7 +4,10 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
 python_library(
     name = "arm_partitioner",
     srcs = [
-        "arm_partitioner.py",
+        "ethosu_backend.py",
+        "ethosu_partitioner.py",
+        "tosa_backend.py",
+        "tosa_partitioner.py",
     ],
     typing = True,
     deps = [

backends/arm/_passes/arm_pass_manager.py

Lines changed: 10 additions & 2 deletions

@@ -52,6 +52,7 @@
 from executorch.backends.arm._passes.fuse_quantized_activation_pass import (  # type: ignore[import-not-found]
     FuseQuantizedActivationPass,
 )
+from executorch.backends.arm._passes.insert_rescales_pass import InsertRescalePass
 from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass
 from executorch.backends.arm._passes.keep_dims_false_to_squeeze_pass import (
     KeepDimsFalseToSqueezePass,
@@ -75,6 +76,10 @@
     UnsqueezeScalarPlaceholdersPass,
 )
 from executorch.backends.arm.tosa_specification import TosaSpecification
+
+from executorch.backends.transforms.replace_scalar_with_tensor import (
+    ReplaceScalarWithTensorArgPass,
+)
 from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
 from executorch.exir import ExportedProgram
 from executorch.exir.pass_manager import PassManager
@@ -100,6 +105,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertMeanDimToAveragePoolPass())
         self.add_pass(ConvertFullLikeToFullPass())
 
+        self.add_pass(ReplaceScalarWithTensorArgPass())
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeOperatorArguments())
         self.add_pass(FoldAndAnnotateQParamsPass())  # type: ignore[call-arg]
@@ -119,11 +125,11 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertSqueezesToViewPass())
 
         self.add_pass(AnnotateChannelsLastDimOrder())
-
+        self.add_pass(InsertRescalePass())
         return self._transform(exported_program.graph_module)
 
     def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
-
+        self.add_pass(ReplaceScalarWithTensorArgPass())
         self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
         self.add_pass(ConvertSplitToSlicePass())
@@ -157,6 +163,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertSqueezesToViewPass())
 
         self.add_pass(AnnotateChannelsLastDimOrder())
+        self.add_pass(InsertRescalePass())
 
         return self._transform(exported_program.graph_module)
 
@@ -173,6 +180,7 @@ def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
 
     def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(ScalarsToAttributePass())
+        self.add_pass(ReplaceScalarWithTensorArgPass())
         self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeVarPass())
         self.add_pass(DecomposeMeanDimPass())
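ReplaceScalarWithTensorArgPass now runs in all three pipelines. Judging by its name and its shared location under backends/transforms, it swaps scalar-argument op variants for their tensor-argument counterparts so later stages only see tensor-tensor overloads; a conceptual sketch of that equivalence (not the pass implementation itself):

    import torch

    x = torch.tensor([1.0, 2.0])
    # aten.add.Scalar(x, 2.0) agrees elementwise with aten.add.Tensor applied
    # to a 0-dim tensor holding the same scalar.
    assert torch.equal(
        torch.ops.aten.add.Scalar(x, 2.0),
        torch.ops.aten.add.Tensor(x, torch.full((), 2.0)),
    )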

backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py

Lines changed: 3 additions & 0 deletions

@@ -131,6 +131,9 @@ def call(self, graph_module: GraphModule) -> PassResult:
             n = cast(Node, n)
             if n.op != "call_function":
                 continue
+            # Don't fold chains of quant-ops into each other.
+            if n.target in (q_op, dq_op):
+                continue
 
             # Make sure we haven't already set qparams meta information on the node
             assert "input_qparams" not in n.meta.keys()

backends/arm/_passes/fuse_batchnorm2d_pass.py

Lines changed: 1 addition & 0 deletions

@@ -114,6 +114,7 @@ def try_set_param(
             if not try_set_param(conv_bias_node, fused_conv_bias) and try_set_param(
                 bn_bias_node, fused_conv_bias
             ):
+                # pyre-ignore[60]
                 # Conv didn't have bias but batchnorm did, steal bias from batchnorm.
                 conv_args = (*conv.args[0:2], bn_bias_node, *conv.args[3:])
                 conv.args = conv_args
backends/arm/_passes/insert_rescales_pass.py (new file; path per the new import in arm_pass_manager.py)

Lines changed: 109 additions & 0 deletions

@@ -0,0 +1,109 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from copy import copy
+from typing import cast
+
+import torch
+from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.backends.arm.tosa_quant_utils import dq_op, q_op, QuantArgs
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch import Tensor
+from torch.fx import GraphModule, Node
+from torch.library import custom_op, register_fake
+
+logger = logging.getLogger(__name__)
+
+
+@custom_op("tosa::_rescale", mutates_args=())  # type: ignore[misc]
+def rescale(
+    x: Tensor, dtype: torch.dtype, scale: float, in_zp: int, out_zp: int
+) -> Tensor:
+    logger.warning(
+        "Ran default implementation of tosa::_rescale. "
+        "This op is meant to always be inserted inside a partition and a correct default implementation is not implemented."
+    )
+    # Clone is needed to not return a reference when rescaling to the same dtype.
+    # This is a necessary requirement for non-mutating custom ops.
+    return x.to(dtype=dtype).clone()
+
+
+@register_fake("tosa::_rescale")  # type: ignore[misc]
+def rescale_fake(
+    x: Tensor, dtype: torch.dtype, scale: float, in_zp: int, out_zp: int
+) -> Tensor:
+    """Casts the input tensor to dtype `dtype` to produce the correct tensor meta for a _rescale op.
+    Additionally validates TOSA constraints of a RESCALE op.
+    """
+    if not (dtype == torch.int32 or dtype == torch.int8):
+        raise NotImplementedError(
+            "tosa::rescale currently only supports int32 and int8."
+        )
+    if dtype == torch.int32 and out_zp != 0:
+        raise ValueError(
+            "TOSA requires output_zp to be zero when the output dtype is int32."
+        )
+    if x.dtype == torch.int32 and in_zp != 0:
+        raise ValueError(
+            "TOSA requires input_zp to be zero when the input dtype is int32."
+        )
+    if x.dtype == torch.int8 and not -128 <= in_zp <= 127:
+        raise ValueError(f"{in_zp=} outside valid range (-128,127) for int8.")
+    if dtype == torch.int8 and not -128 <= out_zp <= 127:
+        raise ValueError(f"{out_zp=} outside valid range (-128,127) for int8.")
+
+    return x.to(dtype=dtype).clone()
+
+
+class InsertRescalePass(ExportPass):
+    """Finds patterns of dq -> q, and replaces them
+    with passthrough_to_tosa::rescales.
+
+    Does not guarantee that the dtypes and zero points are valid
+    in TOSA; that is the job of the quantization annotator that
+    produced the dq and q nodes. The TOSA constraints are validated
+    in the fake implementation of passthrough_to_tosa::rescale.
+    """
+
+    def fold_dq_q_to_rescale(self, node: Node, user: Node, graph_module: GraphModule):
+        dq_args = QuantArgs.from_operator(node.target, node.args)
+        q_args = QuantArgs.from_operator(user.target, user.args)
+        new_scale = dq_args.scale / q_args.scale
+
+        with graph_module.graph.inserting_before(node):
+            rescale_node = create_node(
+                graph_module.graph,
+                torch.ops.tosa._rescale.default,
+                (
+                    node.all_input_nodes[0],
+                    q_args.dtype,
+                    new_scale,
+                    dq_args.zp,
+                    q_args.zp,
+                ),
+            )
+            rescale_node.meta = copy(user.meta)
+            user.replace_all_uses_with(rescale_node)
+            graph_module.graph.erase_node(user)
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        modified = False
+        for node in graph_module.graph.nodes:
+            node = cast(Node, node)
+
+            if node.target is not dq_op:
+                continue
+            # Copy users since we remove them while iterating, modifying the node.users list.
+            for user in copy(node.users):
+                if user.target is q_op:
+                    self.fold_dq_q_to_rescale(node, user, graph_module)
+                    modified = True
+            if len(node.users) == 0:
+                graph_module.graph.erase_node(node)
+
+        graph_module = super().call(graph_module).graph_module
+        graph_module.recompile()
+        return PassResult(graph_module, modified)
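A quick numeric check of why folding dq -> q into a single rescale is sound: dequantize followed by requantize composes into one affine step whose scale is scale_dq / scale_q, which is exactly the new_scale the pass hands to tosa::_rescale. A minimal sketch with hypothetical quantization parameters:

    import torch

    x_q = torch.tensor([10, 20, 30], dtype=torch.int8)
    scale_dq, zp_dq = 0.5, 2   # dequantize params (hypothetical)
    scale_q, zp_q = 0.25, 0    # requantize params (hypothetical)

    # Two steps: dequantize to float, then requantize.
    x_f = (x_q.to(torch.int32) - zp_dq) * scale_dq
    two_step = torch.round(x_f / scale_q + zp_q).to(torch.int8)

    # One step: rescale by scale_dq / scale_q between the two zero points.
    new_scale = scale_dq / scale_q
    one_step = torch.round((x_q.to(torch.int32) - zp_dq) * new_scale + zp_q).to(torch.int8)

    assert torch.equal(two_step, one_step)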

backends/arm/_passes/scalars_to_attribute_pass.py

Lines changed: 12 additions & 0 deletions

@@ -76,5 +76,17 @@ def call(self, graph_module: GraphModule) -> PassResult:
                 new_args.append(get_attr_node)
             n.args = tuple(new_args)
 
+            # Replace rsub.Scalar with sub.Tensor as retracing will fail otherwise
+            if n.target == torch.ops.aten.rsub.Scalar:
+                with graph_module.graph.inserting_after(n):
+                    reversed_args = (n.args[1], n.args[0])
+                    sub = graph_module.graph.create_node(
+                        "call_function", torch.ops.aten.sub.Tensor, reversed_args, {}
+                    )
+                    n.replace_all_uses_with(sub)
+                    sub.meta["val"] = n.meta["val"]
+                    graph_module.graph.erase_node(n)
+
         graph_module.recompile()
+        graph_module = super().call(graph_module).graph_module
         return PassResult(graph_module, True)
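The swap is sound because aten.rsub(a, b) computes b - a, so reversing the arguments into aten.sub preserves semantics; a one-line check:

    import torch

    a, b = torch.tensor([1.0, 2.0]), torch.tensor([5.0, 7.0])
    assert torch.equal(torch.rsub(a, b), torch.sub(b, a))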

backends/arm/operator_support/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -6,6 +6,7 @@
 # pyre-unsafe
 
 from . import (  # noqa
+    bitwise_support,
     convolution_support,
     pool_2d_support,
     reduce_sum_support,
backends/arm/operator_support/bitwise_support.py (new file; path per the new import in operator_support/__init__.py)

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.fx as fx
+from executorch.backends.arm.operator_support.tosa_supported_operators import (
+    register_tosa_support_check,
+    SupportedTOSAOperatorCheck,
+)
+from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+@register_tosa_support_check
+class BitwiseSupported(SupportedTOSAOperatorCheck):
+    targets = [
+        exir_ops.edge.aten.bitwise_and.Tensor,
+        exir_ops.edge.aten.bitwise_or.Tensor,
+        exir_ops.edge.aten.bitwise_xor.Tensor,
+    ]
+
+    tosa_specs = [
+        TosaSpecification.create_from_string("TOSA-0.80+BI"),
+        TosaSpecification.create_from_string("TOSA-0.80+MI"),
+    ]
+
+    def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
+        # U55 case, Vela 4.2.0 (25.02 release)
+        if isinstance(tosa_spec, Tosa_0_80) and tosa_spec.is_U55_subset:
+            return False
+
+        return True
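For illustration, a small module whose exported graph exercises the newly supported targets (BitOps is a hypothetical name, not from this commit); after edge conversion its &, |, and ^ become the bitwise_and/or/xor.Tensor overloads listed in targets above, making them candidates for TOSA delegation everywhere except the U55 subset:

    import torch

    class BitOps(torch.nn.Module):
        def forward(self, a, b):
            return a & b, a | b, a ^ b

    a = torch.randint(0, 8, (4,), dtype=torch.int32)
    b = torch.randint(0, 8, (4,), dtype=torch.int32)
    ep = torch.export.export(BitOps(), (a, b))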
