
Commit 601a487

Update on "[4/N] Add backend options map"
This is to manage the backend <-> BackendOptions map. Users create the backend options map, and the ET runtime reads each backend name and dispatches that backend's list of options to it.

exported-using-ghexport

Differential Revision: [D76149466](https://our.internmc.facebook.com/intern/diff/D76149466/)

[ghstack-poisoned]
2 parents 390aed6 + 667b39a commit 601a487
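For orientation, a minimal sketch of the dispatch flow the commit message describes. It is illustrative only: the real map lives in the C++ runtime, and names such as set_option are hypothetical, not the actual ExecuTorch API.

# Hypothetical sketch of the backend <-> BackendOptions dispatch described
# above; the option keys and the set_option method are illustrative.
backend_options_map = {
    "XnnpackBackend": [("num_threads", 4)],
    "QnnBackend": [("power_mode", "burst")],
}

def dispatch_backend_options(registered_backends, options_map):
    # The runtime reads each backend name from the map and hands that
    # backend its own list of options.
    for backend_name, options in options_map.items():
        backend = registered_backends.get(backend_name)
        if backend is not None:
            for key, value in options:
                backend.set_option(key, value)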

File tree

91 files changed: +3137 additions, −1157 deletions

.ci/scripts/wheel/pre_build_script.sh

Lines changed: 1 addition & 1 deletion
@@ -14,4 +14,4 @@ set -euxo pipefail
 # which does install them. Though we'd need to disable build isolation to be
 # able to see the installed torch package.

-"${GITHUB_WORKSPACE}/${REPOSITORY}/install_requirements.sh"
+"${GITHUB_WORKSPACE}/${REPOSITORY}/install_requirements.sh" --example

CODEOWNERS

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@
 /extension/flat_tensor @lucylq
 /extension/gguf_util @larryliu0820
 /extension/kernel_util @kimishpatel @manuelcandales @swolchok
-/extension/llm @jackzhxng @larryliu0820 @swolchok
+/extension/llm @jackzhxng @larryliu0820 @swolchok @mergennachin
 /extension/memory_allocator @JacobSzwejbka @swolchok
 /extension/module @shoumikhin
 /extension/parallel @kimishpatel @swolchok

backends/arm/_passes/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -20,10 +20,12 @@
 from .convert_split_to_slice import ConvertSplitToSlicePass  # noqa
 from .convert_squeezes_to_view import ConvertSqueezesToViewPass  # noqa
 from .convert_to_clamp import ConvertToClampPass  # noqa
+from .decompose_avg_pool2d import DecomposeAvgPool2d  # noqa
 from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
 from .decompose_embedding_pass import DecomposeEmbeddingPass  # noqa
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
+from .decompose_grouped_conv import DecomposeGroupedConv  # noqa
 from .decompose_groupnorm_pass import DecomposeGroupNormPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
@@ -32,6 +34,7 @@
 from .decompose_maxpool2d_with_dilation import DecomposeMaxPool2DPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa
+from .decompose_round_pass import DecomposeRoundPass  # noqa
 from .decompose_select import DecomposeSelectPass  # noqa
 from .decompose_silu_pass import DecomposeSiluPass  # noqa
 from .decompose_softmax_pass import DecomposeSoftmaxPass  # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 10 additions & 1 deletion
@@ -23,10 +23,12 @@
     ConvertSplitToSlicePass,
     ConvertSqueezesToViewPass,
     ConvertToClampPass,
+    DecomposeAvgPool2d,
     DecomposeCosineSimilarityPass,
     DecomposeDivPass,
     DecomposeEmbeddingPass,
     DecomposeGeluPass,
+    DecomposeGroupedConv,
     DecomposeGroupNormPass,
     DecomposeLayerNormPass,
     DecomposeLeakyReLUPass,
@@ -35,6 +37,7 @@
     DecomposeMaxPool2DPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
+    DecomposeRoundPass,
     DecomposeSelectPass,
     DecomposeSiluPass,
     DecomposeSoftmaxPass,
@@ -63,7 +66,6 @@
     UnsqueezeBeforeRepeatPass,
     UnsqueezeScalarPlaceholdersPass,
 )
-
 from executorch.backends.arm.tosa_specification import (
     TosaLoweringContext,
     TosaSpecification,
@@ -115,8 +117,10 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         if self.tosa_spec.is_U55_subset:
             self.add_pass(BroadcastArgsPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(DecomposeAvgPool2d())
         self.add_pass(ComputeConstantOpsAOT(exported_program))

+        self.add_pass(DecomposeGroupedConv())
         self.add_pass(RemoveClonePass())
         self.add_pass(SizeAdjustConv2DPass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
@@ -139,6 +143,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         return self._transform(exported_program.graph_module)

     def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
+        self.add_pass(DecomposeRoundPass())
         self.add_pass(DecomposeSqrtPass())
         self.add_pass(ConvertIntPowToMuls())
         self.add_pass(ReplaceScalarWithTensorArgPassTOSAMI())
@@ -172,8 +177,10 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
+        self.add_pass(DecomposeAvgPool2d())
         self.add_pass(ComputeConstantOpsAOT(exported_program))

+        self.add_pass(DecomposeGroupedConv())
         self.add_pass(RemoveClonePass())
         self.add_pass(SizeAdjustConv2DPass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
@@ -219,6 +226,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(InsertCastForOpsWithInt64InputPass())
         self.add_pass(DecomposeEmbeddingPass())
         self.add_pass(DecomposeScaledDotProductAttention())
+        self.add_pass(DecomposeRoundPass())
         self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
         self.add_pass(ScalarsToAttributePass())
         self.add_pass(DecomposeGroupNormPass())
@@ -232,6 +240,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeSqrtPass())
         self.add_pass(DecomposeSiluPass())
+        self.add_pass(DecomposeAvgPool2d())

         if self.tosa_spec.is_U55_subset:
             # Numerically stable softmax uses amax which is not supported on Ethos-U55
backends/arm/_passes/decompose_avg_pool2d.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import torch
+from executorch.backends.arm.operators.operator_validation_utils import (
+    adjust_pooling_pad_if_needed,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+edge_avgpool_ops = (exir_ops.edge.aten.avg_pool2d.default,)
+aten_avgpool_ops = (torch.ops.aten.avg_pool2d.default,)
+
+
+def get_decomposition(op) -> tuple:
+    if op in edge_avgpool_ops:
+        return (
+            exir_ops.edge.aten.full.default,
+            exir_ops.edge.aten.cat.default,
+            exir_ops.edge.aten.avg_pool2d.default,
+            exir_ops.edge.aten.mul.Tensor,
+        )
+    if op in aten_avgpool_ops:
+        return (
+            torch.ops.aten.full.default,
+            torch.ops.aten.cat.default,
+            torch.ops.aten.avg_pool2d.default,
+            torch.ops.aten.mul.Tensor,
+        )
+    raise RuntimeError(f"Can't get avg_pool2d decomposition for op {op}")
+
+
+class DecomposeAvgPool2d(ExportPass):
+    """Decomposes avg_pool2d variants that TOSA does not support directly:
+    padding is made explicit (full + cat), the pooling then runs without
+    padding, and divisor_override is rewritten as a multiplication by
+    kernel_size / divisor_override after the pooling."""
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in (edge_avgpool_ops + aten_avgpool_ops):
+            return super().call_operator(op, args, kwargs, meta)
+
+        full_op, cat_op, avgpool_op, mul_op = get_decomposition(op)
+
+        x = args[0]
+        kernel_h, kernel_w = args[1]
+        kernel_size = kernel_h * kernel_w
+        stride_h, stride_w = args[2]
+        pad_h, pad_w = new_pad_h, new_pad_w = args[3] if len(args) > 3 else (0, 0)
+        ceil_mode = args[4] if len(args) > 4 else False
+        count_include_pad = args[5] if len(args) > 5 else True
+        divisor_override = args[6] if len(args) > 6 else None
+
+        n, c, h, w = x.data.shape
+        post_pad_w, post_pad_h = (0, 0)
+
+        # count_include_pad == False means that we use a different divisor for
+        # edge elements. When divisor_override is set, this is overridden
+        # anyway. It is easier to replace a constant divisor, so set
+        # count_include_pad == True.
+        if divisor_override is not None:
+            count_include_pad = True
+
+        # Add width padding manually if count_include_pad
+        if count_include_pad and pad_w > 0:
+            pre_pad_shape = [n, c, h, pad_w]
+            pre_pad = super().call_operator(full_op, (pre_pad_shape, 0.0), kwargs, meta)
+
+            if ceil_mode and divisor_override is None:
+                post_pad_w = pad_w
+            else:
+                post_pad_w = adjust_pooling_pad_if_needed(
+                    w, kernel_w, stride_w, pad_w, ceil_mode
+                )
+
+            if post_pad_w > 0:
+                post_pad_shape = [n, c, h, post_pad_w]
+                post_pad = super().call_operator(
+                    full_op, (post_pad_shape, 0.0), kwargs, meta
+                )
+                cat_nodes = [pre_pad, x, post_pad]
+            else:
+                cat_nodes = [pre_pad, x]
+
+            x = super().call_operator(cat_op, (cat_nodes, 3), kwargs, meta)
+            new_pad_w = 0
+
+        # Add height padding manually if count_include_pad
+        if count_include_pad and pad_h > 0:
+            pre_pad_shape = [n, c, pad_h, w + pad_w + post_pad_w]
+            pre_pad = super().call_operator(full_op, (pre_pad_shape, 0.0), kwargs, meta)
+
+            if ceil_mode and divisor_override is None:
+                post_pad_h = pad_h
+            else:
+                post_pad_h = adjust_pooling_pad_if_needed(
+                    h, kernel_h, stride_h, pad_h, ceil_mode
+                )
+
+            if post_pad_h > 0:
+                post_pad_shape = [n, c, post_pad_h, w + pad_w + post_pad_w]
+                post_pad = super().call_operator(
+                    full_op, (post_pad_shape, 0.0), kwargs, meta
+                )
+                cat_nodes = [pre_pad, x, post_pad]
+            else:
+                cat_nodes = [pre_pad, x]
+
+            x = super().call_operator(cat_op, (cat_nodes, 2), kwargs, meta)
+            new_pad_h = 0
+
+        avgpool_args = (x, args[1], args[2], [new_pad_h, new_pad_w], ceil_mode, False)
+        x = super().call_operator(avgpool_op, avgpool_args, kwargs, meta)
+
+        # Multiply by kernel_size / divisor_override if a divisor_override is given
+        if divisor_override is not None and divisor_override != kernel_size:
+            override_multiplier = super().call_operator(
+                full_op, ([1, 1, 1, 1], kernel_size / divisor_override), kwargs, meta
+            )
+            x = super().call_operator(mul_op, (x, override_multiplier), kwargs, meta)
+
+        return x
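The rewrite above can be sanity-checked numerically in plain PyTorch. The following is a standalone sketch (not code from the commit): pad explicitly with zeros, pool without padding, then scale by kernel_size / divisor_override.

import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8)
kernel, stride, pad, divisor = (3, 3), (2, 2), (1, 1), 5

# Reference: avg_pool2d with implicit zero padding and an overridden divisor.
expected = F.avg_pool2d(x, kernel, stride, pad, ceil_mode=False,
                        count_include_pad=True, divisor_override=divisor)

# Decomposed form: explicit zero padding (the full + cat steps), pooling with
# no padding, then a multiplication by kernel_size / divisor_override (9 / 5).
x_padded = F.pad(x, (1, 1, 1, 1), value=0.0)
actual = F.avg_pool2d(x_padded, kernel, stride, 0) * (9 / divisor)

torch.testing.assert_close(actual, expected)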
backends/arm/_passes/decompose_grouped_conv.py

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from copy import copy
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+
+class DecomposeGroupedConv(ExportPass):
+    """
+    Splits a grouped convolution, which is not supported by TOSA, into
+    multiple convolutions using slice -> conv -> cat.
+
+    Before pass:
+        x = conv(input, weight, bias, groups=2)
+
+    After pass:
+        input1 = slice(input)
+        weight1 = slice(weight)
+        bias1 = slice(bias)
+        x1 = conv(input1, weight1, bias1)
+
+        input2 = slice(input)
+        weight2 = slice(weight)
+        bias2 = slice(bias)
+        x2 = conv(input2, weight2, bias2)
+
+        x = cat(x1, x2)
+    """
+
+    @staticmethod
+    def _get_decomposition(op):
+        match op:
+            case exir_ops.edge.aten.convolution.default:
+                return (
+                    exir_ops.edge.aten.slice_copy.Tensor,
+                    exir_ops.edge.aten.convolution.default,
+                    exir_ops.edge.aten.cat.default,
+                )
+            case torch.ops.aten.conv2d.default:
+                return (
+                    torch.ops.aten.slice_copy.Tensor,
+                    torch.ops.aten.conv2d.default,
+                    torch.ops.aten.cat.default,
+                )
+            case _:
+                raise RuntimeError("Invalid op for grouped conv decomposition.")
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op == exir_ops.edge.aten.convolution.default:
+            groups = args[8]
+            transposed = args[6]
+        elif op == torch.ops.aten.conv2d.default:
+            groups = args[6]
+            transposed = False
+        else:
+            return super().call_operator(op, args, kwargs, meta)
+
+        if groups == 1 or transposed:
+            return super().call_operator(op, args, kwargs, meta)
+
+        input_node = args[0]
+        if input_node.data.shape[1] == groups:
+            # This is a depthwise convolution which is handled elsewhere
+            return super().call_operator(op, args, kwargs, meta)
+
+        weight_node = args[1]
+        bias_node = args[2]
+
+        input_slice_size = weight_node.data.shape[1]
+        output_slice_size = weight_node.data.shape[0] // groups
+
+        no_q_dq_meta = copy(meta)
+        no_q_dq_meta.data = {}
+
+        slice_op, conv_op, cat_op = DecomposeGroupedConv._get_decomposition(op)
+
+        input_slices = []
+        for i in range(groups):
+            start_index = i * input_slice_size
+            stop_index = (i + 1) * input_slice_size
+            slice_args = (input_node, 1, start_index, stop_index)
+
+            input_slices.append(
+                super().call_operator(slice_op, slice_args, kwargs, no_q_dq_meta)
+            )
+
+        filter_slices = []
+        for i in range(groups):
+            start_index = i * output_slice_size
+            stop_index = (i + 1) * output_slice_size
+            slice_args = (weight_node, 0, start_index, stop_index)
+
+            filter_slices.append(
+                super().call_operator(slice_op, slice_args, kwargs, no_q_dq_meta)
+            )
+
+        bias_slices = []
+        for i in range(groups):
+            if bias_node is None:
+                bias_slices.append(None)
+            else:
+                start_index = i * output_slice_size
+                stop_index = (i + 1) * output_slice_size
+                slice_args = (bias_node, 0, start_index, stop_index)
+
+                bias_slices.append(
+                    super().call_operator(slice_op, slice_args, kwargs, no_q_dq_meta)
+                )
+
+        output_slices = []
+        for input_slice, filter_slice, bias_slice in zip(
+            input_slices, filter_slices, bias_slices
+        ):
+            if op == exir_ops.edge.aten.convolution.default:
+                conv_args = (input_slice, filter_slice, bias_slice, *args[3:8], 1)
+            elif op == torch.ops.aten.conv2d.default:
+                conv_args = (input_slice, filter_slice, bias_slice, *args[3:6], 1)
+            else:
+                raise RuntimeError("Invalid op for grouped conv decomposition.")
+
+            output_slices.append(
+                super().call_operator(conv_op, conv_args, kwargs, meta)
+            )
+
+        cat_args = (output_slices, 1)
+        return super().call_operator(cat_op, cat_args, kwargs, no_q_dq_meta)

backends/arm/_passes/decompose_maxpool2d_with_dilation.py

Lines changed: 4 additions & 6 deletions
@@ -36,6 +36,7 @@ def call_operator(self, op, args, kwargs, meta):
         stride = args[2]
         padding = args[3] if len(args) >= 4 else 0
         dilation = args[4] if len(args) >= 5 else 1
+        ceil_mode = args[5] if len(args) == 6 else False

         # Normalize attributes
         pad_h, pad_w = (padding, padding) if isinstance(padding, int) else padding
@@ -45,12 +46,9 @@ def call_operator(self, op, args, kwargs, meta):
         )
         s_h, s_w = (stride, stride) if isinstance(stride, int) else stride

-        # If no dilation: call EXIR edge op with only supported args (x, kernel, stride[, padding])
+        # If no dilation: call EXIR edge op
         if d_h == 1 and d_w == 1:
-            minimal_args = [x, kernel_size, stride]
-            # only include padding if non-zero
-            if (pad_h, pad_w) != (0, 0):
-                minimal_args.append((pad_h, pad_w))
+            minimal_args = [x, kernel_size, stride, padding, dilation, ceil_mode]
             return super().call_operator(op, tuple(minimal_args), {}, meta)

         # Compute padded and packed dimensions for dilation > 1
@@ -102,7 +100,7 @@ def call_operator(self, op, args, kwargs, meta):
             if is_with_indices
             else exir_ops.edge.aten.max_pool2d.default
         )
-        pool_args = (x2, (k_h, k_w), (s_h, s_w), (0, 0))
+        pool_args = (x2, (k_h, k_w), (s_h, s_w), (0, 0), 1, ceil_mode)
        pool_out = super().call_operator(
             pool_edge_op,
             pool_args,
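The reason ceil_mode now has to be threaded through both call sites is that it changes the pooling output size. A small standalone illustration (not from the commit):

import torch
import torch.nn.functional as F

x = torch.randn(1, 1, 7, 7)
floor_out = F.max_pool2d(x, kernel_size=2, stride=2)                 # floor((7-2)/2)+1 = 3
ceil_out = F.max_pool2d(x, kernel_size=2, stride=2, ceil_mode=True)  # ceil((7-2)/2)+1 = 4
assert floor_out.shape[-2:] == (3, 3)
assert ceil_out.shape[-2:] == (4, 4)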
