Skip to content

Commit c82c5c2

Browse files
authored
Merge branch 'pytorch:main' into toupstream/select_op
2 parents 1323c7c + e2526cc commit c82c5c2

File tree

87 files changed

+3706
-819
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

87 files changed

+3706
-819
lines changed

.github/scripts/propose_ghstack_orig_pr.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ def parse_args():
2626
required=True,
2727
)
2828
parser.add_argument(
29-
"--pr",
30-
type=int,
31-
help="Number of the PR in the stack to check and create corresponding PR",
29+
"--ref",
30+
type=str,
31+
help="Ref of the PR in the stack to check and create corresponding PR",
3232
required=True,
3333
)
3434
return parser.parse_args()
@@ -68,12 +68,18 @@ def extract_stack_from_body(pr_body: str) -> List[int]:
6868
return list(reversed(prs))
6969

7070

71-
def get_pr_stack_from_number(pr_number: int, repo: Repository) -> List[int]:
71+
def get_pr_stack_from_number(ref: str, repo: Repository) -> List[int]:
72+
if ref.isnumeric():
73+
pr_number = int(ref)
74+
else:
75+
branch_name = ref.replace("refs/heads/", "")
76+
pr_number = repo.get_branch(branch_name).commit.get_pulls()[0].number
77+
7278
pr_stack = extract_stack_from_body(repo.get_pull(pr_number).body)
7379

7480
if not pr_stack:
7581
raise Exception(
76-
f"Could not find PR stack in body of #{pr_number}. "
82+
f"Could not find PR stack in body of ref. "
7783
+ "Please make sure that the PR was created with ghstack."
7884
)
7985

@@ -100,14 +106,15 @@ def create_prs_for_orig_branch(pr_stack: List[int], repo: Repository):
100106
ghstack PR base: https://github.com/pytorch/executorch/tree/{pr.base.ref}
101107
ghstack PR head: https://github.com/pytorch/executorch/tree/{pr.head.ref}
102108
Merge bot PR base: https://github.com/pytorch/executorch/tree/{orig_branch_merge_base}
103-
Merge bot PR head: https://github.com/pytorch/executorch/tree/{orig_branch_merge_head}"""
109+
Merge bot PR head: https://github.com/pytorch/executorch/tree/{orig_branch_merge_head}
110+
@diff-train-skip-merge"""
104111

105112
existing_orig_pr = repo.get_pulls(
106113
head="pytorch:" + orig_branch_merge_head,
107114
base=orig_branch_merge_base,
108-
state="open",
115+
state="all",
109116
)
110-
if existing_orig_pr.totalCount > 0:
117+
if existing_orig_pr.totalCount > 0 and existing_orig_pr[0].title == pr.title:
111118
print(
112119
f"PR for {orig_branch_merge_head} already exists {existing_orig_pr[0]}"
113120
)
@@ -128,7 +135,7 @@ def main():
128135

129136
with Github(auth=Auth.Token(os.environ["GITHUB_TOKEN"])) as gh:
130137
repo = gh.get_repo(args.repo)
131-
create_prs_for_orig_branch(get_pr_stack_from_number(args.pr, repo), repo)
138+
create_prs_for_orig_branch(get_pr_stack_from_number(args.ref, repo), repo)
132139

133140

134141
if __name__ == "__main__":

.github/workflows/android.yml renamed to .github/workflows/_android.yml

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,9 @@
11
name: Android
22

33
on:
4-
push:
5-
branches:
6-
- main
7-
- release/*
8-
tags:
9-
- ciflow/android/*
10-
pull_request:
11-
paths:
12-
- .ci/docker/**
13-
- .github/workflows/android.yml
14-
- build/*android*.sh
15-
- install_requirements.sh
16-
- examples/demo-apps/android/**
17-
- extension/android/**
18-
- extension/benchmark/android/**
19-
- extension/module/**
4+
workflow_call:
205
workflow_dispatch:
216

22-
concurrency:
23-
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
24-
cancel-in-progress: true
25-
267
jobs:
278
build-llm-demo:
289
name: build-llm-demo

.github/workflows/ghstack_land.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ on:
1515
- 'gh/mcr229/[0-9]+/base'
1616
- 'gh/swolchok/[0-9]+/base'
1717
- 'gh/SS-JIA/[0-9]+/base'
18+
- 'gh/trivedivivek/[0-9]+/base'
1819

1920
jobs:
2021
ghstack_merge_to_main:
@@ -32,9 +33,7 @@ jobs:
3233
run: |
3334
pip install pygithub
3435
35-
PR_NUMBER=$(echo "$GITHUB_REF" | grep -oE '[0-9]+')
36-
37-
python .github/scripts/propose_ghstack_orig_pr.py --pr $PR_NUMBER --repo pytorch/executorch
36+
python .github/scripts/propose_ghstack_orig_pr.py --ref $GITHUB_REF --repo pytorch/executorch
3837
env:
3938
GITHUB_TOKEN: ${{ secrets.GH_PYTORCHBOT_CHERRY_PICK_TOKEN }}
4039
GITHUB_REF: ${{ github.ref }}

.github/workflows/pull.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,9 @@ jobs:
347347
exit 1
348348
fi
349349
350+
android:
351+
uses: ./.github/workflows/_android.yml
352+
350353
unittest:
351354
uses: ./.github/workflows/_unittest.yml
352355
with:

backends/arm/_passes/annotate_channels_last_dim_order_pass.py

Lines changed: 75 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,20 +9,50 @@
99
from typing import cast
1010

1111
import torch
12-
from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
12+
from executorch.backends.arm._passes.arm_pass_utils import (
13+
create_node,
14+
get_first_fake_tensor,
15+
)
1316
from executorch.backends.arm.tosa_quant_utils import dq_op
1417
from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d
18+
from executorch.exir.dialects._ops import ops as exir_ops
1519
from executorch.exir.pass_base import ExportPass, PassResult
20+
from torch.library import impl, Library
21+
22+
# Define lib with passthrough operators. The operators have no real meaning in edge IR
23+
# except for argument validation and a passthrough output. The operators will be used
24+
# when lowering to TOSA, e.g. a passthrough_to_tosa._transpose will not affect
25+
# the edge IR graph but will be lowered to a TOSA-TRANSPOSE.
26+
lib = Library("passthrough_to_tosa", "DEF")
27+
# For operators that change the rank of the input, such as unsqueeze and squeeze, we may need
28+
# to switch dim_order before the operation. Changing tosa_dim_order is not sufficient
29+
# as we also need to transpose the data into the correct data format.
30+
# By utilizing an edge IR passthrough operator we can keep the edge program in
31+
# channels-first/contiguous and get the desired behavior in the TOSA lowering.
32+
lib.define("_transpose(Tensor self, int[] dim_order) -> Tensor")
33+
34+
35+
@impl(lib, "_transpose")
36+
def _transpose_impl(*args, **kwargs):
37+
# Validate length of dim_order array
38+
dim = args[1]
39+
assert len(dim) <= 4
40+
# Pass-through in edge-IR
41+
return args[0]
1642

1743

1844
class AnnotateChannelsLastDimOrder(ExportPass):
1945
"""
2046
Annotates each node with a tosa_dim_order. tosa_dim_order can be seen as a channels-last dim-order
21-
that in most cases will be (0, 2, 3, 1) for nodes with 4D-shapes.
22-
The annotated tosa_dim_order is used to permute the node's shape such that it
23-
gives a TOSA-compliant shape.
47+
that in most cases will be (0, 2, 3, 1) for nodes with 4D-shapes. The pass also inserts passthrough_to_tosa._transpose
48+
when a transition between 3D and 4D tensors happens.
49+
The annotated tosa_dim_order is used to permute the node's shape such that it gives a TOSA-compliant shape.
2450
"""
2551

52+
NHWC_order = (0, 2, 3, 1)
53+
NHWC_inverse_order = (0, 3, 1, 2)
54+
HWCM_order = (2, 3, 0, 1)
55+
2656
def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node):
2757
"""
2858
returns True for dq and w in the following sequences;
@@ -49,20 +79,56 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node):
4979

5080
return False
5181

82+
def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule):
83+
for node in graph_module.graph.nodes:
84+
if node.op != "call_function":
85+
continue
86+
if node.target == exir_ops.edge.aten.squeeze_copy.dims:
87+
input_node = node.args[0]
88+
if input_node.meta["val"].dim() == 4:
89+
with graph_module.graph.inserting_before(node):
90+
permute_node = create_node(
91+
graph_module.graph,
92+
torch.ops.passthrough_to_tosa._transpose,
93+
args=(input_node, list(self.NHWC_inverse_order)),
94+
)
95+
permute_node.meta["tosa_dim_order"] = tuple(
96+
range(len(input_node.meta["val"].size()))
97+
)
98+
node.replace_input_with(input_node, permute_node)
99+
100+
if node.target == exir_ops.edge.aten.unsqueeze_copy.default:
101+
if node.meta["val"].dim() == 4:
102+
with graph_module.graph.inserting_after(node):
103+
permute_node = create_node(
104+
graph_module.graph,
105+
torch.ops.passthrough_to_tosa._transpose,
106+
args=(node, list(self.NHWC_order)),
107+
)
108+
permute_node.meta["tosa_dim_order"] = self.NHWC_order
109+
node.meta["tosa_dim_order"] = (0, 1, 2, 3)
110+
users = [user for user in node.users if user != permute_node]
111+
for user in users:
112+
user.replace_input_with(node, permute_node)
113+
52114
def call(self, graph_module: torch.fx.GraphModule):
53-
NHWC_Order = (0, 2, 3, 1)
54-
HWCM_Order = (2, 3, 0, 1)
55115
for node in graph_module.graph.nodes:
56116
node_data = get_first_fake_tensor(node).data
57117

58-
if len(node_data.shape) == 4:
59-
dim_order = NHWC_Order
118+
if node_data.dim() == 4:
119+
dim_order = self.NHWC_order
60120
if self.is_weight_node_for_depthwise_conv2d(node):
61121
# The weights of TOSA DEPTHWISE_CONV2D have shape (H, W, C, M) which corresponds to
62122
# dim_order = (2, 3, 0, 1) (https://www.mlplatform.org/tosa/tosa_spec.html#_depthwise_conv2d).
63-
dim_order = HWCM_Order
123+
dim_order = self.HWCM_order
64124
else:
65125
dim_order = tuple(range(node_data.dim()))
66126
node.meta["tosa_dim_order"] = dim_order
127+
# Take care of cases when:
128+
# 4D (NHWC) -> 3D (NCH)
129+
# 3D (NCH) -> 4D (NHWC)
130+
self.insert_tosa_transposes(graph_module)
67131
graph_module.recompile()
132+
graph_module = super().call(graph_module).graph_module
133+
68134
return PassResult(graph_module, True)

backends/arm/_passes/arm_pass_manager.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@
1919
ConvertSplitToSlicePass,
2020
)
2121
from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass
22+
from executorch.backends.arm._passes.decompose_layernorm_pass import (
23+
DecomposeLayerNormPass,
24+
)
25+
from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass
26+
from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass
2227
from executorch.backends.arm._passes.insert_squeeze_after_sum_pass import (
2328
InsertSqueezeAfterSumPass,
2429
)
@@ -53,7 +58,10 @@ def transform_to_backend_pipeline(
5358
self.add_pass(SizeAdjustConv2DPass())
5459
self.add_pass(RemoveClonePass())
5560
self.add_pass(ConvertExpandCopyToRepeatPass())
61+
self.add_pass(DecomposeLayerNormPass())
62+
self.add_pass(DecomposeVarPass())
5663
self.add_pass(ConvertMeanDimToAveragePool())
64+
self.add_pass(DecomposeMeanDimPass())
5765
self.add_pass(MatchArgRanksPass(exported_program))
5866
self.add_pass(DecomposeDivPass())
5967
self.add_pass(InsertSqueezeAfterSumPass())
@@ -67,6 +75,9 @@ def transform_to_backend_pipeline(
6775
return self._transform(exported_program.graph_module)
6876

6977
def transform_for_annotation_pipeline(self, graph_module: torch.fx.GraphModule):
78+
self.add_pass(DecomposeLayerNormPass())
79+
self.add_pass(DecomposeVarPass())
80+
self.add_pass(DecomposeMeanDimPass())
7081
self.add_pass(ScalarsToAttributePass())
7182
self.add_pass(DecomposeDivPass())
7283
return self._transform(graph_module)

0 commit comments

Comments
 (0)