Skip to content

Commit 03c52d7

Browse files
authored
Merge branch 'main' into main
2 parents d3b7f18 + 334af4a commit 03c52d7

File tree

77 files changed

+1769
-490
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

77 files changed

+1769
-490
lines changed

.ci/scripts/gather_benchmark_configs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
"samsung_galaxy_s24": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db",
2525
"google_pixel_8_pro": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a",
2626
"google_pixel_3_private_rooted": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d",
27+
"apple_iphone_15_private": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/55929353-2f28-4ee5-bdff-d1a95f58cb28",
2728
}
2829

2930
# Predefined benchmark configurations
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
name: apple-perf (private devices)
2+
3+
on:
4+
# TODO (huydhn): Disable the schedule run until we land the change to add device pool and device name
5+
# to separate between public and private iOS devices
6+
# schedule:
7+
# - cron: 0 0,4,8,12,16,20 * * *
8+
pull_request:
9+
paths:
10+
- .github/workflows/apple-perf-private-device-experiment.yml
11+
# push:
12+
# branches:
13+
# - main
14+
# paths:
15+
# - .github/workflows/apple-perf-private-device-experiment.yml
16+
# Note: GitHub has an upper limit of 10 inputs
17+
workflow_dispatch:
18+
inputs:
19+
models:
20+
description: Models to be benchmarked
21+
required: false
22+
type: string
23+
default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
24+
devices:
25+
description: Target devices to run benchmark
26+
required: false
27+
type: string
28+
default: apple_iphone_15_private
29+
benchmark_configs:
30+
description: The list of configs used the benchmark
31+
required: false
32+
type: string
33+
workflow_call:
34+
inputs:
35+
models:
36+
description: Models to be benchmarked
37+
required: false
38+
type: string
39+
default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
40+
devices:
41+
description: Target devices to run benchmark
42+
required: false
43+
type: string
44+
default: apple_iphone_15_private
45+
benchmark_configs:
46+
description: The list of configs used the benchmark
47+
required: false
48+
type: string
49+
50+
concurrency:
51+
group: apple-perf-private-devices-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
52+
cancel-in-progress: true
53+
54+
jobs:
55+
apple:
56+
uses: ./.github/workflows/apple-perf.yml
57+
secrets: inherit
58+
permissions:
59+
id-token: write
60+
contents: read
61+
with:
62+
models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }}
63+
devices: apple_iphone_15_private
64+
benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/pull.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ jobs:
399399
size=${arr[4]}
400400
# threshold=48120 on devserver with gcc11.4
401401
# todo(lfq): update once binary size is below 50kb.
402-
threshold="51504"
402+
threshold="51408"
403403
if [[ "$size" -le "$threshold" ]]; then
404404
echo "Success $size <= $threshold"
405405
else
@@ -436,7 +436,7 @@ jobs:
436436
size=${arr[4]}
437437
# threshold=48120 on devserver with gcc11.4
438438
# todo(lfq): update once binary size is below 50kb.
439-
threshold="51784"
439+
threshold="47552"
440440
if [[ "$size" -le "$threshold" ]]; then
441441
echo "Success $size <= $threshold"
442442
else

backends/cadence/aot/TARGETS

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66

77
load("@fbcode_macros//build_defs:export_files.bzl", "export_file")
88
load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
9+
load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
910
load(
1011
"@fbsource//tools/build_defs:default_platform_defs.bzl",
1112
"CXX",
1213
)
1314
load("@fbsource//xplat/executorch/codegen:codegen.bzl", "executorch_generated_lib")
14-
load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
1515

1616
oncall("odai_jarvis")
1717

@@ -36,18 +36,18 @@ python_library(
3636
"compiler.py",
3737
],
3838
deps = [
39-
":passes",
40-
":utils",
39+
":memory_planning",
4140
":ops_registrations",
41+
":passes",
4242
":replace_ops",
43-
":memory_planning",
43+
":utils",
4444
"//caffe2:torch",
4545
"//executorch/backends/cadence/aot/quantizer:fusion_pass",
4646
"//executorch/backends/cadence/aot/quantizer:quantizer",
4747
"//executorch/backends/transforms:decompose_sdpa",
4848
"//executorch/backends/transforms:remove_clone_ops",
49-
"//executorch/exir:lib",
5049
"//executorch/devtools:lib",
50+
"//executorch/exir:lib",
5151
],
5252
)
5353

@@ -57,19 +57,19 @@ python_library(
5757
"export_example.py",
5858
],
5959
deps = [
60-
":passes",
61-
":utils",
6260
":ops_registrations",
61+
":passes",
6362
":replace_ops",
63+
":utils",
6464
"//caffe2:torch",
6565
"//executorch/backends/cadence/aot/quantizer:fusion_pass",
66-
"//executorch/backends/cadence/runtime:runtime",
6766
"//executorch/backends/cadence/aot/quantizer:quantizer",
68-
"//executorch/backends/xnnpack/quantizer:xnnpack_quantizer",
67+
"//executorch/backends/cadence/runtime:runtime",
6968
"//executorch/backends/transforms:decompose_sdpa",
7069
"//executorch/backends/transforms:remove_clone_ops",
71-
"//executorch/exir:lib",
70+
"//executorch/backends/xnnpack/quantizer:xnnpack_quantizer",
7271
"//executorch/devtools:lib",
72+
"//executorch/exir:lib",
7373
],
7474
)
7575

@@ -94,12 +94,12 @@ python_library(
9494
"passes.py",
9595
],
9696
deps = [
97-
":utils",
9897
":fuse_ops",
99-
":simplify_ops",
100-
":replace_ops",
101-
":reorder_ops",
10298
":remove_ops",
99+
":reorder_ops",
100+
":replace_ops",
101+
":simplify_ops",
102+
":utils",
103103
"//caffe2:torch",
104104
"//executorch/exir:pass_base",
105105
"//executorch/exir/dialects:lib",
@@ -131,7 +131,6 @@ python_library(
131131
],
132132
)
133133

134-
135134
export_file(name = "functions.yaml")
136135

137136
executorch_generated_lib(
@@ -191,9 +190,9 @@ python_library(
191190
],
192191
typing = True,
193192
deps = [
194-
"//caffe2:torch",
195-
":ops_registrations",
196193
":compiler_utils",
194+
":ops_registrations",
195+
"//caffe2:torch",
197196
"//executorch/backends/cadence/aot:pass_utils",
198197
"//executorch/backends/cadence/aot:utils",
199198
"//executorch/exir:pass_base",
@@ -228,11 +227,11 @@ python_library(
228227
"//caffe2:torch",
229228
"//executorch/backends/cadence/aot:pass_utils",
230229
"//executorch/backends/cadence/aot:simplify_ops",
230+
"//executorch/backends/transforms:remove_clone_ops",
231231
"//executorch/exir:pass_base",
232232
"//executorch/exir/dialects:lib",
233233
"//executorch/exir/dialects/edge:lib",
234234
"//executorch/exir/passes:spec_prop_pass",
235-
"//executorch/backends/transforms:remove_clone_ops"
236235
],
237236
)
238237

@@ -283,13 +282,13 @@ python_unittest(
283282
],
284283
typing = True,
285284
deps = [
285+
":ops_registrations",
286286
"//caffe2:torch",
287287
"//executorch/backends/cadence/aot:graph_builder",
288288
"//executorch/backends/cadence/aot:pass_utils",
289289
"//executorch/exir:pass_base",
290290
"//executorch/exir/dialects:lib",
291291
"//later:lib",
292-
":ops_registrations"
293292
],
294293
)
295294

@@ -319,8 +318,10 @@ python_unittest(
319318
srcs = [
320319
"tests/test_fusion_ops_passes.py",
321320
],
321+
supports_static_listing = False,
322322
typing = True,
323323
deps = [
324+
"fbsource//third-party/pypi/parameterized:parameterized",
324325
":compiler",
325326
"//caffe2:torch",
326327
"//executorch/backends/cadence/aot:compiler",
@@ -391,7 +392,6 @@ python_unittest(
391392
],
392393
)
393394

394-
395395
python_library(
396396
name = "memory_planning",
397397
srcs = [
@@ -409,7 +409,6 @@ python_library(
409409
],
410410
)
411411

412-
413412
python_library(
414413
name = "memory_constraints",
415414
srcs = [
@@ -425,7 +424,6 @@ python_library(
425424
],
426425
)
427426

428-
429427
python_unittest(
430428
name = "test_memory_passes",
431429
srcs = [

backends/cadence/aot/fuse_ops.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -901,9 +901,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
901901
@register_cadence_pass(CadencePassAttribute(opt_level=1))
902902
class FuseTransposeOpPairsPass(FuseOpPairsAcrossBranchesPass):
903903
"""
904-
Fuse dequantize-quantize op pairs to a single requantize op.
905-
For the special case where quant params match, this will remove
906-
both dequant and quant ops.
904+
Fuse transpose op pairs to a single view op.
907905
"""
908906

909907
# A list of ops that can be bypassed when looking for a
@@ -915,6 +913,7 @@ class FuseTransposeOpPairsPass(FuseOpPairsAcrossBranchesPass):
915913
exir_ops.edge.cadence.dequantize_per_tensor.default,
916914
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
917915
exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
916+
exir_ops.edge.cadence.quantized_relu.per_tensor,
918917
}
919918

920919
def can_fuse_for_chain(

backends/cadence/aot/replace_ops.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2259,6 +2259,34 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
22592259
return result
22602260

22612261

2262+
2263+
@register_cadence_pass(CadencePassAttribute(opt_level=1))
2264+
class ReplacePowWithMullPass(ExportPass):
2265+
"""
2266+
Replace the pow op with degree 2 for a mul op.
2267+
"""
2268+
2269+
def call_operator(
2270+
self,
2271+
op,
2272+
args: Tuple[Argument, ...],
2273+
kwargs: Dict[str, Argument],
2274+
meta: NodeMetadata,
2275+
) -> ProxyValue:
2276+
# TODO(eigen): Add support for other degrees.
2277+
if op not in {
2278+
exir_ops.edge.aten.pow.Scalar,
2279+
} or args[0] != 2:
2280+
return super().call_operator(op, args, kwargs, meta)
2281+
2282+
return super().call_operator(
2283+
exir_ops.edge.aten.mul.Tensor,
2284+
(args[1], args[1]),
2285+
{},
2286+
meta,
2287+
)
2288+
2289+
22622290
# This class encapsulates all the functions that replace/switch one op in the
22632291
# graph with another.
22642292
class CadenceReplaceOpsInGraph:
@@ -2299,4 +2327,5 @@ class CadenceReplaceOpsInGraph:
22992327
ReplaceWhereWithFullArgsWithWhereScalar,
23002328
ReplaceGeluWithApproximateGeluPass,
23012329
ReplaceSplitWithSlicePass,
2330+
ReplacePowWithMullPass,
23022331
]

backends/cadence/aot/tests/test_fusion_ops_passes.py

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
from executorch.backends.cadence.aot.pass_utils import count_node, op_counts_match
2424
from executorch.exir.dialects._ops import ops as exir_ops
2525
from executorch.exir.dialects.edge._ops import EdgeOpOverload
26+
from executorch.exir.pass_base import ProxyValue
27+
from parameterized import parameterized
2628
from torch import nn
2729

2830

@@ -485,39 +487,60 @@ def test_fuse_then_transpose_pass(self):
485487

486488

487489
class TestFuseTransposeOpPairsPass(TestFusionPassesBase):
488-
def test_fuse_transpose_pairs(self):
490+
def _create_operator(
491+
self, builder: GraphBuilder, op: torch._ops.OpOverload, x: ProxyValue
492+
) -> ProxyValue:
493+
if op == exir_ops.edge.quantized_decomposed.quantize_per_tensor.default:
494+
return builder.call_operator(
495+
op=op,
496+
args=(x, 1.2, 3, 0, 127, torch.int8),
497+
)
498+
elif op == exir_ops.edge.cadence.quantized_relu.per_tensor:
499+
return builder.call_operator(
500+
op=op,
501+
args=(x, 0, 0, 0, 0),
502+
)
503+
else:
504+
raise ValueError(f"Unsupported op: {op}")
505+
506+
@parameterized.expand(
507+
[
508+
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
509+
exir_ops.edge.cadence.quantized_relu.per_tensor,
510+
],
511+
)
512+
def test_fuse_transpose_pairs(self, op: torch._ops.OpOverload):
489513
# Create a graph with transpose -> quant -> transpose.
490514
builder = GraphBuilder()
491515
x = builder.placeholder("x", torch.randn(2, 3))
492516
transpose_node = builder.call_operator(
493517
op=exir_ops.edge.aten.transpose_copy.int,
494518
args=(x, 0, 1),
495519
)
496-
quant_node = builder.call_operator(
497-
op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
498-
args=(transpose_node, 1.2, 3, 0, 127, torch.int8),
499-
)
520+
quant_node = self._create_operator(builder, op, transpose_node)
500521
transpose_node = builder.call_operator(
501522
op=exir_ops.edge.aten.transpose_copy.int,
502523
args=(quant_node, 0, 1),
503524
)
504-
builder.output(transpose_node)
525+
builder.output([transpose_node])
505526
gm = builder.get_graph_module()
506527
self.check_op_counts(
507528
gm,
508529
expected_op_counts={
509530
exir_ops.edge.aten.transpose_copy.int: 2,
510-
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1,
531+
op: 1,
511532
},
512533
)
513534

514535
# Check that the pass fuses the two transpose ops.
515-
gm_after_pass = FuseTransposeOpPairsPass()(gm).graph_module
536+
fusion_pass_result = FuseTransposeOpPairsPass()(gm)
537+
self.assertIsNotNone(fusion_pass_result)
538+
gm_after_pass = fusion_pass_result.graph_module
516539
self.check_op_counts(
517540
gm_after_pass,
518541
expected_op_counts={
519542
exir_ops.edge.aten.transpose_copy.int: 0,
520-
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1,
543+
op: 1,
521544
},
522545
)
523546

0 commit comments

Comments
 (0)