
# Commit 4d1618e

Update base for Update on "[ET-VK] Add coop shader for int8 linear"

Title says it all!

## Changes

* Apply co-operative shader for vector * matrix computations.

Differential Revision: [D73279548](https://our.internmc.facebook.com/intern/diff/D73279548/)

[ghstack-poisoned]

2 parents 191e6c4 + 334af4a, commit 4d1618e

245 files changed, +4375 −1597 lines


.ci/scripts/gather_benchmark_configs.py (1 addition, 0 deletions)

```diff
@@ -24,6 +24,7 @@
     "samsung_galaxy_s24": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db",
     "google_pixel_8_pro": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a",
     "google_pixel_3_private_rooted": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d",
+    "apple_iphone_15_private": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/55929353-2f28-4ee5-bdff-d1a95f58cb28",
 }
 
 # Predefined benchmark configurations
```

.github/workflows/android-release-artifacts.yml (10 additions, 0 deletions)

```diff
@@ -11,6 +11,11 @@ on:
         description: Upload the AAR to maven staging repository
         required: false
         type: boolean
+      flavor:
+        type: choice
+        options:
+          - "xnnpack"
+          - "vulkan+xnnpack"
   schedule:
     - cron: 0 10 * * *
 
@@ -86,6 +91,11 @@ jobs:
           sed -i "s/\(coordinates(\"org.pytorch\", \"executorch-android\", \"\)\([0-9]\+.[0-9]\+.[0-9]\+\)\(\")\)/\1$VERSION\3/" extension/android/executorch_android/build.gradle
           fi
 
+          FLAVOR="${{ inputs.flavor }}"
+          if [[ "$FLAVOR" == "vulkan+xnnpack" ]]; then
+            export EXECUTORCH_BUILD_VULKAN=ON
+          fi
+
           # Build AAR Package
           mkdir aar-out
           export BUILD_AAR_DIR=aar-out
```
.github/workflows/apple-perf-private-device-experiment.yml (64 additions, 0 deletions; new file)

```diff
@@ -0,0 +1,64 @@
+name: apple-perf (private devices)
+
+on:
+  # TODO (huydhn): Disable the schedule run until we land the change to add device pool and device name
+  # to separate between public and private iOS devices
+  # schedule:
+  #   - cron: 0 0,4,8,12,16,20 * * *
+  pull_request:
+    paths:
+      - .github/workflows/apple-perf-private-device-experiment.yml
+  # push:
+  #   branches:
+  #     - main
+  #   paths:
+  #     - .github/workflows/apple-perf-private-device-experiment.yml
+  # Note: GitHub has an upper limit of 10 inputs
+  workflow_dispatch:
+    inputs:
+      models:
+        description: Models to be benchmarked
+        required: false
+        type: string
+        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+      devices:
+        description: Target devices to run benchmark
+        required: false
+        type: string
+        default: apple_iphone_15_private
+      benchmark_configs:
+        description: The list of configs used the benchmark
+        required: false
+        type: string
+  workflow_call:
+    inputs:
+      models:
+        description: Models to be benchmarked
+        required: false
+        type: string
+        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+      devices:
+        description: Target devices to run benchmark
+        required: false
+        type: string
+        default: apple_iphone_15_private
+      benchmark_configs:
+        description: The list of configs used the benchmark
+        required: false
+        type: string
+
+concurrency:
+  group: apple-perf-private-devices-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+jobs:
+  apple:
+    uses: ./.github/workflows/apple-perf.yml
+    secrets: inherit
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }}
+      devices: apple_iphone_15_private
+      benchmark_configs: ${{ inputs.benchmark_configs }}
```

.github/workflows/doc-build.yml (14 additions, 0 deletions)

```diff
@@ -14,6 +14,20 @@ on:
     - cron: '0 0 * * *'
 
 jobs:
+  check-urls:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Check URLs
+        run: bash ./scripts/check_urls.sh
+
+  check-xrefs:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Check Links
+        run: bash ./scripts/check_xrefs.sh
+
   build:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
```

.github/workflows/pull.yml (2 additions, 2 deletions)

```diff
@@ -399,7 +399,7 @@ jobs:
           size=${arr[4]}
           # threshold=48120 on devserver with gcc11.4
           # todo(lfq): update once binary size is below 50kb.
-          threshold="51504"
+          threshold="51408"
           if [[ "$size" -le "$threshold" ]]; then
             echo "Success $size <= $threshold"
           else
@@ -436,7 +436,7 @@ jobs:
           size=${arr[4]}
           # threshold=48120 on devserver with gcc11.4
           # todo(lfq): update once binary size is below 50kb.
-          threshold="51784"
+          threshold="47552"
           if [[ "$size" -le "$threshold" ]]; then
             echo "Success $size <= $threshold"
           else
```
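The pull.yml change above tightens two binary-size gates: the CI script reads the stripped binary size and fails the job when it exceeds a hard threshold. A minimal Python sketch of the same comparison logic (threshold values taken from the diff; the function name is illustrative, not part of the CI script):

```python
# Sketch of the binary-size gate from pull.yml, reimplemented in Python.
# The threshold values come from the diff; the function name is made up.

def check_binary_size(size: int, threshold: int) -> bool:
    """Return True when the binary fits under the CI threshold, mirroring
    the shell check `[[ "$size" -le "$threshold" ]]`."""
    if size <= threshold:
        print(f"Success {size} <= {threshold}")
        return True
    print(f"Fail {size} > {threshold}")
    return False

# The commit lowers the first gate from 51504 to 51408 bytes and the
# second from 51784 to 47552 bytes, so a size at the new limit still
# passes while the old slack no longer does.
assert check_binary_size(51408, 51408)
assert not check_binary_size(51504, 47552)
```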

CONTRIBUTING.md (4 additions, 4 deletions)

```diff
@@ -45,11 +45,11 @@ executorch
 │   └── <a href="devtools/visualization">visualization</a> - Visualization tools for representing model structure and performance metrics.
 ├── <a href="docs">docs</a> - Static docs tooling and documentation source files.
 ├── <a href="examples">examples</a> - Examples of various user flows, such as model export, delegates, and runtime execution.
-├── <a href="exir">exir</a> - Ahead-of-time library: model capture and lowering APIs. EXport Intermediate Representation (EXIR) is a format for representing the result of <a href="https://pytorch.org/docs/stable/export.html">torch.export</a>. This directory contains utilities and passes for lowering the EXIR graphs into different <a href="/docs/source/ir-exir.md">dialects</a> and eventually suitable to run on target hardware.
+├── <a href="exir">exir</a> - Ahead-of-time library: model capture and lowering APIs. EXport Intermediate Representation (EXIR) is a format for representing the result of <a href="https://pytorch.org/docs/stable/export.html">torch.export</a>. This directory contains utilities and passes for lowering the EXIR graphs into different <a href="docs/source/ir-exir.md">dialects</a> and eventually suitable to run on target hardware.
 │   ├── <a href="exir/_serialize">_serialize</a> - Serialize final export artifact.
 │   ├── <a href="exir/backend">backend</a> - Backend delegate ahead of time APIs.
 │   ├── <a href="exir/capture">capture</a> - Program capture.
-│   ├── <a href="exir/dialects">dialects</a> - Op sets for various dialects in the export process. Please refer to the <a href="/docs/source/ir-exir.md">EXIR spec</a> and the <a href="/docs/source/compiler-backend-dialect.md">backend dialect</a> doc for more details.
+│   ├── <a href="exir/dialects">dialects</a> - Op sets for various dialects in the export process. Please refer to the <a href="docs/source/ir-exir.md">EXIR spec</a> and the <a href="docs/source/compiler-backend-dialect.md">backend dialect</a> doc for more details.
 │   ├── <a href="exir/emit">emit</a> - Conversion from ExportedProgram to ExecuTorch execution instructions.
 │   ├── <a href="exir/operator">operator</a> - Operator node manipulation utilities.
 │   ├── <a href="exir/passes">passes</a> - Built-in compiler passes.
@@ -68,7 +68,7 @@ executorch
 │   ├── <a href="extension/memory_allocator">memory_allocator</a> - 1st party memory allocator implementations.
 │   ├── <a href="extension/module">module</a> - A simplified C++ wrapper for the runtime. An abstraction that deserializes and executes an ExecuTorch artifact (.pte file). Refer to the <a href="docs/source/extension-module.md">module documentation</a> for more information.
 │   ├── <a href="extension/parallel">parallel</a> - C++ threadpool integration.
-│   ├── <a href="extension/pybindings">pybindings</a> - Python API for executorch runtime. This is powering up the <a href="docs/source/runtime-python-api-reference.md">runtime Python API</a> for ExecuTorch.
+│   ├── <a href="extension/pybindings">pybindings</a> - Python API for executorch runtime. This is powering up the <a href="docs/source/runtime-python-api-reference.rst">runtime Python API</a> for ExecuTorch.
 │   ├── <a href="extension/pytree">pytree</a> - C++ and Python flattening and unflattening lib for pytrees.
 │   ├── <a href="extension/runner_util">runner_util</a> - Helpers for writing C++ PTE-execution tools.
 │   ├── <a href="extension/tensor">tensor</a> - Tensor maker and <code>TensorPtr</code>, details in <a href="docs/source/extension-tensor.md">this documentation</a>. For how to use <code>TensorPtr</code> and <code>Module</code>, please refer to the <a href="docs/source/using-executorch-cpp.md">"Using ExecuTorch with C++"</a> doc.
@@ -114,7 +114,7 @@ If you're completely new to open-source projects, GitHub, or ExecuTorch, please
 1. If you've changed APIs or added a new tool or feature, [update the
    documentation](#updating-documentation).
 1. If you added an experimental API or deprecated an existing API, follow the
-   [API Life Cycle and Deprecation Policy](/docs/source/api-life-cycle.md).
+   [API Life Cycle and Deprecation Policy](docs/source/api-life-cycle.md).
 1. Make sure your code follows the [style guides](#coding-style) and passes the
    [lint checks](#lintrunner).
 1. If you haven't already, complete the [Contributor License Agreement ("CLA")](#contributor-license-agreement-cla).
```

README-wheel.md (1 addition, 1 deletion)

```diff
@@ -25,6 +25,6 @@ tutorials and documentation. Here are some starting points:
 * [Exporting to ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial)
   * Learn the fundamentals of exporting a PyTorch `nn.Module` to ExecuTorch, and
     optimizing its performance using quantization and hardware delegation.
-* Running LLaMA on [iOS](docs/source/llm/llama-demo-ios) and [Android](docs/source/llm/llama-demo-android) devices.
+* Running LLaMA on [iOS](docs/source/llm/llama-demo-ios.md) and [Android](docs/source/llm/llama-demo-android.md) devices.
   * Build and run LLaMA in a demo mobile app, and learn how to integrate models
     with your own apps.
```

backends/apple/coreml/runtime/test/setup.md (8 additions, 8 deletions)

````diff
@@ -4,18 +4,18 @@ This is a tutorial for setting up tests for the **Core ML** backend.
 
 ## Running tests
 
-1. Follow the instructions described in [Setting Up ExecuTorch](/docs/source/getting-started-setup.md) to set up ExecuTorch environment.
+1. Follow the instructions described in [Setting Up ExecuTorch](../../../../../docs/source/getting-started-setup.rst) to set up ExecuTorch environment.
 
 2. Run `install_requirements.sh` to install dependencies required by the **Core ML** backend.
 
 ```bash
 cd executorch
 
-sh backends/apple/coreml/scripts/install_requirements.sh 
+sh backends/apple/coreml/scripts/install_requirements.sh
 
-``` 
+```
 
-3. Follow the instructions described in [Building with CMake](/docs/source/runtime-build-and-cross-compilation.md#building-with-cmake) to set up CMake build system.
+3. Follow the instructions described in [Building with CMake](../../../../../docs/source/using-executorch-cpp.md#building-with-cmake) to set up CMake build system.
 
 4. Install [Xcode](https://developer.apple.com/xcode/).
 
@@ -26,7 +26,7 @@ sh backends/apple/coreml/scripts/install_requirements.sh
 ```bash
 cd executorch
 
-# Builds macOS universal test bundle. 
+# Builds macOS universal test bundle.
 
 sh backends/apple/coreml/srcipts/build_tests.sh
 
@@ -40,15 +40,15 @@ cd executorch
 sh backends/apple/coreml/srcipts/run_tests.sh
 
 ```
- 
+
 ## Updating tests
 
 1. Open the Xcode workspace.
 
 ```bash
 cd executorch
 
-# Builds macOS universal test bundle. 
+# Builds macOS universal test bundle.
 
 open backends/apple/coreml/runtime/workspace/executorchcoreml.xcworkspace
 
@@ -62,4 +62,4 @@ cd executorch
 # There is no need to build the tests.
 sh backends/apple/coreml/srcipts/run_tests.sh
 
-``` 
+```
````

backends/apple/coreml/setup.md (2 additions, 2 deletions)

```diff
@@ -4,7 +4,7 @@ This is a tutorial for setting up the Core ML backend.
 
 ## AOT Setup
 
-1. Follow the instructions described in [Setting Up ExecuTorch](/docs/source/getting-started-setup.md) to set up ExecuTorch environment.
+1. Follow the instructions described in [Setting Up ExecuTorch](../../../docs/source/getting-started-setup.rst) to set up ExecuTorch environment.
 
 
 2. Run the example script to validate that the **Core ML** backend is set up correctly.
@@ -28,7 +28,7 @@ delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner())
 
 ## Integrating Core ML delegate into runtime.
 
-1. Follow the instructions described in [Building with CMake](/docs/source/runtime-build-and-cross-compilation.md#building-with-cmake) to set up CMake build system.
+1. Follow the instructions described in [Building with CMake](../../../docs/source/using-executorch-cpp.md#building-with-cmake) to set up CMake build system.
 
 2. Install [Xcode](https://developer.apple.com/xcode/).
 
```

backends/apple/mps/mps_preprocess.py (15 additions, 1 deletion)

```diff
@@ -6,6 +6,7 @@
 from typing import ClassVar, Dict, final, List, Tuple
 
 import torch
+from executorch import exir
 
 from executorch.backends.apple.mps.operators.node_visitor import (
     get_node_visitors,
@@ -35,6 +36,7 @@
 
 from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
 from executorch.exir.program._program import _transform
+from executorch.exir.verification.verifier import EXIREdgeDialectVerifier
 from torch.export.exported_program import ExportedProgram
 
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
@@ -87,7 +89,19 @@ def preprocess(
     # the `output_ids` array in the schema.
 
     # TODO: Remove this once we have a better support for the dim-order ops.
-    edge_program = _transform(edge_program, DimOrderOpsRevertPass())
+    # Need to override the verifier to skip the non dim-order ops from tripping the default verifier.
+    edge_program = _transform(
+        edge_program,
+        DimOrderOpsRevertPass(),
+        override_verifiers=[
+            EXIREdgeDialectVerifier(
+                edge_compile_config=exir.EdgeCompileConfig(
+                    _check_ir_validity=False,  # Disable the edge dialect verifier, since we are in the mps backend.
+                ),
+                class_only=True,
+            )
+        ],
+    )
 
     mps_graph = MPSGraph(
         version="0",
```
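The mps_preprocess.py change follows a common compiler pattern: a transform temporarily produces IR that the default strict verifier would reject, so the caller supplies a permissive verifier for that one transform. A self-contained toy sketch of the pattern (all names here are illustrative; this is not the ExecuTorch `_transform` API):

```python
# Toy sketch of the verifier-override pattern: a pass may emit ops the
# strict verifier rejects, so the caller swaps in a permissive verifier
# for that transform. Names are illustrative, not the ExecuTorch API.
from typing import Callable, List

def strict_verifier(ops: List[str]) -> None:
    # Rejects any op outside the "core" dialect, like the default
    # edge-dialect verifier would.
    assert all(not op.startswith("dim_order_") for op in ops), "non-core op found"

def permissive_verifier(ops: List[str]) -> None:
    # Validity checking disabled, mirroring _check_ir_validity=False.
    pass

def transform(
    ops: List[str],
    pass_fn: Callable[[List[str]], List[str]],
    verifier: Callable[[List[str]], None] = strict_verifier,
) -> List[str]:
    new_ops = pass_fn(ops)
    verifier(new_ops)  # the (possibly overridden) verifier checks the result
    return new_ops

def revert_pass(ops: List[str]) -> List[str]:
    # Stand-in for DimOrderOpsRevertPass: rewrites ops into a form the
    # strict verifier does not accept.
    return ["dim_order_" + op for op in ops]

# With the default strict verifier this transform would raise; the
# override lets the backend-specific IR through.
result = transform(["add", "mul"], revert_pass, verifier=permissive_verifier)
```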

backends/apple/mps/setup.md (7 additions, 7 deletions)

````diff
@@ -12,11 +12,11 @@ The MPS backend device maps machine learning computational graphs and primitives
 :::
 :::{grid-item-card} Tutorials we recommend you complete before this:
 :class-card: card-prerequisites
-* [Introduction to ExecuTorch](intro-how-it-works.md)
-* [Setting up ExecuTorch](getting-started-setup.md)
-* [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md)
-* [ExecuTorch iOS Demo App](demo-apps-ios.md)
-* [ExecuTorch iOS LLaMA Demo App](llm/llama-demo-ios.md)
+* [Introduction to ExecuTorch](../../../docs/source/intro-how-it-works.md)
+* [Setting up ExecuTorch](../../../docs/source/getting-started-setup.rst)
+* [Building ExecuTorch with CMake](../../../docs/source/using-executorch-cpp.md#building-with-cmake)
+* [ExecuTorch iOS Demo App](../../../docs/source/demo-apps-ios.md)
+* [ExecuTorch iOS LLaMA Demo App](../../../docs/source/llm/llama-demo-ios.md)
 :::
 ::::
 
@@ -111,12 +111,12 @@ python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp
 ```
 
 ### Profiling:
-1. [Optional] Generate an [ETRecord](./etrecord.rst) while you're exporting your model.
+1. [Optional] Generate an [ETRecord](../../../docs/source/etrecord.rst) while you're exporting your model.
 ```bash
 cd executorch
 python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_etrecord -b
 ```
-2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./etdump.md).
+2. Run your Program on the ExecuTorch runtime and generate an [ETDump](../../../docs/source/etdump.md).
 ```
 ./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs
 ```
````

backends/arm/__init__.py (10 additions, 0 deletions; new file)

```diff
@@ -0,0 +1,10 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .arm_backend import ArmCompileSpecBuilder  # noqa # usort: skip
+from .tosa_backend import TOSABackend  # noqa # usort: skip
+from .tosa_partitioner import TOSAPartitioner  # noqa # usort: skip
+from .ethosu_backend import EthosUBackend  # noqa # usort: skip
+from .ethosu_partitioner import EthosUPartitioner  # noqa # usort: skip
```

backends/arm/_passes/convert_expand_copy_to_repeat.py (12 additions, 2 deletions)

```diff
@@ -1,16 +1,18 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
-# All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
 # pyre-unsafe
 
+import logging
 from typing import cast
 
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
 
+logger = logging.getLogger(__name__)
+
 
 class ConvertExpandCopyToRepeatPass(ExportPass):
     """
@@ -41,6 +43,14 @@ def call_operator(self, op, args, kwargs, meta):
             multiples[i] if multiples[i] != -1 and extended_shape[i] == 1 else 1
             for i in range(expanded_rank)
         ]
+
+        if all((x == 1 for x in multiples)):
+            # All dimensions/repetitions occur only once. Remove node
+            # altogether since it's in practice just a copy.
+            logger.warning("Found redundant expand node (no-op). Removing it.")
+
+            return args[0]
+
         return super().call_operator(
             op=self.repeat, args=(args[0], multiples), kwargs=kwargs, meta=meta
         )
```
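The pass above computes repeat multiples from the expand target shape and now short-circuits when every multiple is 1, since such an expand is just a copy. The arithmetic can be sketched standalone (pure Python, no ExecuTorch imports; the helper name is made up):

```python
# Standalone sketch of the multiples computation in
# ConvertExpandCopyToRepeatPass, plus the new no-op check. Only the
# arithmetic mirrors the pass; the helper name is illustrative.
from typing import List, Optional

def expand_to_repeat_multiples(
    input_shape: List[int], expand_shape: List[int]
) -> Optional[List[int]]:
    """Return repeat multiples for an expand, or None when the expand
    is a pure copy (every multiple is 1) and the node can be dropped."""
    expanded_rank = len(expand_shape)
    # Left-pad the input shape with 1s so both shapes have the same rank.
    extended_shape = [1] * (expanded_rank - len(input_shape)) + list(input_shape)
    # A dim is repeated only when the input dim is 1 and the target
    # size is not the -1 "keep as is" marker.
    multiples = [
        expand_shape[i] if expand_shape[i] != -1 and extended_shape[i] == 1 else 1
        for i in range(expanded_rank)
    ]
    if all(x == 1 for x in multiples):
        return None  # redundant expand: the pass now removes the node
    return multiples

assert expand_to_repeat_multiples([1, 3], [4, 2, 3]) == [4, 2, 1]
assert expand_to_repeat_multiples([2, 3], [2, 3]) is None
```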

backends/arm/ethosu_backend.py (2 additions, 2 deletions)

```diff
@@ -14,9 +14,9 @@
 import logging
 from typing import final, List
 
-from executorch.backends.arm.arm_vela import vela_compile
+from executorch.backends.arm import TOSABackend
 
-from executorch.backends.arm.tosa_backend import TOSABackend
+from executorch.backends.arm.arm_vela import vela_compile
 from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from torch.export.exported_program import ExportedProgram
```

backends/arm/ethosu_partitioner.py (1 addition, 2 deletions)

```diff
@@ -10,8 +10,7 @@
 from executorch.backends.arm.arm_backend import (
     is_ethosu,
 )  # usort: skip
-from executorch.backends.arm.ethosu_backend import EthosUBackend
-from executorch.backends.arm.tosa_partitioner import TOSAPartitioner
+from executorch.backends.arm import EthosUBackend, TOSAPartitioner
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.backend.partitioner import DelegationSpec
 from torch.fx.passes.operator_support import OperatorSupportBase
```
