pytorch
diff --git a/‎.ci/scripts/test.sh
Lines changed: 2 additions & 3 deletions b/‎.ci/scripts/test.sh
Lines changed: 2 additions & 3 deletions
diff --git a/‎.ci/scripts/test_quantized_aot_lib.sh
Lines changed: 1 addition & 2 deletions b/‎.ci/scripts/test_quantized_aot_lib.sh
Lines changed: 1 addition & 2 deletions
diff --git a/‎.ci/scripts/utils.sh
Lines changed: 1 addition & 1 deletion b/‎.ci/scripts/utils.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/android.yml
Lines changed: 9 additions & 2 deletions b/‎.github/workflows/android.yml
Lines changed: 9 additions & 2 deletions
diff --git a/‎.github/workflows/doc-build.yml
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/doc-build.yml
Lines changed: 2 additions & 1 deletion
diff --git a/‎CMakeLists.txt
Lines changed: 7 additions & 4 deletions b/‎CMakeLists.txt
Lines changed: 7 additions & 4 deletions
diff --git a/‎README.md
Lines changed: 12 additions & 17 deletions b/‎README.md
Lines changed: 12 additions & 17 deletions
diff --git a/‎backends/apple/mps/partition/mps_partitioner.py
Lines changed: 1 addition & 1 deletion b/‎backends/apple/mps/partition/mps_partitioner.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/apple/mps/setup.md
Lines changed: 66 additions & 35 deletions b/‎backends/apple/mps/setup.md
Lines changed: 66 additions & 35 deletions
diff --git a/‎backends/qualcomm/setup.md
Lines changed: 0 additions & 1 deletion b/‎backends/qualcomm/setup.md
Lines changed: 0 additions & 1 deletion
diff --git a/‎backends/vulkan/partitioner/vulkan_partitioner.py
Lines changed: 4 additions & 0 deletions b/‎backends/vulkan/partitioner/vulkan_partitioner.py
Lines changed: 4 additions & 0 deletions
@@ -37,7 +37,7 @@ build_cmake_executor_runner() {
   (rm -rf ${CMAKE_OUTPUT_DIR} \
     && mkdir ${CMAKE_OUTPUT_DIR} \
     && cd ${CMAKE_OUTPUT_DIR} \
-    && retry cmake -DBUCK2=buck2 -DCMAKE_BUILD_TYPE=Release \
+    && retry cmake -DCMAKE_BUILD_TYPE=Release \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
 
   cmake --build ${CMAKE_OUTPUT_DIR} -j4
@@ -84,8 +84,7 @@ build_cmake_xnn_executor_runner() {
   (rm -rf ${CMAKE_OUTPUT_DIR} \
     && mkdir ${CMAKE_OUTPUT_DIR} \
     && cd ${CMAKE_OUTPUT_DIR} \
-    && retry cmake -DBUCK2=buck2 \
-      -DCMAKE_BUILD_TYPE=Release \
+    && retry cmake -DCMAKE_BUILD_TYPE=Release \
       -DEXECUTORCH_BUILD_XNNPACK=ON \
       -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
 
@@ -21,8 +21,7 @@ build_cmake_quantized_aot_lib() {
   (rm -rf ${CMAKE_OUTPUT_DIR} \
     && mkdir ${CMAKE_OUTPUT_DIR} \
     && cd ${CMAKE_OUTPUT_DIR} \
-    && retry cmake -DBUCK2=buck2 \
-      -DCMAKE_BUILD_TYPE=Release \
+    && retry cmake -DCMAKE_BUILD_TYPE=Release \
       -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
       -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
 
@@ -99,7 +99,7 @@ build_executorch_runner_cmake() {
   pushd "${CMAKE_OUTPUT_DIR}" || return
   # This command uses buck2 to gather source files and buck2 could crash flakily
   # on MacOS
-  retry cmake -DBUCK2=buck2 -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE=Release ..
+  retry cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE=Release ..
   popd || return
 
   if [ "$(uname)" == "Darwin" ]; then
 
@@ -49,11 +49,18 @@ jobs:
         bash build/test_android_ci.sh
 
         mkdir -p artifacts-to-be-uploaded
+        mkdir -p artifacts-to-be-uploaded/arm64-v8a/
+        mkdir -p artifacts-to-be-uploaded/x86_64/
+        # Copy the jar to S3
+        cp extension/android/build/libs/executorch.jar artifacts-to-be-uploaded/
         # Copy the app and its test suite to S3
         cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk artifacts-to-be-uploaded/
         cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk artifacts-to-be-uploaded/
-        # Also copy the share libraries
-        cp cmake-out-android/lib/*.a artifacts-to-be-uploaded/
+        # Also copy the libraries
+        cp cmake-out-android-arm64-v8a/lib/*.a artifacts-to-be-uploaded/arm64-v8a/
+        cp cmake-out-android-arm64-v8a/extension/android/*.so artifacts-to-be-uploaded/arm64-v8a/
+        cp cmake-out-android-x86_64/lib/*.a artifacts-to-be-uploaded/x86_64/
+        cp cmake-out-android-x86_64/extension/android/*.so artifacts-to-be-uploaded/x86_64/
 
   # Upload the app and its test suite to S3 so that they can be downloaded by the test job
   upload-artifacts:
 
@@ -8,6 +8,7 @@ on:
       - release/*
     tags:
       - v[0-9]+.[0-9]+.[0-9]+
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
   workflow_dispatch:
   schedule:
     - cron: '0 0 * * *'
@@ -48,7 +49,7 @@ jobs:
 
         GITHUB_REF=${{ github.ref }}
         echo "$GITHUB_REF"
-        ET_VERSION_DOCS="${GITHUB_REF}"
+        export ET_VERSION_DOCS="${GITHUB_REF}"
         echo "$ET_VERSION_DOCS"
 
         set -eux
 
@@ -558,10 +558,6 @@ if(EXECUTORCH_BUILD_PYBIND)
     list(APPEND _dep_libs xnnpack_backend XNNPACK)
   endif()
 
-  if(EXECUTORCH_BUILD_CUSTOM)
-    list(APPEND _dep_libs custom_ops)
-  endif()
-
   if(EXECUTORCH_BUILD_QUANTIZED)
     target_link_options_shared_lib(quantized_ops_lib)
     list(APPEND _dep_libs quantized_kernels quantized_ops_lib)
@@ -571,6 +567,13 @@ if(EXECUTORCH_BUILD_PYBIND)
   if(EXECUTORCH_BUILD_CUSTOM_OPS_AOT AND NOT APPLE)
     list(APPEND _dep_libs custom_ops_aot_lib)
   endif()
+  # TODO(laryliu): Fix Linux duplicate registration problem. In GH CI worker
+  # libcustom_ops.a doesn't dedup with the one indirectly linked from
+  # libcustom_ops_aot_lib.a
+  if(EXECUTORCH_BUILD_CUSTOM AND APPLE)
+    target_link_options_shared_lib(custom_ops)
+    list(APPEND _dep_libs custom_ops)
+  endif()
   # compile options for pybind
 
   set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti
 
@@ -20,31 +20,26 @@ Key value propositions of ExecuTorch are:
 For a comprehensive technical overview of ExecuTorch and step-by-step tutorials,
 please visit our documentation website [for the latest release](https://pytorch.org/executorch/stable/index.html) (or the [main branch](https://pytorch.org/executorch/main/index.html)).
 
-## Important: This is a preview release
+## Feedback
 
-This is a preview version of ExecuTorch and should be used for testing and
-evaluation purposes only. It is not recommended for use in production settings.
 We welcome any feedback, suggestions, and bug reports from the community to help
-us improve the technology. Please use the [PyTorch
+us improve our technology. Please use the [PyTorch
 Forums](https://discuss.pytorch.org/c/executorch) for discussion and feedback
 about ExecuTorch using the **ExecuTorch** category, and our [GitHub
 repository](https://github.com/pytorch/executorch/issues) for bug reporting.
 
-The ExecuTorch code and APIs are still changing quickly, and there are not yet
-any guarantees about forward/backward source compatibility. We recommend using
-the latest `v#.#.#` release tag from the
-[Releases](https://github.com/pytorch/executorch/releases) page when
-experimenting with this preview release.
+We recommend using the latest release tag from the
+[Releases](https://github.com/pytorch/executorch/releases) page when developing.
 
 ## Directory Structure
 
 ```
 executorch
 ├── backends                        #  Backend delegate implementations.
 ├── build                           #  Utilities for managing the build system.
-├── bundled_program                 #  Utilities for attaching reference inputs and outputs to models. TODO move to extension
-├── codegen                         #  Tooling to autogenerate bindings between kernels and the runtime. TODO move to tool
-├── configurations                  #  TODO delete this
+├── bundled_program                 #  Utilities for attaching reference inputs and outputs to models.
+├── codegen                         #  Tooling to autogenerate bindings between kernels and the runtime.
+├── configurations
 ├── docs                            #  Static docs tooling
 ├── examples                        #  Examples of various user flows, such as model export, delegates, and runtime execution.
 ├── exir                            #  Ahead of time library, model capture and lowering apis.
@@ -69,20 +64,20 @@ executorch
 |   ├── portable                    #  Reference implementations of ATen operators.
 |   ├── prim_ops                    #  Special ops used in executorch runtime for control flow and symbolic primitives.
 |   ├── quantized
-├── profiler                        #  Utilities for profiling. TODO delete in favor of ETDump in sdk/
-├── runtime                         #  core cpp runtime of executorch
+├── profiler                        #  Utilities for profiling.
+├── runtime                         #  Core cpp runtime
 |   ├── backend                     #  Backend delegate runtime APIs
 |   ├── core                        #  Core structures used across all levels of the runtime
 |   ├── executor                    #  Model loading, initialization, and execution.
 |   ├── kernel                      #  Kernel registration and management.
 |   ├── platform                    #  Layer between architecture specific code and user calls.
-├── schema                          #  ExecuTorch program definition, TODO move under serialization/
+├── schema                          #  ExecuTorch program definition
 ├── scripts                         #  Utility scripts for size management, dependency management, etc.
 ├── sdk                             #  Model profiling, debugging, and introspection.
 ├── shim                            #  Compatibility layer between OSS and Internal builds
 ├── test                            #  Broad scoped end2end tests
-├── third-party                     #  third-party dependencies
-├── util                            #  TODO delete this
+├── third-party                     #  Third-party dependencies
+├── util
 ```
 
 ## License
 
@@ -102,7 +102,7 @@ def use_metal_kernel(self, node: torch.fx.Node):
     def tag_nodes(self, partitions: List[Partition]) -> None:
         for partition in partitions:
             crt_partition_counter = 0
-            for node in sorted(partition.nodes):
+            for node in partition.nodes:
                 delegation_tag = f"mps_{partition.id}"
                 if self.use_metal_kernel(node):
                     logging.warning(f"[WARNING] Using Metal kernel for op {node.name}!")
 
@@ -15,15 +15,28 @@ The MPS backend device maps machine learning computational graphs and primitives
 * [Introduction to ExecuTorch](intro-how-it-works.md)
 * [Setting up ExecuTorch](getting-started-setup.md)
 * [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md)
+* [ExecuTorch iOS Demo App](demo-apps-ios.md)
+* [ExecuTorch iOS LLaMA Demo App](llm/llama-demo-ios.md)
 :::
 ::::
 
 
 ## Prerequisites (Hardware and Software)
 
-In order to be able to successfully build and run a model using the MPS backend for ExecuTorch, you'll need the following hardware and software components.
- - macOS 12 / iOS 15 or later (for MPS runtime)
- - Xcode command-line tools: xcode-select --install
+In order to be able to successfully build and run a model using the MPS backend for ExecuTorch, you'll need the following hardware and software components:
+
+### Hardware:
+ - A [Mac](https://www.apple.com/mac/) for tracing the model
+
+### Software:
+
+  - **Ahead of time** tracing:
+    - [macOS](https://www.apple.com/macos/) 12
+
+  - **Runtime**:
+    - [macOS](https://www.apple.com/macos/) >= 12.4
+    - [iOS](https://www.apple.com/ios) >= 15.4
+    - [Xcode](https://developer.apple.com/xcode/) >= 14.1
 
 ## Setting up Developer Environment
 
@@ -40,47 +53,34 @@ In order to be able to successfully build and run a model using the MPS backend
 ### AOT (Ahead-of-time) Components
 
 **Compiling model for MPS delegate**:
-- In this step, you will generate a simple ExecuTorch program that lowers MobileNetV3 model to the MPS delegate. You'll then pass this Program(the `.pte` file) during the runtime to run it using the MPS backend.
+- In this step, you will generate a simple ExecuTorch program that lowers MobileNetV3 model to the MPS delegate. You'll then pass this Program (the `.pte` file) during the runtime to run it using the MPS backend.
 
 ```bash
 cd executorch
-python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --bundled
+# Note: By default, the `mps_example` script uses the MPSPartitioner for ops that are not yet supported by the MPS delegate. To turn it off, pass `--no-use_partitioner`.
+python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --bundled --use_fp16
+
+# To see all options, run the following command:
+python3 -m examples.apple.mps.scripts.mps_example --help
 ```
 
 ### Runtime
 
-**Building the MPS executor runner**
-- In this step, you'll be building the `mps_executor_runner` that is able to run MPS lowered modules.
-
+**Building the MPS executor runner:**
 ```bash
-# Build the mps_executor_runner
+# In this step, you'll be building the `mps_executor_runner` that is able to run MPS lowered modules:
+cd executorch
+./examples/apple/mps/scripts/build_mps_executor_runner.sh
+```
+
+## Run the mv3 generated model using the mps_executor_runner
+
 ```bash
-# Build and install executorch
-cmake -DBUCK2="$BUCK" \
-          -DCMAKE_INSTALL_PREFIX=cmake-out \
-          -DCMAKE_BUILD_TYPE=Release \
-          -DEXECUTORCH_BUILD_SDK=ON \
-          -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-          -DEXECUTORCH_BUILD_MPS=ON \
-          -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-          -Bcmake-out .
-cmake --build cmake-out -j9 --target install --config Release
-CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags"
-# build mps_executor_runner
-rm -rf cmake-out/examples/apple/mps
-cmake \
-    -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-    -Bcmake-out/examples/apple/mps \
-    examples/apple/mps
-
-cmake --build cmake-out/examples/apple/mps -j9 --config Release
-
-# Run the mv2 generated model using the mps_executor_runner
 ./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program
+```
 
-# You should see the following results. Note that no output file will be generated in this example:
+- You should see the following results. Note that no output file will be generated in this example:
+```
 I 00:00:00.003290 executorch:mps_executor_runner.mm:286] Model file mv3_mps_bundled_fp16.pte is loaded.
 I 00:00:00.003306 executorch:mps_executor_runner.mm:292] Program methods: 1
 I 00:00:00.003308 executorch:mps_executor_runner.mm:294] Running method forward
@@ -94,12 +94,43 @@ I 00:00:00.118731 executorch:mps_executor_runner.mm:438] Model executed successf
 I 00:00:00.122615 executorch:mps_executor_runner.mm:501] Model verified successfully.
 ```
 
+### [Optional] Run the generated model directly using pybind
+1. Make sure `pybind` MPS support is installed:
+```bash
+./install_requirements.sh --pybind mps
+```
+2. Run the `mps_example` script to trace the model and run it directly from python:
+```bash
+cd executorch
+# Check correctness between PyTorch eager forward pass and ExecuTorch MPS delegate forward pass
+python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp16 --check_correctness
+# You should see the following output: `Results between ExecuTorch forward pass with MPS backend and PyTorch forward pass for mv3_mps are matching!`
+
+# Check performance between PyTorch MPS forward pass and ExecuTorch MPS forward pass
+python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp16 --bench_pytorch
+```
+
+### Profiling:
+1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) while you're exporting your model.
+```bash
+cd executorch
+python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_etrecord -b
+```
+2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./sdk-etdump.md).
+```
+./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs
+```
+3. Create an instance of the Inspector API by passing in the ETDump you have sourced from the runtime along with the optionally generated ETRecord from step 1.
+```bash
+python3 -m sdk.inspector.inspector_cli --etdump_path etdump.etdp --etrecord_path etrecord.bin
+```
+
 ## Deploying and Running on Device
 
 ***Step 1***. Create the ExecuTorch core and MPS delegate frameworks to link on iOS
 ```bash
 cd executorch
-./build/build_apple_frameworks.sh --Release --mps
+./build/build_apple_frameworks.sh --mps
 ```
 
 `mps_delegate.xcframework` will be in `cmake-out` folder, along with `executorch.xcframework` and `portable_delegate.xcframework`:
@@ -123,4 +154,4 @@ In this tutorial, you have learned how to lower a model to the MPS delegate, bui
 
 ## Frequently encountered errors and resolution.
 
-If you encountered any bugs or issues following this tutorial please file a bug/issue on the ExecuTorch repository, with hashtag **#mps**.
+If you encounter any bugs or issues while following this tutorial, please file a bug/issue on the [ExecuTorch repository](https://github.com/pytorch/executorch/issues) with the hashtag **#mps**.
@@ -93,7 +93,6 @@ mkdir build_android
 cd build_android
 # build executorch & qnn_executorch_backend
 cmake .. \
-    -DBUCK2=buck2 \
     -DCMAKE_INSTALL_PREFIX=$PWD \
     -DEXECUTORCH_BUILD_QNN=ON \
     -DQNN_SDK_ROOT=$QNN_SDK_ROOT \
 
@@ -52,6 +52,10 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             exir_ops.edge.aten.convolution.default,
             # Normalization
             exir_ops.edge.aten.native_layer_norm.default,
+            # Shape-related operators
+            exir_ops.edge.aten.select_copy.int,
+            exir_ops.edge.aten.unsqueeze_copy.default,
+            exir_ops.edge.aten.view_copy.default,
             # Other
             operator.getitem,
             exir_ops.edge.aten.full.default,