Skip to content

Commit 312d3e2

Browse files
committed
Update base for Update on "[ExecuTorch] Add //examples/portable/executor_runner:executor_runner_opt"
Attempt to add a Buck target that's analogous to the CMake build's executor_runner -- has all CPU ops that you need etc. The base executor_runner target is commented as intentionally having minimal deps, hence the separate target. This is sort of a companion to #9248, except that that PR is for CMake only and this PR is for Buck only. Differential Revision: [D71220489](https://our.internmc.facebook.com/intern/diff/D71220489/) [ghstack-poisoned]
2 parents 58d3582 + 37fa261 commit 312d3e2

File tree

172 files changed

+3970
-623
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

172 files changed

+3970
-623
lines changed

.ci/scripts/unittest-buck2.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,7 @@ set -eux
88

99
# TODO: expand this to //...
1010
# TODO: can't query cadence & vulkan backends
11-
# TODO: can't query //kernels/prim_ops because of a cpp_unittest and
12-
# broken code in shim to read oss.folly_cxx_tests. Sending fix but it
13-
# needs to propagate and we need a submodule update.
11+
# TODO: can't query //kernels/prim_ops because of non-buckified stuff in OSS.
1412
buck2 query "//backends/apple/... + //backends/example/... + \
1513
//backends/mediatek/... + //backends/test/... + //backends/transforms/... + \
1614
//backends/xnnpack/... + //configurations/... + //kernels/aten/... + \
@@ -20,7 +18,9 @@ buck2 query "//backends/apple/... + //backends/example/... + \
2018
UNBUILDABLE_OPTIMIZED_OPS_REGEX="gelu|fft_r2c|log_softmax"
2119
BUILDABLE_OPTIMIZED_OPS=$(buck2 query //kernels/optimized/cpu/... | grep -E -v $UNBUILDABLE_OPTIMIZED_OPS_REGEX)
2220

23-
BUILDABLE_KERNELS_PRIM_OPS_TARGETS=$(buck2 query //kernels/prim_ops/... | grep -v prim_ops_test_py)
21+
# TODO: build prim_ops_test_cpp again once supported_features works in
22+
# OSS buck.
23+
BUILDABLE_KERNELS_PRIM_OPS_TARGETS=$(buck2 query //kernels/prim_ops/... | grep -v prim_ops_test)
2424
# TODO: expand the covered scope of Buck targets.
2525
# //runtime/kernel/... is failing because //third-party:torchgen_files's shell script can't find python on PATH.
2626
# //runtime/test/... requires Python torch, which we don't have in our OSS buck setup.

.ci/scripts/utils.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@ clean_executorch_install_folders() {
2020
./install_executorch.sh --clean
2121
}
2222

23+
update_tokenizers_git_submodule() {
24+
echo "Updating tokenizers git submodule..."
25+
git submodule update --init
26+
pushd extension/llm/tokenizers
27+
git submodule update --init
28+
popd
29+
}
30+
2331
install_executorch() {
2432
which pip
2533
# Install executorch, this assumes that Executorch is checked out in the

.github/workflows/_android.yml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,25 @@ jobs:
2929
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
3030
3131
# Build LLM Demo for Android
32+
export BUILD_AAR_DIR=aar-out
3233
bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME}
33-
bash build/build_android_instrumentation.sh
34+
bash build/build_android_instrumentation.sh ${ARTIFACTS_DIR_NAME}
35+
36+
mkdir -p ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
37+
bash ".ci/scripts/test_llama.sh" -model stories110M -build_tool cmake -dtype fp16 -mode portable -upload ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
38+
39+
mkdir -p examples/demo-apps/android/LlamaDemo/app/libs
40+
cp aar-out/executorch.aar examples/demo-apps/android/LlamaDemo/app/libs
41+
pushd examples/demo-apps/android/LlamaDemo
42+
ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest
43+
popd
44+
45+
DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo"
46+
# The app directory is named using its build flavor as a suffix.
47+
mkdir -p "${DEMO_APP_DIR}"
48+
# Collect the app and its test suite
49+
cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk "${DEMO_APP_DIR}"
50+
cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk "${DEMO_APP_DIR}"
3451
3552
# Running Android emulator directly on the runner and not using Docker
3653
run-emulator:

.github/workflows/android-perf.yml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -362,8 +362,17 @@ jobs:
362362
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
363363
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
364364
365-
export ANDROID_ABIS="arm64-v8a"
366-
PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME}
365+
mkdir -p aar-out
366+
PYTHON_EXECUTABLE=python ANDROID_ABIS="arm64-v8a" BUILD_AAR_DIR=aar-out EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_library.sh
367+
mkdir -p extension/benchmark/android/benchmark/app/libs
368+
cp aar-out/executorch.aar extension/benchmark/android/benchmark/app/libs
369+
pushd extension/benchmark/android/benchmark
370+
ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest
371+
popd
372+
MINIBENCH_APP_DIR="${ARTIFACTS_DIR_NAME}/minibench"
373+
mkdir -p "${MINIBENCH_APP_DIR}"
374+
cp extension/benchmark/android/benchmark/app/build/outputs/apk/debug/*.apk "${MINIBENCH_APP_DIR}"
375+
cp extension/benchmark/android/benchmark/app/build/outputs/apk/androidTest/debug/*.apk "${MINIBENCH_APP_DIR}"
367376
368377
# Let's see how expensive this job is, we might want to tone it down by running it periodically
369378
benchmark-on-device:

.github/workflows/android-release-artifacts.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,14 @@ jobs:
5252
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool buck2
5353
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
5454
55-
# Build LLM Demo for Android
56-
bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME}
55+
# Build AAR Package
56+
mkdir aar-out
57+
export BUILD_AAR_DIR=aar-out
58+
bash build/build_android_library.sh
59+
mkdir -p "${ARTIFACTS_DIR_NAME}"
60+
cp aar-out/executorch.aar "${ARTIFACTS_DIR_NAME}/executorch.aar"
5761
58-
shasum -a 256 "${ARTIFACTS_DIR_NAME}/llm_demo/executorch.aar"
62+
shasum -a 256 "${ARTIFACTS_DIR_NAME}/executorch.aar"
5963
6064
upload-release-aar:
6165
name: upload-release-aar
@@ -74,7 +78,7 @@ jobs:
7478
- name: Upload AAR RC to AWS S3
7579
shell: bash
7680
run: |
77-
wget https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/executorch.aar
81+
wget https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/executorch.aar
7882
shasum -a 256 executorch.aar > executorch.aar.sha256sums
7983
8084
pip install awscli==1.32.18

.github/workflows/pull.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ jobs:
361361
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
362362
conda activate "${CONDA_ENV}"
363363
364+
./install_requirements.sh --use-pt-pinned-commit
364365
# build module for executorch.extension.pybindings.portable_lib
365366
bash test/build_size_test.sh
366367
strip cmake-out/test/size_test
@@ -396,6 +397,8 @@ jobs:
396397
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
397398
conda activate "${CONDA_ENV}"
398399
400+
./install_requirements.sh --use-pt-pinned-commit
401+
399402
# build module for executorch.extension.pybindings.portable_lib
400403
bash test/build_size_test.sh
401404
strip cmake-out/test/size_test
@@ -510,6 +513,7 @@ jobs:
510513
MODE=${{ matrix.mode }}
511514
PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
512515
516+
./install_requirements.sh --use-pt-pinned-commit
513517
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
514518
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
515519
@@ -541,6 +545,7 @@ jobs:
541545
542546
BUILD_TOOL="cmake"
543547
548+
./install_requirements.sh --use-pt-pinned-commit
544549
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
545550
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
546551

.github/workflows/trunk.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,7 @@ jobs:
572572
MODE=${{ matrix.mode }}
573573
PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
574574
575+
./install_requirements.sh --use-pt-pinned-commit
575576
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
576577
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
577578

CMakeLists.txt

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
# It should also be cmake-lint clean.
4343
#
4444

45-
cmake_minimum_required(VERSION 3.19)
45+
cmake_minimum_required(VERSION 3.24)
4646
project(executorch)
4747
include(build/Utils.cmake)
4848
include(CMakeDependentOption)
@@ -506,13 +506,18 @@ if(EXECUTORCH_BUILD_FLATC)
506506
-DFLATBUFFERS_BUILD_FLATLIB=${FLATBUFFERS_BUILD_FLATLIB}
507507
-DFLATBUFFERS_BUILD_TESTS=${FLATBUFFERS_BUILD_TESTS}
508508
-DFLATBUFFERS_INSTALL=${FLATBUFFERS_INSTALL}
509-
-DCMAKE_BUILD_TYPE=Release
510509
-DCMAKE_CXX_FLAGS="-DFLATBUFFERS_MAX_ALIGNMENT=${FLATBUFFERS_MAX_ALIGNMENT}"
511510
INSTALL_COMMAND ""
512511
BUILD_BYPRODUCTS <BINARY_DIR>/flatc
513512
)
514513
ExternalProject_Get_Property(flatbuffers BINARY_DIR)
515-
set(FLATC_EXECUTABLE ${BINARY_DIR}/flatc)
514+
if(WIN32)
515+
# flatbuffers does not use CMAKE_BUILD_TYPE. Internally, the build forces Release
516+
# config, but from CMake's perspective the build type is always Debug.
517+
set(FLATC_EXECUTABLE ${BINARY_DIR}/$<CONFIG>/flatc.exe)
518+
else()
519+
set(FLATC_EXECUTABLE ${BINARY_DIR}/flatc)
520+
endif()
516521
set(FLATC_EXECUTABLE_BUILT_FROM_SOURCE YES)
517522
endif()
518523

@@ -677,7 +682,7 @@ install(
677682
INCLUDES
678683
DESTINATION ${_common_include_directories}
679684
)
680-
install(FILES build/executorch-config.cmake DESTINATION lib/cmake/ExecuTorch)
685+
install(FILES tools/cmake/executorch-config.cmake DESTINATION lib/cmake/ExecuTorch)
681686

682687
#
683688
# executor_runner: Host tool that demonstrates program execution.
@@ -914,6 +919,14 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
914919
list(APPEND _executor_runner_libs quantized_ops_lib)
915920
endif()
916921

922+
if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
923+
list(APPEND _executor_runner_libs $<LINK_LIBRARY:WHOLE_ARCHIVE,custom_ops>)
924+
endif()
925+
926+
if(EXECUTORCH_BUILD_XNNPACK)
927+
list(APPEND _executor_runner_libs xnnpack_backend)
928+
endif()
929+
917930
if(EXECUTORCH_ENABLE_EVENT_TRACER)
918931
if(EXECUTORCH_BUILD_DEVTOOLS)
919932
list(APPEND _executor_runner_libs etdump flatccrt)

backends/arm/_passes/match_arg_ranks_pass.py

Lines changed: 13 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Copyright (c) Meta Platforms, Inc. and affiliates.
2-
# Copyright 2024 Arm Limited and/or its affiliates.
32
# All rights reserved.
3+
# Copyright 2024-2025 Arm Limited and/or its affiliates.
44
#
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
@@ -23,7 +23,17 @@
2323
class MatchArgRanksPass(ExportPass):
2424
"""
2525
For ops in 'targeted_ops', make sure that the inputs share the same rank.
26-
New dimensions are inserted at from the beginning of the
26+
New dimensions are inserted from the beginning of the inputs that have a
27+
lower rank to match the input with the highest rank.
28+
29+
Example:
30+
input0 = shape(4, 3, 2)
31+
input1 = shape(2)
32+
input2 = shape(3, 1)
33+
Becomes:
34+
input0 = shape(4, 3, 2)
35+
input1 = shape(1, 1, 2)
36+
input2 = shape(1, 3, 1)
2737
"""
2838

2939
def __init__(self, exported_program):
@@ -54,34 +64,6 @@ def _match_op_rank(self, graph_module, node, arg, max_rank):
5464
)
5565
node.replace_input_with(arg, view)
5666

57-
def _match_buffer_rank(self, arg, max_rank):
58-
"""
59-
Change arg's fake tensor meta to match max_rank if:
60-
- arg is found in inputs_to_buffers or inputs_to_parameters.
61-
"""
62-
fake_tensor = get_first_fake_tensor(arg)
63-
shape = fake_tensor.shape
64-
rank = len(shape)
65-
new_shape = list([1] * (max_rank - rank) + list(shape))
66-
67-
buffer_name = None
68-
if arg.name in self.exported_program.graph_signature.inputs_to_buffers:
69-
buffer_name = self.exported_program.graph_signature.inputs_to_buffers[
70-
arg.name
71-
]
72-
elif arg.name in self.exported_program.graph_signature.inputs_to_parameters:
73-
buffer_name = self.exported_program.graph_signature.inputs_to_parameters[
74-
arg.name
75-
]
76-
if buffer_name:
77-
new_tensor = self.exported_program.state_dict[buffer_name].reshape(
78-
new_shape
79-
)
80-
self.exported_program.state_dict[buffer_name] = new_tensor
81-
arg.meta["val"] = fake_tensor.fake_mode.from_tensor(
82-
new_tensor, static_shapes=True
83-
)
84-
8567
def call(self, graph_module: GraphModule) -> PassResult:
8668
for node in graph_module.graph.nodes:
8769
node = cast(Node, node)
@@ -105,12 +87,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
10587
if rank == max_rank:
10688
continue
10789

108-
# If the argument is call_function, match shape by inserting view node.
109-
if arg.op == "call_function":
110-
self._match_op_rank(graph_module, node, arg, max_rank)
111-
else:
112-
# If the argument is a buffer or parameter, adjust shape by changing the fake tensor meta.
113-
self._match_buffer_rank(arg, max_rank)
90+
self._match_op_rank(graph_module, node, arg, max_rank)
11491

11592
graph_module.recompile()
11693
graph_module = super().call(graph_module).graph_module

backends/cadence/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..
2828
add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)
2929

3030
if(EXECUTORCH_CADENCE_CPU_RUNNER)
31-
include(${EXECUTORCH_ROOT}/scripts/build/Codegen.cmake)
31+
include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
3232

3333
if(NOT PYTHON_EXECUTABLE)
3434
resolve_python_executable()
@@ -79,6 +79,7 @@ if(EXECUTORCH_NNLIB_OPT)
7979
set(TARGET_DIR hifi)
8080
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib
8181
${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
82+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels)
8283
elseif(EXECUTORCH_FUSION_G3_OPT)
8384
set(TARGET_DIR fusion_g3)
8485
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib
@@ -87,5 +88,5 @@ else()
8788
set(TARGET_DIR reference)
8889
endif()
8990

90-
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels)
91+
9192
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/operators)

0 commit comments

Comments (0)