Skip to content

Commit 53f8a14

Browse files
committed
Update on "[Executorch] Refactor op_mul's broadcasting utils"
Summary: Refactoring broadcast handling utils that were added for op_mul. This is in preparation to use these utils to handle broadcasting for other ops such as add, sub, and div. Also removes a redundant test. Test Plan: optimized_kernels_test in CI Reviewers: Subscribers: Tasks: Tags: cc larryliu0820 manuelcandales Differential Revision: [D69491816](https://our.internmc.facebook.com/intern/diff/D69491816) [ghstack-poisoned]
2 parents 3029ca6 + 77eb1f3 commit 53f8a14

File tree

88 files changed

+3965
-1789
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

88 files changed

+3965
-1789
lines changed

.ci/scripts/test_qnn_static_llama.sh

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/bin/bash
2+
# Copyright (c) Qualcomm Innovation Center, Inc.
3+
# All rights reserved
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
set -exu
9+
10+
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
11+
12+
export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
13+
export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
14+
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
15+
export PYTHONPATH=".."
16+
cp schema/program.fbs exir/_serialize/program.fbs
17+
cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
18+
cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
19+
cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
20+
21+
if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
22+
PYTHON_EXECUTABLE=python3
23+
fi
24+
25+
which "${PYTHON_EXECUTABLE}"
26+
27+
# Although static llama CI does not require graphviz, it is required by test_qnn_delegate.py
28+
pip install graphviz
29+
30+
# Download stories llama110m artifacts
31+
download_stories_model_artifacts
32+
echo "Creating tokenizer.bin"
33+
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
34+
35+
set +e
36+
# Compile only as weight sharing is not applicable on x86
37+
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
38+
exit_code1=$?
39+
40+
# Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
41+
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64
42+
exit_code2=$?
43+
44+
# Check the exit codes and print messages
45+
if [ $exit_code1 -ne 0 ]; then
46+
echo "Static Llama compile only with weight sharing test failed. $exit_code1."
47+
fi
48+
49+
if [ $exit_code2 -ne 0 ]; then
50+
echo "Static Llama accuracy test failed. $exit_code2."
51+
fi
52+
53+
# Return failure if either program failed
54+
if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
55+
exit 1
56+
else
57+
exit 0
58+
fi
59+
set -e

.github/workflows/android-perf.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ jobs:
222222
--preq_mode 8da4w_output_8da8w \
223223
--preq_group_size 32 \
224224
--max_seq_length 2048 \
225+
--max_context_length 2048 \
225226
--output_name "${OUT_ET_MODEL_NAME}.pte" \
226227
-kv \
227228
-d fp32 \
@@ -253,6 +254,7 @@ jobs:
253254
--xnnpack-extended-ops \
254255
-d fp32 \
255256
--max_seq_length 2048 \
257+
--max_context_length 2048 \
256258
--output_name "${OUT_ET_MODEL_NAME}.pte" \
257259
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
258260
ls -lh "${OUT_ET_MODEL_NAME}.pte"

.github/workflows/apple-perf.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ jobs:
233233
--preq_mode 8da4w_output_8da8w \
234234
--preq_group_size 32 \
235235
--max_seq_length 2048 \
236+
--max_context_length 2048 \
236237
--output_name "${OUT_ET_MODEL_NAME}.pte" \
237238
-kv \
238239
-d fp32 \
@@ -264,6 +265,7 @@ jobs:
264265
--xnnpack-extended-ops \
265266
-d fp32 \
266267
--max_seq_length 2048 \
268+
--max_context_length 2048 \
267269
--output_name "${OUT_ET_MODEL_NAME}.pte" \
268270
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
269271
ls -lh "${OUT_ET_MODEL_NAME}.pte"

.github/workflows/pull.yml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,39 @@ jobs:
437437
# Test llama2
438438
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
439439
440+
test-static-llama-qnn-linux:
441+
name: test-static-llama-qnn-linux
442+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
443+
permissions:
444+
id-token: write
445+
contents: read
446+
strategy:
447+
fail-fast: false
448+
with:
449+
runner: linux.2xlarge
450+
docker-image: executorch-ubuntu-22.04-qnn-sdk
451+
submodules: 'true'
452+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
453+
timeout: 180
454+
script: |
455+
# The generic Linux job chooses to use base env, not the one setup by the image
456+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
457+
conda activate "${CONDA_ENV}"
458+
459+
BUILD_TOOL="cmake"
460+
461+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
462+
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
463+
464+
# Setup executorch
465+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
466+
467+
# Setup install_requirements for llama
468+
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
469+
470+
# Test static llama weight sharing and accuracy
471+
PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh
472+
440473
test-qnn-models-linux:
441474
name: test-qnn-models-linux
442475
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

.lintrunner.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ exclude_patterns = [
7878
# File contains @generated
7979
'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
8080
'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h',
81+
# Want to be able to keep c10 in sync with PyTorch core.
82+
'runtime/core/portable_type/c10/**',
8183
]
8284
command = [
8385
'python',
@@ -261,6 +263,8 @@ exclude_patterns = [
261263
'extension/**',
262264
'kernels/optimized/**',
263265
'runtime/core/exec_aten/**',
266+
# Want to be able to keep c10 in sync with PyTorch core.
267+
'runtime/core/portable_type/c10/**',
264268
'runtime/executor/tensor_parser_aten.cpp',
265269
'scripts/**',
266270
'test/**',

CMakeLists.txt

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,7 @@ if(NOT "${_repo_dir_name}" STREQUAL "executorch")
373373
"fix for this restriction."
374374
)
375375
endif()
376-
set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/..)
376+
set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type)
377377

378378
#
379379
# The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
@@ -556,6 +556,7 @@ endif()
556556
target_include_directories(
557557
executorch_core PUBLIC ${_common_include_directories}
558558
)
559+
target_compile_definitions(executorch_core PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
559560
target_compile_options(executorch_core PUBLIC ${_common_compile_options})
560561
if(MAX_KERNEL_NUM)
561562
target_compile_definitions(
@@ -576,6 +577,7 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE)
576577
target_include_directories(
577578
executorch_core_shared PUBLIC ${_common_include_directories}
578579
)
580+
target_compile_definitions(executorch_core_shared PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
579581
target_compile_options(
580582
executorch_core_shared PUBLIC ${_common_compile_options}
581583
)
@@ -594,8 +596,9 @@ endif()
594596
# any backends.
595597
#
596598
add_library(executorch ${_executorch__srcs})
597-
target_link_libraries(executorch PRIVATE executorch_core)
599+
target_link_libraries(executorch PUBLIC executorch_core)
598600
target_include_directories(executorch PUBLIC ${_common_include_directories})
601+
target_compile_definitions(executorch PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
599602
target_compile_options(executorch PUBLIC ${_common_compile_options})
600603
target_link_options_shared_lib(executorch)
601604

@@ -629,6 +632,12 @@ endif()
629632

630633
# Install `executorch` library as well as `executorch-config.cmake` under
631634
# ${CMAKE_INSTALL_PREFIX}/
635+
install(DIRECTORY runtime/core/ DESTINATION include/executorch/runtime/core FILES_MATCHING PATTERN "*.h")
636+
install(DIRECTORY runtime/kernel/ DESTINATION include/executorch/runtime/kernel FILES_MATCHING PATTERN "*.h")
637+
install(DIRECTORY runtime/platform/ DESTINATION include/executorch/runtime/platform FILES_MATCHING PATTERN "*.h")
638+
install(DIRECTORY extension/kernel_util/ DESTINATION include/executorch/extension/kernel_util FILES_MATCHING PATTERN "*.h")
639+
install(DIRECTORY extension/tensor/ DESTINATION include/executorch/extension/tensor FILES_MATCHING PATTERN "*.h")
640+
install(DIRECTORY extension/threadpool/ DESTINATION include/executorch/extension/threadpool FILES_MATCHING PATTERN "*.h")
632641
install(
633642
TARGETS executorch executorch_core
634643
DESTINATION lib
@@ -792,6 +801,8 @@ if(EXECUTORCH_BUILD_PYBIND)
792801
target_include_directories(
793802
util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS}
794803
)
804+
target_compile_definitions(util PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
805+
795806
target_compile_options(util PUBLIC ${_pybind_compile_options})
796807
target_link_libraries(util PRIVATE torch c10 executorch extension_tensor)
797808

backends/apple/coreml/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,8 @@ target_include_directories(
134134
coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util
135135
)
136136
target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..)
137+
target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type)
138+
target_compile_definitions(coremldelegate PRIVATE C10_USING_CUSTOM_GENERATED_MACROS)
137139
target_link_libraries(coremldelegate PRIVATE executorch_core)
138140

139141
if(EXECUTORCH_BUILD_DEVTOOLS)

backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,7 @@
830830
GCC_OPTIMIZATION_LEVEL = 0;
831831
GCC_PREPROCESSOR_DEFINITIONS = (
832832
"DEBUG=1",
833+
"C10_USING_CUSTOM_GENERATED_MACROS",
833834
"$(inherited)",
834835
);
835836
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
@@ -911,6 +912,7 @@
911912
DEVELOPMENT_TEAM = "";
912913
GCC_PREPROCESSOR_DEFINITIONS = (
913914
"DEBUG=1",
915+
"C10_USING_CUSTOM_GENERATED_MACROS",
914916
"ET_EVENT_TRACER_ENABLED=1",
915917
"$(inherited)",
916918
);
@@ -920,6 +922,7 @@
920922
"$(SRCROOT)/../kvstore",
921923
"$(SRCROOT)/../inmemoryfs",
922924
"$(SRCROOT)/../include",
925+
"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
923926
"$(SRCROOT)/../sdk",
924927
"$(SRCROOT)/../util",
925928
"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
@@ -951,6 +954,7 @@
951954
"$(SRCROOT)/../kvstore",
952955
"$(SRCROOT)/../inmemoryfs",
953956
"$(SRCROOT)/../include",
957+
"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
954958
"$(SRCROOT)/../sdk",
955959
"$(SRCROOT)/../util",
956960
"$(SRCROOT)/../../third-party/nlohmann_json/single_include",

backends/arm/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ endif()
1414

1515
include(${EXECUTORCH_ROOT}/build/Utils.cmake)
1616

17-
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
17+
set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type)
18+
add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)
1819

1920
# Third-party folder and Ethos-U driver inclued
2021
set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
# Copyright 2025 Arm Limited and/or its affiliates.

# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import pytest

from executorch.backends.arm.test import common, conftest

from executorch.backends.arm.test.tester.arm_tester import ArmTester
from executorch.examples.models import deeplab_v3


class TestDl3(unittest.TestCase):
    """Tests DeepLabv3."""

    # Instantiate the model wrapper once at class-definition time; keep the
    # example inputs, then swap the wrapper for its eager nn.Module.
    dl3 = deeplab_v3.DeepLabV3ResNet50Model()
    model_inputs = dl3.get_example_inputs()
    dl3 = dl3.get_eager_model()

    @unittest.expectedFailure
    def test_dl3_tosa_MI(self):
        """Lower with the floating-point TOSA-0.80+MI profile and compare outputs."""
        stage = ArmTester(
            self.dl3,
            example_inputs=self.model_inputs,
            compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
        )
        stage = stage.export()
        stage = stage.to_edge_transform_and_lower()
        stage = stage.to_executorch()
        stage.run_method_and_compare_outputs(self.model_inputs)

    @unittest.expectedFailure
    def test_dl3_tosa_BI(self):
        """Quantize, lower with the TOSA-0.80+BI profile, and compare outputs."""
        stage = ArmTester(
            self.dl3,
            example_inputs=self.model_inputs,
            compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
        )
        stage = stage.quantize()
        stage = stage.export()
        stage = stage.to_edge_transform_and_lower()
        stage = stage.to_executorch()
        stage.run_method_and_compare_outputs(atol=1.0, qtol=1, inputs=self.model_inputs)

    @pytest.mark.slow
    @pytest.mark.corstone_fvp
    @unittest.skip
    def test_dl3_u55_BI(self):
        """Quantize and serialize for Ethos-U55; run on FVP only when enabled."""
        stage = ArmTester(
            self.dl3,
            example_inputs=self.model_inputs,
            compile_spec=common.get_u55_compile_spec(),
        )
        stage = stage.quantize()
        stage = stage.export()
        stage = stage.to_edge_transform_and_lower()
        stage = stage.to_executorch()
        tester = stage.serialize()
        if conftest.is_option_enabled("corstone_fvp"):
            tester.run_method_and_compare_outputs(
                atol=1.0, qtol=1, inputs=self.model_inputs
            )

    @pytest.mark.slow
    @pytest.mark.corstone_fvp
    @unittest.skip
    def test_dl3_u85_BI(self):
        """Quantize and serialize for Ethos-U85; run on FVP only when enabled."""
        stage = ArmTester(
            self.dl3,
            example_inputs=self.model_inputs,
            compile_spec=common.get_u85_compile_spec(),
        )
        stage = stage.quantize()
        stage = stage.export()
        stage = stage.to_edge_transform_and_lower()
        stage = stage.to_executorch()
        tester = stage.serialize()
        if conftest.is_option_enabled("corstone_fvp"):
            tester.run_method_and_compare_outputs(
                atol=1.0, qtol=1, inputs=self.model_inputs
            )

0 commit comments

Comments
 (0)