Skip to content

Commit 53f8a14

Browse files
committed
Update on "[Executorch] Refactor op_mul's broadcasting utils"
Summary: Refactoring broadcast handling utils that were added for op_mul. This is in preparation to use these utils to handle broadcasting for other ops such as add, sub, and div. Also removes a redundant test. Test Plan: optimized_kernels_test in CI Reviewers: Subscribers: Tasks: Tags: cc larryliu0820 manuelcandales Differential Revision: [D69491816](https://our.internmc.facebook.com/intern/diff/D69491816) [ghstack-poisoned]
2 parents 3029ca6 + 77eb1f3 commit 53f8a14

File tree

88 files changed

+3965
-1789
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

88 files changed

+3965
-1789
lines changed

.ci/scripts/test_qnn_static_llama.sh

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/bin/bash
2+
# Copyright (c) Qualcomm Innovation Center, Inc.
3+
# All rights reserved
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
set -exu
9+
10+
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
11+
12+
export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
13+
export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
14+
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
15+
export PYTHONPATH=".."
16+
cp schema/program.fbs exir/_serialize/program.fbs
17+
cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
18+
cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
19+
cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
20+
21+
if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
22+
PYTHON_EXECUTABLE=python3
23+
fi
24+
25+
which "${PYTHON_EXECUTABLE}"
26+
27+
# Although static llama CI does not require graphviz, it is required by test_qnn_delegate.py
28+
pip install graphviz
29+
30+
# Download stories llama110m artifacts
31+
download_stories_model_artifacts
32+
echo "Creating tokenizer.bin"
33+
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
34+
35+
set +e
36+
# Compile only as weight sharing is not applicable on x86
37+
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
38+
exit_code1=$?
39+
40+
# Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
41+
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64
42+
exit_code2=$?
43+
44+
# Check the exit codes and print messages
45+
if [ $exit_code1 -ne 0 ]; then
46+
echo "Static Llama compile only with weight sharing test failed. $exit_code1."
47+
fi
48+
49+
if [ $exit_code2 -ne 0 ]; then
50+
echo "Static Llama accuracy test failed. $exit_code2."
51+
fi
52+
53+
# Return failure if either program failed
54+
if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
55+
exit 1
56+
else
57+
exit 0
58+
fi
59+
set -e

.github/workflows/android-perf.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ jobs:
222222
--preq_mode 8da4w_output_8da8w \
223223
--preq_group_size 32 \
224224
--max_seq_length 2048 \
225+
--max_context_length 2048 \
225226
--output_name "${OUT_ET_MODEL_NAME}.pte" \
226227
-kv \
227228
-d fp32 \
@@ -253,6 +254,7 @@ jobs:
253254
--xnnpack-extended-ops \
254255
-d fp32 \
255256
--max_seq_length 2048 \
257+
--max_context_length 2048 \
256258
--output_name "${OUT_ET_MODEL_NAME}.pte" \
257259
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
258260
ls -lh "${OUT_ET_MODEL_NAME}.pte"

.github/workflows/apple-perf.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ jobs:
233233
--preq_mode 8da4w_output_8da8w \
234234
--preq_group_size 32 \
235235
--max_seq_length 2048 \
236+
--max_context_length 2048 \
236237
--output_name "${OUT_ET_MODEL_NAME}.pte" \
237238
-kv \
238239
-d fp32 \
@@ -264,6 +265,7 @@ jobs:
264265
--xnnpack-extended-ops \
265266
-d fp32 \
266267
--max_seq_length 2048 \
268+
--max_context_length 2048 \
267269
--output_name "${OUT_ET_MODEL_NAME}.pte" \
268270
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
269271
ls -lh "${OUT_ET_MODEL_NAME}.pte"

.github/workflows/pull.yml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,39 @@ jobs:
437437
# Test llama2
438438
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
439439
440+
test-static-llama-qnn-linux:
441+
name: test-static-llama-qnn-linux
442+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
443+
permissions:
444+
id-token: write
445+
contents: read
446+
strategy:
447+
fail-fast: false
448+
with:
449+
runner: linux.2xlarge
450+
docker-image: executorch-ubuntu-22.04-qnn-sdk
451+
submodules: 'true'
452+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
453+
timeout: 180
454+
script: |
455+
# The generic Linux job chooses to use base env, not the one setup by the image
456+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
457+
conda activate "${CONDA_ENV}"
458+
459+
BUILD_TOOL="cmake"
460+
461+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
462+
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
463+
464+
# Setup executorch
465+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
466+
467+
# Setup install_requirements for llama
468+
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
469+
470+
# Test static llama weight sharing and accuracy
471+
PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh
472+
440473
test-qnn-models-linux:
441474
name: test-qnn-models-linux
442475
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

.lintrunner.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ exclude_patterns = [
7878
# File contains @generated
7979
'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
8080
'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h',
81+
# Want to be able to keep c10 in sync with PyTorch core.
82+
'runtime/core/portable_type/c10/**',
8183
]
8284
command = [
8385
'python',
@@ -261,6 +263,8 @@ exclude_patterns = [
261263
'extension/**',
262264
'kernels/optimized/**',
263265
'runtime/core/exec_aten/**',
266+
# Want to be able to keep c10 in sync with PyTorch core.
267+
'runtime/core/portable_type/c10/**',
264268
'runtime/executor/tensor_parser_aten.cpp',
265269
'scripts/**',
266270
'test/**',

CMakeLists.txt

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,7 @@ if(NOT "${_repo_dir_name}" STREQUAL "executorch")
373373
"fix for this restriction."
374374
)
375375
endif()
376-
set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/..)
376+
set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type)
377377

378378
#
379379
# The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
@@ -556,6 +556,7 @@ endif()
556556
target_include_directories(
557557
executorch_core PUBLIC ${_common_include_directories}
558558
)
559+
target_compile_definitions(executorch_core PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
559560
target_compile_options(executorch_core PUBLIC ${_common_compile_options})
560561
if(MAX_KERNEL_NUM)
561562
target_compile_definitions(
@@ -576,6 +577,7 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE)
576577
target_include_directories(
577578
executorch_core_shared PUBLIC ${_common_include_directories}
578579
)
580+
target_compile_definitions(executorch_core_shared PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
579581
target_compile_options(
580582
executorch_core_shared PUBLIC ${_common_compile_options}
581583
)
@@ -594,8 +596,9 @@ endif()
594596
# any backends.
595597
#
596598
add_library(executorch ${_executorch__srcs})
597-
target_link_libraries(executorch PRIVATE executorch_core)
599+
target_link_libraries(executorch PUBLIC executorch_core)
598600
target_include_directories(executorch PUBLIC ${_common_include_directories})
601+
target_compile_definitions(executorch PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
599602
target_compile_options(executorch PUBLIC ${_common_compile_options})
600603
target_link_options_shared_lib(executorch)
601604

@@ -629,6 +632,12 @@ endif()
629632

630633
# Install `executorch` library as well as `executorch-config.cmake` under
631634
# ${CMAKE_INSTALL_PREFIX}/
635+
install(DIRECTORY runtime/core/ DESTINATION include/executorch/runtime/core FILES_MATCHING PATTERN "*.h")
636+
install(DIRECTORY runtime/kernel/ DESTINATION include/executorch/runtime/kernel FILES_MATCHING PATTERN "*.h")
637+
install(DIRECTORY runtime/platform/ DESTINATION include/executorch/runtime/platform FILES_MATCHING PATTERN "*.h")
638+
install(DIRECTORY extension/kernel_util/ DESTINATION include/executorch/extension/kernel_util FILES_MATCHING PATTERN "*.h")
639+
install(DIRECTORY extension/tensor/ DESTINATION include/executorch/extension/tensor FILES_MATCHING PATTERN "*.h")
640+
install(DIRECTORY extension/threadpool/ DESTINATION include/executorch/extension/threadpool FILES_MATCHING PATTERN "*.h")
632641
install(
633642
TARGETS executorch executorch_core
634643
DESTINATION lib
@@ -792,6 +801,8 @@ if(EXECUTORCH_BUILD_PYBIND)
792801
target_include_directories(
793802
util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS}
794803
)
804+
target_compile_definitions(util PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
805+
795806
target_compile_options(util PUBLIC ${_pybind_compile_options})
796807
target_link_libraries(util PRIVATE torch c10 executorch extension_tensor)
797808

backends/apple/coreml/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,8 @@ target_include_directories(
134134
coremldelegate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util
135135
)
136136
target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..)
137+
target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/runtime/core/portable_type)
138+
target_compile_definitions(coremldelegate PRIVATE C10_USING_CUSTOM_GENERATED_MACROS)
137139
target_link_libraries(coremldelegate PRIVATE executorch_core)
138140

139141
if(EXECUTORCH_BUILD_DEVTOOLS)

backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,7 @@
830830
GCC_OPTIMIZATION_LEVEL = 0;
831831
GCC_PREPROCESSOR_DEFINITIONS = (
832832
"DEBUG=1",
833+
"C10_USING_CUSTOM_GENERATED_MACROS",
833834
"$(inherited)",
834835
);
835836
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
@@ -911,6 +912,7 @@
911912
DEVELOPMENT_TEAM = "";
912913
GCC_PREPROCESSOR_DEFINITIONS = (
913914
"DEBUG=1",
915+
"C10_USING_CUSTOM_GENERATED_MACROS",
914916
"ET_EVENT_TRACER_ENABLED=1",
915917
"$(inherited)",
916918
);
@@ -920,6 +922,7 @@
920922
"$(SRCROOT)/../kvstore",
921923
"$(SRCROOT)/../inmemoryfs",
922924
"$(SRCROOT)/../include",
925+
"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
923926
"$(SRCROOT)/../sdk",
924927
"$(SRCROOT)/../util",
925928
"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
@@ -951,6 +954,7 @@
951954
"$(SRCROOT)/../kvstore",
952955
"$(SRCROOT)/../inmemoryfs",
953956
"$(SRCROOT)/../include",
957+
"$(SRCROOT)/../include/executorch/runtime/core/portable_type",
954958
"$(SRCROOT)/../sdk",
955959
"$(SRCROOT)/../util",
956960
"$(SRCROOT)/../../third-party/nlohmann_json/single_include",

backends/arm/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ endif()
1414

1515
include(${EXECUTORCH_ROOT}/build/Utils.cmake)
1616

17-
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
17+
set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type)
18+
add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)
1819

1920
# Third-party folder and Ethos-U driver inclued
2021
set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
# Copyright 2025 Arm Limited and/or its affiliates.

# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import pytest

from executorch.backends.arm.test import common, conftest

from executorch.backends.arm.test.tester.arm_tester import ArmTester
from executorch.examples.models import deeplab_v3


class TestDl3(unittest.TestCase):
    """Tests DeepLabv3."""

    # Instantiate the model wrapper once at class-definition time; keep the
    # example inputs, then swap the wrapper for its eager nn.Module.
    dl3 = deeplab_v3.DeepLabV3ResNet50Model()
    model_inputs = dl3.get_example_inputs()
    dl3 = dl3.get_eager_model()

    @unittest.expectedFailure
    def test_dl3_tosa_MI(self):
        """Lower with the floating-point TOSA-0.80+MI profile and compare outputs."""
        stage = ArmTester(
            self.dl3,
            example_inputs=self.model_inputs,
            compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
        )
        stage = stage.export()
        stage = stage.to_edge_transform_and_lower()
        stage = stage.to_executorch()
        stage.run_method_and_compare_outputs(self.model_inputs)

    @unittest.expectedFailure
    def test_dl3_tosa_BI(self):
        """Quantize, lower with the TOSA-0.80+BI profile, and compare outputs."""
        stage = ArmTester(
            self.dl3,
            example_inputs=self.model_inputs,
            compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
        )
        stage = stage.quantize()
        stage = stage.export()
        stage = stage.to_edge_transform_and_lower()
        stage = stage.to_executorch()
        stage.run_method_and_compare_outputs(atol=1.0, qtol=1, inputs=self.model_inputs)

    @pytest.mark.slow
    @pytest.mark.corstone_fvp
    @unittest.skip
    def test_dl3_u55_BI(self):
        """Quantize and serialize for Ethos-U55; run on FVP only when enabled."""
        stage = ArmTester(
            self.dl3,
            example_inputs=self.model_inputs,
            compile_spec=common.get_u55_compile_spec(),
        )
        stage = stage.quantize()
        stage = stage.export()
        stage = stage.to_edge_transform_and_lower()
        stage = stage.to_executorch()
        tester = stage.serialize()
        if conftest.is_option_enabled("corstone_fvp"):
            tester.run_method_and_compare_outputs(
                atol=1.0, qtol=1, inputs=self.model_inputs
            )

    @pytest.mark.slow
    @pytest.mark.corstone_fvp
    @unittest.skip
    def test_dl3_u85_BI(self):
        """Quantize and serialize for Ethos-U85; run on FVP only when enabled."""
        stage = ArmTester(
            self.dl3,
            example_inputs=self.model_inputs,
            compile_spec=common.get_u85_compile_spec(),
        )
        stage = stage.quantize()
        stage = stage.export()
        stage = stage.to_edge_transform_and_lower()
        stage = stage.to_executorch()
        tester = stage.serialize()
        if conftest.is_option_enabled("corstone_fvp"):
            tester.run_method_and_compare_outputs(
                atol=1.0, qtol=1, inputs=self.model_inputs
            )

0 commit comments

Comments
 (0)