Commit ff8f487
Update base for Update on "Move optimized target definitions to op_registration.bzl"

^ So we can pull these definitions into codegen.bzl (we can't pull in targets.bzl files).

Differential Revision: [D74741846](https://our.internmc.facebook.com/intern/diff/D74741846/)

[ghstack-poisoned]

2 parents: 9ded0a2 + d0848ca

180 files changed (+4535, -2049 lines)

.buckconfig

Lines changed: 1 addition & 0 deletions

@@ -39,6 +39,7 @@

 [buck2]
 restarter=true
+file_watcher=notify

 [oss]
 folly_cxx_tests = False

.ci/scripts/build-qnn-sdk.sh

Lines changed: 1 addition & 2 deletions

@@ -39,8 +39,7 @@ set_up_aot() {
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-    -DPYTHON_EXECUTABLE=python3 \
-    -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF
+    -DPYTHON_EXECUTABLE=python3
   cmake --build $PWD --target "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j$(nproc)
   # install Python APIs to correct import path
   # The filename might vary depending on your Python and host version.

.github/workflows/trunk.yml

Lines changed: 68 additions & 10 deletions

@@ -555,11 +555,11 @@ jobs:
     strategy:
       matrix:
         hf_model_id: [
-          google/gemma-2-2b,
-          Qwen/Qwen2.5-0.5B,
+          google/gemma-3-1b-it,
+          Qwen/Qwen3-0.6B,
           HuggingFaceTB/SmolLM2-135M,
           meta-llama/Llama-3.2-1B,
-          allenai/OLMo-1B-hf
+          allenai/OLMo-1B-hf,
         ]
       fail-fast: false
     with:

@@ -569,44 +569,102 @@
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
+      upload-artifact: profiling-artifacts-${{ strategy.job-index }}
       script: |
         echo "::group::Set up ExecuTorch"
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        # Build executor_runner with ETdump enabled
+        PYTHON_EXECUTABLE=python cmake -DPYTHON_EXECUTABLE=python \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DEXECUTORCH_ENABLE_LOGGING=1 \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+          -DEXECUTORCH_BUILD_XNNPACK=ON \
+          -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+          -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+          -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+          -DEXECUTORCH_BUILD_DEVTOOLS=ON \
+          -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+          -Bcmake-out .
+        cmake --build cmake-out -j16 --target install --config Release
         echo "::endgroup::"

         echo "::group::Set up Hugging Face"
         pip install -U "huggingface_hub[cli]"
         huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         git clone https://github.com/huggingface/optimum-executorch
-        cd optimum-executorch
+        pushd optimum-executorch
         # There is no release yet, for CI stability, always test from the same commit on main
-        git checkout 577a2b19670e4c643a5c6ecb09bf47b9a699e7c6
+        git checkout da80c9e35b3db5c7eea8731b7d660482fb4870a8
         pip install .[tests]
+        popd
+
+        if [ "${{ matrix.hf_model_id }}" == "google/gemma-3-1b-it" ]; then
+          # Fixes for gemma-3 is not available in the released version
+          git clone https://github.com/huggingface/transformers.git
+          pushd transformers
+          git checkout a57274466f7f72efaa2662d1738cdaf28ae8071f
+          pip install -e .
+          popd
+        fi
         pip list
         echo "::endgroup::"

-        echo "::group::Export and Run ${{ matrix.hf_model_id }}"
+        echo "::group::Export to ExecuTorch"
         # Pass matrix variable as environment variable
         export MODEL_ID="${{ matrix.hf_model_id }}"
+        export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_8da4w"
+        pushd optimum-executorch
+
+        optimum-cli export executorch \
+          --model ${MODEL_ID} \
+          --task text-generation \
+          --recipe xnnpack \
+          --use_custom_sdpa \
+          --output_dir ${OUTPUT_DIR} \
+          --qlinear
+
+        ls -FlAGhp ${OUTPUT_DIR}
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Inference using python API"
+        pushd optimum-executorch
         python -c "
         import os
         from optimum.executorch import ExecuTorchModelForCausalLM
         from transformers import AutoTokenizer

         model_id = os.getenv('MODEL_ID')
-        print(f'Loading model: {model_id}')
-        model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe='xnnpack')
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        pte_dir = os.getenv('OUTPUT_DIR')
+        print(f'Loading model {model_id} from {pte_dir}.')
+        model = ExecuTorchModelForCausalLM.from_pretrained(pte_dir)
         generated_text = model.text_generation(
-            tokenizer=tokenizer,
+            tokenizer=AutoTokenizer.from_pretrained(model_id),
             prompt='Simply put, the theory of relativity states that',
             max_seq_len=64
         )
         print(generated_text)
         "
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Inference using executor_runner with ETDump"
+        ./cmake-out/executor_runner \
+          --model_path ${OUTPUT_DIR}/model.pte \
+          --etdump_path ${OUTPUT_DIR}/etdump.etdp
+
+        export TSV_PATH=artifacts-to-be-uploaded/${MODEL_ID}_op_prof.tsv
+        mkdir -p $(dirname "$TSV_PATH")
+        python3 -m devtools.inspector.inspector_cli \
+          --etdump_path ${OUTPUT_DIR}/etdump.etdp \
+          --tsv_path ${TSV_PATH}
+
         echo "::endgroup::"
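For context, the per-operator profile that the inspector_cli step writes to TSV can also be produced from Python. A minimal sketch, assuming the ExecuTorch devtools Inspector API; the path mirrors the workflow step above and is illustrative rather than taken from the commit:

    # Minimal sketch: parse the ETDump produced by executor_runner and print a
    # per-operator profiling table. Assumes the executorch.devtools Inspector API;
    # the etdump location below mirrors the workflow above and is illustrative.
    import os

    from executorch.devtools import Inspector

    etdump_path = os.path.join(os.environ.get("OUTPUT_DIR", "."), "etdump.etdp")

    inspector = Inspector(etdump_path=etdump_path)  # an ETRecord is optional here
    inspector.print_data_tabular()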

CMakeLists.txt

Lines changed: 13 additions & 10 deletions

@@ -48,21 +48,33 @@ project(executorch)
 # MARK: - Start EXECUTORCH_H12025_BUILD_MIGRATION --------------------------------------------------

 include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake)
+include(${PROJECT_SOURCE_DIR}/tools/cmake/Utils.cmake)
+include(CMakeDependentOption)
+include(ExternalProject)

 if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
 endif()
 announce_configured_options(CMAKE_CXX_STANDARD)

+if(NOT CMAKE_SYSTEM_PROCESSOR)
+  set(CMAKE_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR})
+endif()
+announce_configured_options(CMAKE_SYSTEM_PROCESSOR)
+
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Debug)
 endif()
 announce_configured_options(CMAKE_BUILD_TYPE)

+if(NOT PYTHON_EXECUTABLE)
+  resolve_python_executable()
+endif()
+announce_configured_options(PYTHON_EXECUTABLE)
+
 announce_configured_options(CMAKE_CXX_COMPILER_ID)
 announce_configured_options(CMAKE_TOOLCHAIN_FILE)
 announce_configured_options(BUCK2)
-announce_configured_options(PYTHON_EXECUTABLE)

 load_build_preset()
 include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)

@@ -72,10 +84,6 @@ print_configured_options()

 # MARK: - End EXECUTORCH_H12025_BUILD_MIGRATION ----------------------------------------------------

-include(tools/cmake/Utils.cmake)
-include(CMakeDependentOption)
-include(ExternalProject)
-
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

 # Setup RPATH.

@@ -251,11 +259,6 @@ if(EXECUTORCH_BUILD_TESTS)
   include(CTest)
 endif()

-if(NOT PYTHON_EXECUTABLE)
-  resolve_python_executable()
-endif()
-message(STATUS "Using python executable '${PYTHON_EXECUTABLE}'")
-
 # TODO(dbort): Fix these warnings and remove this flag.
 set(_common_compile_options -Wno-deprecated-declarations -fPIC)

backends/apple/mps/CMakeLists.txt

Lines changed: 0 additions & 4 deletions

@@ -18,10 +18,6 @@ endif()

 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

-if(NOT PYTHON_EXECUTABLE)
-  resolve_python_executable()
-endif()
-
 set(_common_compile_options -Wno-deprecated-declarations)
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)

backends/arm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -8,6 +8,7 @@
 from .annotate_channels_last_dim_order_pass import AnnotateChannelsLastDimOrder  # noqa
 from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass  # noqa
 from .arm_pass import ArmPass  # noqa
+from .broadcast_args_pass import BroadcastArgsPass  # noqa
 from .cast_int64_pass import CastInt64BuffersToInt32Pass  # noqa
 from .cast_to_int32_pass import CastToInt32Pass  # noqa
 from .conv1d_unsqueeze_pass import Conv1dUnsqueezePass  # noqa

@@ -24,6 +25,7 @@
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
+from .decompose_linalg_vector_norm_pass import DecomposeLinearVectorNormPass  # noqa
 from .decompose_linear_pass import DecomposeLinearPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 10 additions & 3 deletions

@@ -10,6 +10,7 @@
 from executorch.backends.arm._passes import (
     AnnotateChannelsLastDimOrder,
     AnnotateDecomposedMatmulPass,
+    BroadcastArgsPass,
     CastInt64BuffersToInt32Pass,
     CastToInt32Pass,
     ComputeConstantOpsAOT,

@@ -29,6 +30,7 @@
     DecomposeLayerNormPass,
     DecomposeLeakyReLUPass,
     DecomposeLinearPass,
+    DecomposeLinearVectorNormPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
     DecomposeSelectPass,

@@ -59,7 +61,7 @@
     UnsqueezeScalarPlaceholdersPass,
 )

-from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
+from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.transforms.decompose_sdpa import (
     DecomposeScaledDotProductAttention,
 )

@@ -86,13 +88,14 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(ConvertFullLikeToFullPass())
         self.add_pass(ConvertToClampPass())
         self.add_pass(ConvertMinMaxPass())
         self.add_pass(ConvertAnyDefaultDimDimsPass())
         self.add_pass(MatchWhereSelfDtypePass())
-        if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset:
+        if self.tosa_spec.is_U55_subset:
             self.add_pass(CastToInt32Pass())

         self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())

@@ -102,6 +105,8 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
+        if self.tosa_spec.is_U55_subset:
+            self.add_pass(BroadcastArgsPass())
         self.add_pass(ComputeConstantOpsAOT(exported_program))

         self.add_pass(RemoveClonePass())

@@ -133,6 +138,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(FuseBatchnorm2DPass(exported_program))
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeBatchNormPass())
         self.add_pass(DecomposeLayerNormPass())

@@ -207,10 +213,11 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeCosineSimilarityPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeLeakyReLUPass())
+        self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeSqrtPass())
         self.add_pass(DecomposeSiluPass())

-        if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset:
+        if self.tosa_spec.is_U55_subset:
             # Numerically stable softmax uses amax which is not supported on Ethos-U55
             self.add_pass(DecomposeSoftmaxUnstablePass())
         else:
backends/arm/_passes/broadcast_args_pass.py (new file)

Lines changed: 63 additions & 0 deletions

# Copyright 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from executorch.backends.arm._passes import ArmPass

from executorch.backends.arm._passes.arm_pass_utils import (
    create_node,
    get_first_fake_tensor,
)

from executorch.exir.dialects._ops import ops as exir_ops

from executorch.exir.pass_base import PassResult
from torch.fx import GraphModule, Node


class BroadcastArgsPass(ArmPass):
    """
    Pass to manually broadcast arguments by inserting repeats.
    This is done when more than one arg needs broadcasting.
    """

    targeted_ops = {
        exir_ops.edge.aten.add.Tensor,
        exir_ops.edge.aten.sub.Tensor,
        # mul is indirectly targeting div, as div is decomposed to reciprocal + mul
        exir_ops.edge.aten.mul.Tensor,
    }

    def call(self, graph_module: GraphModule) -> PassResult:
        for node in graph_module.graph.nodes:
            if node.op != "call_function" or node.target not in self.targeted_ops:
                continue

            output_shape = get_first_fake_tensor(node).shape
            nbr_of_broadcasts = 0
            for arg in node.args:
                if not isinstance(arg, Node):
                    continue

                shape = get_first_fake_tensor(arg).shape
                if shape != output_shape:
                    nbr_of_broadcasts += 1
                    if nbr_of_broadcasts > 1:
                        multiples = [
                            int(output_shape[d] / shape[d])
                            for d in range(len(output_shape))
                        ]
                        with graph_module.graph.inserting_before(node):
                            repeat = create_node(
                                graph_module.graph,
                                exir_ops.edge.aten.repeat.default,
                                args=(arg, multiples),
                                kwargs={},
                                from_node=node,
                            )
                            node.replace_input_with(arg, repeat)

        graph_module.recompile()
        graph_module = super().call(graph_module).graph_module
        return PassResult(graph_module, True)
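To illustrate what the pass does, here is a hypothetical usage sketch: a toy add where both inputs differ from the output shape is exported, lowered to the edge dialect, and run through BroadcastArgsPass directly, outside the Arm pass manager. The toy module, shapes, and direct invocation are illustrative assumptions, not part of this commit:

    # Hypothetical sketch of exercising BroadcastArgsPass on a toy edge graph.
    # The module, shapes, and direct pass invocation are illustrative assumptions.
    import torch

    from executorch.backends.arm._passes import BroadcastArgsPass
    from executorch.exir import to_edge


    class Add(torch.nn.Module):
        def forward(self, x, y):
            return x + y  # output shape (2, 3); both inputs need broadcasting


    edge = to_edge(torch.export.export(Add(), (torch.rand(2, 1), torch.rand(1, 3))))
    gm = edge.exported_program().graph_module

    # ExportPass instances are callable on a GraphModule and return a PassResult.
    # Since more than one arg needs broadcasting, the second one should be
    # rewritten to go through an explicit edge repeat op.
    result = BroadcastArgsPass()(gm)
    result.graph_module.print_readable()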
