Commit 48d06e4

Update base for Update on "Dtype selective build for optimized ops"
Add dtype selective build for optimized ops. This follows the same process as the portable kernels: we copy the source files and rebuild the library.

1. Generalize the copy genrule for portable/optimized source and header files.
2. Copy the optimized source files and headers.
3. Build the optimized ops from the copied sources, their dependencies, and the portable headers.
4. Add a test confirming that addmul runs with float dtypes (when the dtype is removed from the selection, the test fails).

Differential Revision: [D74688554](https://our.internmc.facebook.com/intern/diff/D74688554/)

[ghstack-poisoned]
2 parents: 77c6fb0 + d0848ca
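As context for the test described in step 4, the idea behind dtype selective build can be shown with a toy sketch. This is plain Python, not the actual ExecuTorch genrule or selective-build machinery, and the op names and the SELECTED table below are hypothetical: only the (op, dtype) pairs kept by the selection end up in the rebuilt kernel library, so invoking a kernel with an unselected dtype fails.

```python
# Toy illustration of dtype selective build (not the ExecuTorch build machinery).
# The library keeps only the (op, dtype) pairs selected at build time; an op
# invoked with an unselected dtype fails, which is the behavior the new test
# checks for the optimized add/mul kernels.

SELECTED = {
    "aten::add.out": {"float"},
    "aten::mul.out": {"float"},
}

def dispatch(op: str, dtype: str) -> str:
    """Resolve an op/dtype pair against the selected build, or fail."""
    if dtype not in SELECTED.get(op, set()):
        raise RuntimeError(f"kernel {op} was not built for dtype {dtype}")
    return f"running {op} with the {dtype} kernel"

print(dispatch("aten::add.out", "float"))    # works: float was selected
try:
    dispatch("aten::add.out", "double")      # fails: double was not selected
except RuntimeError as e:
    print(e)
```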

180 files changed (+4656 / -2163 lines)


.buckconfig

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@
 
 [buck2]
 restarter=true
+file_watcher=notify
 
 [oss]
 folly_cxx_tests = False

.ci/scripts/build-qnn-sdk.sh

Lines changed: 1 addition & 2 deletions
@@ -39,8 +39,7 @@ set_up_aot() {
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-    -DPYTHON_EXECUTABLE=python3 \
-    -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF
+    -DPYTHON_EXECUTABLE=python3
   cmake --build $PWD --target "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j$(nproc)
   # install Python APIs to correct import path
   # The filename might vary depending on your Python and host version.

.github/workflows/trunk.yml

Lines changed: 68 additions & 10 deletions
@@ -555,11 +555,11 @@ jobs:
     strategy:
       matrix:
         hf_model_id: [
-          google/gemma-2-2b,
-          Qwen/Qwen2.5-0.5B,
+          google/gemma-3-1b-it,
+          Qwen/Qwen3-0.6B,
           HuggingFaceTB/SmolLM2-135M,
           meta-llama/Llama-3.2-1B,
-          allenai/OLMo-1B-hf
+          allenai/OLMo-1B-hf,
         ]
       fail-fast: false
     with:
@@ -569,44 +569,102 @@
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
+      upload-artifact: profiling-artifacts-${{ strategy.job-index }}
       script: |
         echo "::group::Set up ExecuTorch"
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        # Build executor_runner with ETDump enabled
+        PYTHON_EXECUTABLE=python cmake -DPYTHON_EXECUTABLE=python \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DEXECUTORCH_ENABLE_LOGGING=1 \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+          -DEXECUTORCH_BUILD_XNNPACK=ON \
+          -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+          -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+          -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+          -DEXECUTORCH_BUILD_DEVTOOLS=ON \
+          -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+          -Bcmake-out .
+        cmake --build cmake-out -j16 --target install --config Release
         echo "::endgroup::"
 
         echo "::group::Set up Hugging Face"
         pip install -U "huggingface_hub[cli]"
         huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         git clone https://github.com/huggingface/optimum-executorch
-        cd optimum-executorch
+        pushd optimum-executorch
         # There is no release yet, for CI stability, always test from the same commit on main
-        git checkout 577a2b19670e4c643a5c6ecb09bf47b9a699e7c6
+        git checkout da80c9e35b3db5c7eea8731b7d660482fb4870a8
         pip install .[tests]
+        popd
+
+        if [ "${{ matrix.hf_model_id }}" == "google/gemma-3-1b-it" ]; then
+          # Fixes for gemma-3 are not available in the released version
+          git clone https://github.com/huggingface/transformers.git
+          pushd transformers
+          git checkout a57274466f7f72efaa2662d1738cdaf28ae8071f
+          pip install -e .
+          popd
+        fi
         pip list
         echo "::endgroup::"
 
-        echo "::group::Export and Run ${{ matrix.hf_model_id }}"
+        echo "::group::Export to ExecuTorch"
         # Pass matrix variable as environment variable
         export MODEL_ID="${{ matrix.hf_model_id }}"
+        export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_8da4w"
+        pushd optimum-executorch
+
+        optimum-cli export executorch \
+          --model ${MODEL_ID} \
+          --task text-generation \
+          --recipe xnnpack \
+          --use_custom_sdpa \
+          --output_dir ${OUTPUT_DIR} \
+          --qlinear
+
+        ls -FlAGhp ${OUTPUT_DIR}
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Inference using python API"
+        pushd optimum-executorch
         python -c "
         import os
         from optimum.executorch import ExecuTorchModelForCausalLM
         from transformers import AutoTokenizer
 
        model_id = os.getenv('MODEL_ID')
-        print(f'Loading model: {model_id}')
-        model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe='xnnpack')
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        pte_dir = os.getenv('OUTPUT_DIR')
+        print(f'Loading model {model_id} from {pte_dir}.')
+        model = ExecuTorchModelForCausalLM.from_pretrained(pte_dir)
         generated_text = model.text_generation(
-            tokenizer=tokenizer,
+            tokenizer=AutoTokenizer.from_pretrained(model_id),
             prompt='Simply put, the theory of relativity states that',
             max_seq_len=64
         )
         print(generated_text)
         "
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Inference using executor_runner with ETDump"
+        ./cmake-out/executor_runner \
+          --model_path ${OUTPUT_DIR}/model.pte \
+          --etdump_path ${OUTPUT_DIR}/etdump.etdp
+
+        export TSV_PATH=artifacts-to-be-uploaded/${MODEL_ID}_op_prof.tsv
+        mkdir -p $(dirname "$TSV_PATH")
+        python3 -m devtools.inspector.inspector_cli \
+          --etdump_path ${OUTPUT_DIR}/etdump.etdp \
+          --tsv_path ${TSV_PATH}
+
         echo "::endgroup::"
 

CMakeLists.txt

Lines changed: 13 additions & 10 deletions
@@ -48,21 +48,33 @@ project(executorch)
 # MARK: - Start EXECUTORCH_H12025_BUILD_MIGRATION --------------------------------------------------
 
 include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake)
+include(${PROJECT_SOURCE_DIR}/tools/cmake/Utils.cmake)
+include(CMakeDependentOption)
+include(ExternalProject)
 
 if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
 endif()
 announce_configured_options(CMAKE_CXX_STANDARD)
 
+if(NOT CMAKE_SYSTEM_PROCESSOR)
+  set(CMAKE_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR})
+endif()
+announce_configured_options(CMAKE_SYSTEM_PROCESSOR)
+
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Debug)
 endif()
 announce_configured_options(CMAKE_BUILD_TYPE)
 
+if(NOT PYTHON_EXECUTABLE)
+  resolve_python_executable()
+endif()
+announce_configured_options(PYTHON_EXECUTABLE)
+
 announce_configured_options(CMAKE_CXX_COMPILER_ID)
 announce_configured_options(CMAKE_TOOLCHAIN_FILE)
 announce_configured_options(BUCK2)
-announce_configured_options(PYTHON_EXECUTABLE)
 
 load_build_preset()
 include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
@@ -72,10 +84,6 @@ print_configured_options()
 
 # MARK: - End EXECUTORCH_H12025_BUILD_MIGRATION ----------------------------------------------------
 
-include(tools/cmake/Utils.cmake)
-include(CMakeDependentOption)
-include(ExternalProject)
-
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 # Setup RPATH.
@@ -251,11 +259,6 @@ if(EXECUTORCH_BUILD_TESTS)
   include(CTest)
 endif()
 
-if(NOT PYTHON_EXECUTABLE)
-  resolve_python_executable()
-endif()
-message(STATUS "Using python executable '${PYTHON_EXECUTABLE}'")
-
 # TODO(dbort): Fix these warnings and remove this flag.
 set(_common_compile_options -Wno-deprecated-declarations -fPIC)
 

backends/apple/mps/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
@@ -18,10 +18,6 @@ endif()
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 
-if(NOT PYTHON_EXECUTABLE)
-  resolve_python_executable()
-endif()
-
 set(_common_compile_options -Wno-deprecated-declarations)
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 
backends/arm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,7 @@
 from .annotate_channels_last_dim_order_pass import AnnotateChannelsLastDimOrder  # noqa
 from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass  # noqa
 from .arm_pass import ArmPass  # noqa
+from .broadcast_args_pass import BroadcastArgsPass  # noqa
 from .cast_int64_pass import CastInt64BuffersToInt32Pass  # noqa
 from .cast_to_int32_pass import CastToInt32Pass  # noqa
 from .conv1d_unsqueeze_pass import Conv1dUnsqueezePass  # noqa
@@ -24,6 +25,7 @@
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
+from .decompose_linalg_vector_norm_pass import DecomposeLinearVectorNormPass  # noqa
 from .decompose_linear_pass import DecomposeLinearPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 10 additions & 3 deletions
@@ -10,6 +10,7 @@
 from executorch.backends.arm._passes import (
     AnnotateChannelsLastDimOrder,
     AnnotateDecomposedMatmulPass,
+    BroadcastArgsPass,
     CastInt64BuffersToInt32Pass,
     CastToInt32Pass,
     ComputeConstantOpsAOT,
@@ -29,6 +30,7 @@
     DecomposeLayerNormPass,
     DecomposeLeakyReLUPass,
     DecomposeLinearPass,
+    DecomposeLinearVectorNormPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
     DecomposeSelectPass,
@@ -59,7 +61,7 @@
     UnsqueezeScalarPlaceholdersPass,
 )
 
-from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
+from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.transforms.decompose_sdpa import (
     DecomposeScaledDotProductAttention,
 )
@@ -86,13 +88,14 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(ConvertFullLikeToFullPass())
         self.add_pass(ConvertToClampPass())
         self.add_pass(ConvertMinMaxPass())
         self.add_pass(ConvertAnyDefaultDimDimsPass())
         self.add_pass(MatchWhereSelfDtypePass())
-        if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset:
+        if self.tosa_spec.is_U55_subset:
             self.add_pass(CastToInt32Pass())
 
         self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
@@ -102,6 +105,8 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
+        if self.tosa_spec.is_U55_subset:
+            self.add_pass(BroadcastArgsPass())
         self.add_pass(ComputeConstantOpsAOT(exported_program))
 
         self.add_pass(RemoveClonePass())
@@ -133,6 +138,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(FuseBatchnorm2DPass(exported_program))
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeBatchNormPass())
         self.add_pass(DecomposeLayerNormPass())
@@ -207,10 +213,11 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeCosineSimilarityPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeLeakyReLUPass())
+        self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(DecomposeSqrtPass())
         self.add_pass(DecomposeSiluPass())
 
-        if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset:
+        if self.tosa_spec.is_U55_subset:
             # Numerically stable softmax uses amax which is not supported on Ethos-U55
             self.add_pass(DecomposeSoftmaxUnstablePass())
         else:
backends/arm/_passes/broadcast_args_pass.py (new file)

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.arm._passes import ArmPass
+
+from executorch.backends.arm._passes.arm_pass_utils import (
+    create_node,
+    get_first_fake_tensor,
+)
+
+from executorch.exir.dialects._ops import ops as exir_ops
+
+from executorch.exir.pass_base import PassResult
+from torch.fx import GraphModule, Node
+
+
+class BroadcastArgsPass(ArmPass):
+    """
+    Pass to manually broadcast arguments by inserting repeats.
+    This is done when more than one arg needs broadcasting.
+    """
+
+    targeted_ops = {
+        exir_ops.edge.aten.add.Tensor,
+        exir_ops.edge.aten.sub.Tensor,
+        # mul is indirectly targeting div as div is decomposed to reciprocal + mul
+        exir_ops.edge.aten.mul.Tensor,
+    }
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        for node in graph_module.graph.nodes:
+            if node.op != "call_function" or node.target not in self.targeted_ops:
+                continue
+
+            output_shape = get_first_fake_tensor(node).shape
+            nbr_of_broadcasts = 0
+            for arg in node.args:
+                if not isinstance(arg, Node):
+                    continue
+
+                shape = get_first_fake_tensor(arg).shape
+                if shape != output_shape:
+                    nbr_of_broadcasts += 1
+                    if nbr_of_broadcasts > 1:
+                        multiples = [
+                            int(output_shape[d] / shape[d])
+                            for d in range(len(output_shape))
+                        ]
+                        with graph_module.graph.inserting_before(node):
+                            repeat = create_node(
+                                graph_module.graph,
+                                exir_ops.edge.aten.repeat.default,
+                                args=(arg, multiples),
+                                kwargs={},
+                                from_node=node,
+                            )
+                            node.replace_input_with(arg, repeat)
+
+        graph_module.recompile()
+        graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, True)
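To make the effect of the new pass concrete: when more than one input of an add/sub/mul needs broadcasting to the output shape, each input after the first is materialized with an explicit aten.repeat whose multiples are output_shape[d] / arg_shape[d]. Below is a minimal torch-level sketch of that same arithmetic; it is illustrative only, using eager tensors and torch.broadcast_shapes rather than the pass's fake-tensor shapes and FX graph rewrite.

```python
import torch

# Two inputs that both need broadcasting to the (2, 3) output shape.
a = torch.ones(2, 1)   # broadcasts along dim 1
b = torch.ones(1, 3)   # broadcasts along dim 0
out_shape = torch.broadcast_shapes(a.shape, b.shape)  # torch.Size([2, 3])

# The second broadcasting input is repeated explicitly, mirroring the
# multiples computed in BroadcastArgsPass, so only one implicit broadcast
# is left for the backend to handle.
multiples = [out_shape[d] // b.shape[d] for d in range(len(out_shape))]  # [2, 1]
b_repeated = b.repeat(multiples)  # shape (2, 3)

assert torch.equal(a + b, a + b_repeated)
```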
