Commit edca720

Author: morelos

Update base for Update on "[ET-VK][Ops] aten.tan.default from scratch implementation"

Following the instructions here for creating a new operator from scratch, for learning purposes: https://www.internalfb.com/wiki/ExecuTorch_Vulkan_Backend/Development_0/Adding_a_New_Operator_Implementation/

The goal is to create a tan operator and its test case.

Differential Revision: [D75100188](https://our.internmc.facebook.com/intern/diff/D75100188/)

[ghstack-poisoned]

2 parents 2efb215 + 71275e5, commit edca720
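
As context for the stated goal (the tan changes themselves are not part of this base-update diff), here is a minimal sketch of the reference computation such a test case would compare a backend kernel against. It uses only stock PyTorch and does not touch the Vulkan backend:

import torch

# torch.ops.aten.tan.default is the ATen overload a backend kernel registers
# against; eager torch.tan provides the reference values a test would check.
x = torch.linspace(-1.0, 1.0, steps=8)
assert torch.allclose(torch.ops.aten.tan.default(x), torch.tan(x))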

File tree: 73 files changed, +2284 / -1544 lines

.ci/scripts/build_llama_android.sh

Lines changed: 3 additions & 13 deletions

@@ -22,18 +22,12 @@ install_executorch_and_backend_lib() {
   ANDROID_NDK=/opt/ndk
   BUCK2=buck2
   ANDROID_ABI=arm64-v8a
-  cmake -DBUCK2="${BUCK2}" \
+  cmake --preset llm \
+    -DBUCK2="${BUCK2}" \
     -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DXNNPACK_ENABLE_ARM_BF16=OFF \
     -Bcmake-android-out .

@@ -51,11 +45,7 @@ build_llama_runner() {
     -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK"/build/cmake/android.toolchain.cmake \
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DCMAKE_INSTALL_PREFIX=cmake-android-out \
-    -DCMAKE_BUILD_TYPE=Release -DPYTHON_EXECUTABLE=python \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+    -DCMAKE_BUILD_TYPE=Release \
     -Bcmake-android-out/examples/models/llama examples/models/llama

   cmake --build cmake-android-out/examples/models/llama -j4 --config Release

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 15 deletions

@@ -152,21 +152,11 @@ which "${PYTHON_EXECUTABLE}"
 cmake_install_executorch_libraries() {
   echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
   rm -rf cmake-out
-  retry cmake \
+  retry cmake --preset llm \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-    -DEXECUTORCH_BUILD_MPS="$MPS" \
-    -DEXECUTORCH_BUILD_COREML="$COREML" \
     -DEXECUTORCH_BUILD_QNN="$QNN" \
     -DQNN_SDK_ROOT="$QNN_SDK_ROOT" \
-    -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out .
   cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE"
 }

@@ -181,10 +171,6 @@ cmake_build_llama_runner() {
   retry cmake \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-    -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-    -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out/${dir} \
     ${dir}
   cmake --build cmake-out/${dir} -j9 --config "$CMAKE_BUILD_TYPE"

.github/workflows/build-presets.yml

Lines changed: 4 additions & 4 deletions

@@ -20,7 +20,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        preset: [macos-arm64, pybind]
+        preset: [macos-arm64, pybind, llm]
     with:
       job-name: build
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}

@@ -32,14 +32,14 @@ jobs:
         set -eux
         ${CONDA_RUN} ./install_requirements.sh > /dev/null
         ${CONDA_RUN} cmake --preset ${{ matrix.preset }}
-        ${CONDA_RUN} cmake --build cmake-out --parallel
+        ${CONDA_RUN} cmake --build cmake-out -j$(( $(sysctl -n hw.ncpu) - 1 ))

   linux:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     strategy:
       fail-fast: false
       matrix:
-        preset: [pybind]
+        preset: [pybind, llm]
         runner: [linux.2xlarge, linux.arm64.2xlarge]
         docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64]
         # Excluding specific runner + docker image combinations that don't make sense:

@@ -65,4 +65,4 @@ jobs:

         ./install_requirements.sh > /dev/null
         cmake --preset ${{ matrix.preset }}
-        cmake --build cmake-out --parallel
+        cmake --build cmake-out -j$(( $(nproc) - 1 ))

.github/workflows/trunk.yml

Lines changed: 26 additions & 0 deletions

@@ -692,3 +692,29 @@ jobs:
       build-mode: Release
       build-tool: cmake
       docker-image: executorch-ubuntu-22.04-clang12
+
+  unittest-nxp-neutron:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Build and install Executorch
+        PYTHON_EXECUTABLE=python \
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
+        .ci/scripts/setup-linux.sh --build-tool "cmake"
+
+        # Run pytest
+        PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh

CMakePresets.json

Lines changed: 20 additions & 0 deletions

@@ -36,6 +36,26 @@
                 "string": "${hostSystemName}",
                 "list": ["Darwin", "Linux", "Windows"]
             }
+        },
+        {
+            "name": "llm",
+            "displayName": "Build LLM libraries",
+            "inherits": [
+                "common"
+            ],
+            "cacheVariables": {
+                "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/llm.cmake",
+                "CMAKE_OSX_DEPLOYMENT_TARGET": "10.15"
+            },
+            "condition": {
+                "type": "inList",
+                "string": "${hostSystemName}",
+                "list": [
+                    "Darwin",
+                    "Linux",
+                    "Windows"
+                ]
+            }
         }
     ]
 }

backends/apple/coreml/CMakeLists.txt

Lines changed: 0 additions & 3 deletions

@@ -25,8 +25,6 @@ endif()

 option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." OFF)

-set(CMAKE_OSX_DEPLOYMENT_TARGET 10.15)
-
 # inmemoryfs sources
 set(INMEMORYFS_SOURCES
   runtime/inmemoryfs/inmemory_filesystem.cpp

@@ -240,7 +238,6 @@ if(EXECUTORCH_BUILD_COREML AND EXECUTORCH_BUILD_PYBIND)

   pybind11_add_module(executorchcoreml SHARED runtime/inmemoryfs/inmemory_filesystem_py.cpp)

-  target_compile_options(executorchcoreml PRIVATE -mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET})
   if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
     target_compile_options(executorchcoreml PRIVATE -g)
   endif()

backends/arm/scripts/install_reference_model.sh

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ tosa_reference_model_url="https://git.gitlab.arm.com/tosa/tosa-reference-model.g
 tosa_reference_model_0_80_branch="v0.80"
 tosa_reference_model_0_80_rev="70ed0b40fa831387e36abdb4f7fb9670a3464f5a"
 tosa_serialization_lib_0_80_rev="v0.80.1"
-tosa_reference_model_1_0_rev="4d17b5b960cd986d8cb8052188fbe3ae494789e8"
+tosa_reference_model_1_0_rev="d102f426dd2e3c1f25bbf23292ec8ee51aa9c677"

 script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)

backends/example/example_quantizer.py

Lines changed: 5 additions & 2 deletions

@@ -10,8 +10,11 @@
 import torch
 from executorch.backends.example.example_operators.ops import module_to_annotator
 from torch import fx
-from torchao.quantization.pt2e.graph_utils import find_sequential_partitions
-from torchao.quantization.pt2e.observer import HistogramObserver, MinMaxObserver
+from torchao.quantization.pt2e import (
+    find_sequential_partitions,
+    HistogramObserver,
+    MinMaxObserver,
+)
 from torchao.quantization.pt2e.quantizer import (
     OperatorConfig,
     QuantizationSpec,

backends/mediatek/quantizer/qconfig.py

Lines changed: 5 additions & 2 deletions

@@ -10,8 +10,11 @@

 import torch

-from torchao.quantization.pt2e.fake_quantize import FakeQuantize
-from torchao.quantization.pt2e.observer import MinMaxObserver, PerChannelMinMaxObserver
+from torchao.quantization.pt2e import (
+    FakeQuantize,
+    MinMaxObserver,
+    PerChannelMinMaxObserver,
+)
 from torchao.quantization.pt2e.quantizer import QuantizationSpec

Lines changed: 146 additions & 0 deletions (new file; the file name is hidden in this view)

# Copyright 2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from typing import Optional

import torch
from torch.export.unflatten import _assign_attr, _AttrKind
from torch.fx import GraphModule, Node
from torch.fx.passes.infra.pass_base import PassBase, PassResult
from torch.nn.parameter import Parameter
from torch.nn.utils import fuse_conv_bn_weights


class FuseBatchNormWithConvPass(PassBase):
    """The executorch batch normalization carries out the following computation [1].

        (x - mean) / sqrt(var + eps) * W + B

    Which can be expressed as

        x * (W / sqrt(var + eps)) + (B - mean * (W / sqrt(var + eps)))

    So the batch norm can be done as 1 multiplication and 1 addition, provided that the parameters are static,
    and the terms can be precomputed. If there is a `Conv` operator before the batch normalization, this scale and
    bias can be statically integrated into the weights and bias of the `Conv`, which allows the batch norm to be
    completely removed.

                      │
        ┌─────────────▼─────────────┐
        │ aten.conv1d | aten.conv2d │
        └─────────────┬─────────────┘
                      │                                            │
        ┌─────────────▼─────────────┐   replace with  ┌─────────────▼─────────────┐
        │      aten.batch_norm      │  ──────────────►│ aten.conv1d | aten.conv2d │
        └─────────────┬─────────────┘                 └─────────────┬─────────────┘
                      │                                             ▼

    [1] https://github.com/pytorch/executorch/blob/v0.5.0-rc2/kernels/portable/cpu/op_native_batch_norm.cpp#L118-L128
    """

    def _get_tensor_constant_from_node(self, graph_module, node) -> Parameter | None:
        """Get the static data from a given node. If it doesn't have any data, return `None`."""
        if node is None or node.op != "get_attr":
            return None

        target_atoms = node.target.split(".")
        attr_itr = graph_module
        for atom in target_atoms:
            if not hasattr(attr_itr, atom):
                return None
            attr_itr = getattr(attr_itr, atom)
        return attr_itr

    def call(self, graph_module: GraphModule) -> Optional[PassResult]:
        def _is_batch_norm(node_: Node) -> bool:
            return (
                node_.op == "call_function"
                and node_.target == torch.ops.aten.batch_norm.default
            )

        def _is_conv(node_: Node):
            is_conv = node_.op == "call_function" and node_.target in (
                torch.ops.aten.conv1d.default,
                torch.ops.aten.conv2d.default,
            )
            # The conv must feed only the batch norm; rewriting its weights would
            # otherwise change the result seen by any other users.
            has_single_user = len(node_.users) == 1

            return is_conv and has_single_user

        made_changes = False

        if not any(map(_is_batch_norm, graph_module.graph.nodes)):
            return PassResult(
                graph_module, made_changes
            )  # No batch norm nodes in the model.

        for node in graph_module.graph.nodes:
            if not _is_batch_norm(node):
                continue  # Not BatchNorm.

            bn_node = node

            if not _is_conv(bn_node.args[0]):
                continue  # Something other than a Conv node comes before the BatchNorm.

            conv_node = bn_node.args[0]
            conv_weight_node = conv_node.args[1]
            conv_bias_node = conv_node.args[2] if len(conv_node.args) > 2 else None

            # conv args: input, weight, bias, stride, padding, dilation, ...
            conv_w = self._get_tensor_constant_from_node(graph_module, conv_weight_node)
            conv_b = self._get_tensor_constant_from_node(graph_module, conv_bias_node)

            # batch norm args: input, weight, bias, running_mean, running_var, training, momentum, eps
            bn_w = self._get_tensor_constant_from_node(graph_module, bn_node.args[1])
            bn_b = self._get_tensor_constant_from_node(graph_module, bn_node.args[2])
            bn_rm = self._get_tensor_constant_from_node(graph_module, bn_node.args[3])
            bn_rv = self._get_tensor_constant_from_node(graph_module, bn_node.args[4])
            bn_eps = bn_node.args[7]

            if any(
                t is None for t in (conv_w, bn_rm, bn_rv)
            ):  # The other inputs can be None.
                continue  # The data is not static. Leave this BatchNorm as is (probably a rare case).

            fused_weight, fused_bias = fuse_conv_bn_weights(
                conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b
            )

            # Update the weight and bias for Conv.
            conv_args = list(conv_node.args)
            if len(conv_args) == 2:
                # Fill in the default bias argument.
                conv_args.append(None)

            weight_attr_name = conv_weight_node.target
            _assign_attr(
                fused_weight, graph_module, weight_attr_name, _AttrKind.PARAMETER
            )

            if conv_bias_node is not None:
                bias_attr_name = conv_bias_node.target
                _assign_attr(
                    fused_bias, graph_module, str(bias_attr_name), _AttrKind.PARAMETER
                )
            else:
                # The Conv doesn't have a bias. Create a new one.
                bias_attr_name = weight_attr_name + "_bias"
                _assign_attr(
                    fused_bias, graph_module, bias_attr_name, _AttrKind.PARAMETER
                )
                with graph_module.graph.inserting_before(conv_node):
                    get_bias_node = graph_module.graph.get_attr(bias_attr_name)

                conv_args[2] = get_bias_node

            conv_node.args = tuple(conv_args)

            # Replace the uses of the BatchNorm with the Conv.
            bn_node.replace_all_uses_with(conv_node)

            made_changes = True

        return PassResult(graph_module, made_changes)
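
As a sanity check on the algebra in the docstring above, here is a small, self-contained sketch (not part of the commit, assuming only a standard PyTorch install) showing that torch.nn.utils.fuse_conv_bn_weights, the same helper the pass calls, yields a conv whose output matches conv followed by batch norm in eval mode:

import torch
from torch.nn.utils import fuse_conv_bn_weights

conv = torch.nn.Conv2d(3, 8, kernel_size=3, bias=True).eval()
bn = torch.nn.BatchNorm2d(8).eval()
# Give the BN non-trivial statistics so the comparison is meaningful.
bn.running_mean.uniform_(-1.0, 1.0)
bn.running_var.uniform_(0.5, 1.5)
bn.weight.data.uniform_(0.5, 1.5)
bn.bias.data.uniform_(-1.0, 1.0)

# Fold the BN scale and shift into the conv parameters, as the pass does.
fused = torch.nn.Conv2d(3, 8, kernel_size=3, bias=True).eval()
fused.weight, fused.bias = fuse_conv_bn_weights(
    conv.weight, conv.bias, bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias
)

x = torch.randn(2, 3, 16, 16)
assert torch.allclose(bn(conv(x)), fused(x), atol=1e-5)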
