Skip to content

Commit efd6c08

Browse files
committed
Update on "[ExecuTorch] Dramatically improve op_clamp build time"
Instead of building `O(|CTYPE_IN| * |CTYPE_MIN| * |CTYPE_MAX| * |CTYPE_OUT|)` kernel code (where |T| means the number of possibilities for type T), we build `O((|CTYPE_IN| + |CTYPE_MIN| + |CTYPE_MAX| + |CTYPE_COMMON|) * |CTYPE_OUT|)` kernel code. (Concretely, `ET_SWITCH_REALHB_TYPES` has 9 possibilities, so I estimate that we went from 9**4 = 6561 template instantiations to 9 * 4 * 9 = 324 instantiations, or roughly a 20x reduction.)

Differential Revision: [D63681034](https://our.internmc.facebook.com/intern/diff/D63681034/)

[ghstack-poisoned]
2 parents 69461cb + 7b158e7 commit efd6c08

File tree

677 files changed

+4970
-7608
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

677 files changed

+4970
-7608
lines changed

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ echo "Creating tokenizer.bin"
213213
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
214214

215215

216-
RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10"
216+
RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10 --warmup=1"
217217
# Check build tool.
218218
echo "Running ${EXPORTED_MODEL_NAME} in portable mode"
219219
if [[ "${BUILD_TOOL}" == "buck2" ]]; then

.github/workflows/apple.yml

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ on:
1313
- install_requirements.sh
1414
- backends/apple/**
1515
- build/build_apple_frameworks.sh
16+
- build/build_apple_llm_demo.sh
1617
- build/create_frameworks.sh
1718
- build/test_ios_ci.sh
1819
- examples/demo-apps/apple_ios/**
@@ -215,3 +216,70 @@ jobs:
215216
shasum -a 256 "${FILENAME}"
216217
${AWS_CMD} "${FILENAME}" s3://ossci-ios/executorch/ --acl public-read
217218
done
219+
220+
build-benchmark-app:
221+
name: build-benchmark-app
222+
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
223+
secrets: inherit
224+
with:
225+
runner: macos-latest-xlarge
226+
python-version: '3.11'
227+
submodules: 'true'
228+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
229+
upload-artifact: ios-apps
230+
secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD
231+
timeout: 90
232+
script: |
233+
set -eux
234+
235+
echo "::group::Setting up CI environment"
236+
.ci/scripts/setup-conda.sh
237+
238+
BUILD_TOOL=cmake
239+
# Set up macOS dependencies, as there is currently no Docker support on macOS
240+
GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
241+
.ci/scripts/setup-macos.sh "${BUILD_TOOL}"
242+
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
243+
244+
# Set up the Apple certificate for iOS development
245+
BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64}" \
246+
BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \
247+
KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \
248+
.ci/scripts/setup-ios.sh
249+
250+
# Install CoreML Backend Requirements
251+
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
252+
backends/apple/coreml/scripts/install_requirements.sh
253+
254+
# Install MPS Backend Requirements
255+
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
256+
backends/apple/mps/install_requirements.sh
257+
echo "::endgroup::"
258+
259+
echo "::group::Build ExecuTorch iOS frameworks"
260+
FRAMEWORKS=(
261+
"executorch"
262+
"backend_coreml"
263+
"backend_mps"
264+
"backend_xnnpack"
265+
"kernels_custom"
266+
"kernels_optimized"
267+
"kernels_portable"
268+
"kernels_quantized"
269+
)
270+
271+
# Build Release iOS Frameworks
272+
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
273+
build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack
274+
275+
mkdir -p extension/apple/Benchmark/Frameworks
276+
for FRAMEWORK in "${FRAMEWORKS[@]}"; do (
277+
cp -r "cmake-out/${FRAMEWORK}.xcframework" extension/apple/Benchmark/Frameworks/
278+
) done
279+
echo "::endgroup::"
280+
281+
echo "::group::Build ExecuTorch benchmark app"
282+
mkdir -p extension/apple/Benchmark/Models
283+
${CONDA_RUN} --no-capture-output \
284+
build/build_apple_llm_demo.sh ${ARTIFACTS_DIR_NAME}
285+
echo "::endgroup::"

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
.hypothesis
22
buck-out/
3-
cmake-out/
3+
cmake-out*
4+
.DS_Store
45
cmake-android-out/
56
cmake-out-android/
67
cmake-ios-out/

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -727,7 +727,7 @@ if(EXECUTORCH_BUILD_PYBIND)
727727
util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS}
728728
)
729729
target_compile_options(util PUBLIC ${_pybind_compile_options})
730-
target_link_libraries(util PRIVATE torch c10 executorch)
730+
target_link_libraries(util PRIVATE torch c10 executorch extension_tensor)
731731

732732
# pybind portable_lib
733733
pybind11_add_module(portable_lib SHARED extension/pybindings/pybindings.cpp)

backends/arm/test/models/test_mobilenet_v2_arm.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,11 @@ def test_mv2_u55_BI(self):
100100
)
101101
if common.is_option_enabled("corstone300"):
102102
tester.run_method_and_compare_outputs(
103-
atol=1.0, qtol=1, inputs=self.model_inputs
103+
atol=1.0, qtol=1, inputs=self.model_inputs, target_board="corstone-300"
104104
)
105105

106106
def test_mv2_u85_BI(self):
107-
(
107+
tester = (
108108
ArmTester(
109109
self.mv2,
110110
example_inputs=self.model_inputs,
@@ -116,4 +116,9 @@ def test_mv2_u85_BI(self):
116116
.check(list(self.operators_after_quantization))
117117
.partition()
118118
.to_executorch()
119+
.serialize()
119120
)
121+
if common.is_option_enabled("corstone300"):
122+
tester.run_method_and_compare_outputs(
123+
atol=1.0, qtol=1, inputs=self.model_inputs, target_board="corstone-320"
124+
)

backends/arm/test/ops/test_add.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -137,16 +137,22 @@ def test_add_u55_BI(self, test_data: torch.Tensor):
137137
test_data,
138138
)
139139
if common.is_option_enabled("corstone300"):
140-
tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
140+
tester.run_method_and_compare_outputs(
141+
qtol=1, inputs=test_data, target_board="corstone-300"
142+
)
141143

142144
@parameterized.expand(Add.test_parameters)
143145
def test_add_u85_BI(self, test_data: torch.Tensor):
144146
test_data = (test_data,)
145-
self._test_add_ethos_BI_pipeline(
147+
tester = self._test_add_ethos_BI_pipeline(
146148
self.Add(),
147149
common.get_u85_compile_spec(permute_memory_to_nhwc=True),
148150
test_data,
149151
)
152+
if common.is_option_enabled("corstone300"):
153+
tester.run_method_and_compare_outputs(
154+
qtol=1, inputs=test_data, target_board="corstone-320"
155+
)
150156

151157
@parameterized.expand(Add2.test_parameters)
152158
def test_add2_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor):
@@ -165,11 +171,17 @@ def test_add2_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
165171
self.Add2(), common.get_u55_compile_spec(), test_data
166172
)
167173
if common.is_option_enabled("corstone300"):
168-
tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
174+
tester.run_method_and_compare_outputs(
175+
qtol=1, inputs=test_data, target_board="corstone-300"
176+
)
169177

170178
@parameterized.expand(Add2.test_parameters)
171179
def test_add2_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
172180
test_data = (operand1, operand2)
173-
self._test_add_ethos_BI_pipeline(
181+
tester = self._test_add_ethos_BI_pipeline(
174182
self.Add2(), common.get_u85_compile_spec(), test_data
175183
)
184+
if common.is_option_enabled("corstone300"):
185+
tester.run_method_and_compare_outputs(
186+
qtol=1, inputs=test_data, target_board="corstone-320"
187+
)

backends/arm/test/runner_utils.py

Lines changed: 79 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import json
77
import logging
88
import os
9+
import re
910
import shutil
1011
import subprocess
1112
import tempfile
@@ -176,6 +177,7 @@ def __init__(
176177
self.qp_input: list[QuantizationParams] = None
177178
self.qp_output: QuantizationParams = None
178179
self.timeout = 120
180+
self.target_board: str = None
179181

180182
self._has_init_run = False
181183

@@ -184,11 +186,17 @@ def init_run(
184186
exported_program: ExportedProgram,
185187
edge_program: ExportedProgram,
186188
is_quantized: bool,
189+
target_board: str,
187190
):
191+
192+
if target_board not in ["corstone-300", "corstone-320"]:
193+
raise RuntimeError(f"Unknown target board: {target_board}")
194+
188195
self.input_names = _get_input_names(edge_program)
189196
self.output_node = _get_output_node(exported_program)
190197
self.output_name = self.output_node.name
191198
self.is_quantized = is_quantized
199+
self.target_board = target_board
192200

193201
if is_quantized:
194202
self.qp_input = _get_input_quantization_params(exported_program)
@@ -204,7 +212,7 @@ def init_run(
204212
def set_timeout(self, timeout: int):
205213
self.timeout = timeout
206214

207-
def run_corstone300(
215+
def run_corstone(
208216
self,
209217
inputs: Tuple[torch.Tensor],
210218
) -> list[torch.Tensor]:
@@ -229,7 +237,9 @@ def run_corstone300(
229237
os.path.join(self.intermediate_path, f"{name}.bin"),
230238
)
231239
elf_path = os.path.join(
232-
"cmake-out", "arm_semihosting_executor_runner", "arm_executor_runner"
240+
"cmake-out",
241+
f"arm_semihosting_executor_runner_{self.target_board}",
242+
"arm_executor_runner",
233243
)
234244
assert os.path.exists(
235245
elf_path
@@ -239,39 +249,76 @@ def run_corstone300(
239249
for input_path in input_paths:
240250
cmd_line += f" -i {input_path}"
241251

242-
command_args = [
243-
"FVP_Corstone_SSE-300_Ethos-U55",
244-
"-C",
245-
"ethosu.num_macs=128",
246-
"-C",
247-
"mps3_board.visualisation.disable-visualisation=1",
248-
"-C",
249-
"mps3_board.telnetterminal0.start_telnet=0",
250-
"-C",
251-
"mps3_board.uart0.out_file='-'",
252-
"-C",
253-
"cpu0.CFGITCMSZ=11",
254-
"-C",
255-
"cpu0.semihosting-enable=1",
256-
"-C",
257-
"cpu0.semihosting-stack_base=0",
258-
"-C",
259-
"cpu0.semihosting-heap_limit=0",
260-
"-C",
261-
f"cpu0.semihosting-cmd_line='{cmd_line}'",
262-
"-a",
263-
elf_path,
264-
"--timelimit",
265-
f"{self.timeout}",
266-
]
267-
result = _run_cmd(command_args, check=False)
252+
command_args = {
253+
"corstone-300": [
254+
"FVP_Corstone_SSE-300_Ethos-U55",
255+
"-C",
256+
"ethosu.num_macs=128",
257+
"-C",
258+
"mps3_board.visualisation.disable-visualisation=1",
259+
"-C",
260+
"mps3_board.telnetterminal0.start_telnet=0",
261+
"-C",
262+
"mps3_board.uart0.out_file='-'",
263+
"-C",
264+
"cpu0.CFGITCMSZ=11",
265+
"-C",
266+
"cpu0.semihosting-enable=1",
267+
"-C",
268+
"cpu0.semihosting-stack_base=0",
269+
"-C",
270+
"cpu0.semihosting-heap_limit=0",
271+
"-C",
272+
f"cpu0.semihosting-cmd_line='{cmd_line}'",
273+
"-a",
274+
elf_path,
275+
"--timelimit",
276+
f"{self.timeout}",
277+
],
278+
"corstone-320": [
279+
"FVP_Corstone_SSE-320",
280+
"-C",
281+
"mps4_board.subsystem.ethosu.num_macs=128",
282+
"-C",
283+
"mps4_board.visualisation.disable-visualisation=1",
284+
"-C",
285+
"mps4_board.telnetterminal0.start_telnet=0",
286+
"-C",
287+
"mps4_board.uart0.out_file='-'",
288+
"-C",
289+
"mps4_board.uart0.unbuffered_output=1",
290+
"-C",
291+
"mps4_board.uart0.shutdown_on_eot=1",
292+
"-C",
293+
"mps4_board.subsystem.cpu0.semihosting-enable=1",
294+
"-C",
295+
"mps4_board.subsystem.cpu0.semihosting-stack_base=0",
296+
"-C",
297+
"mps4_board.subsystem.cpu0.semihosting-heap_limit=0",
298+
"-C",
299+
f"mps4_board.subsystem.cpu0.semihosting-cmd_line='{cmd_line}'",
300+
"-a",
301+
elf_path,
302+
"--timelimit",
303+
f"{self.timeout}",
304+
],
305+
}
306+
307+
result = _run_cmd(command_args[self.target_board], check=False)
308+
if result.returncode != 0:
309+
raise RuntimeError(
310+
f"Failed to run {command_args[self.target_board]}\nError: {result.stderr.decode()}"
311+
)
268312
result_stdout = result.stdout.decode()
269-
if "Hard fault" in result_stdout or len(result.stderr) > 0:
313+
314+
error_regex = r"(^[EF][: ].*$)|(^.*Hard fault.*$)|(^.*Assertion.*$)"
315+
316+
# Check for errors in the output
317+
# regex to check for error or fault messages in stdout from FVP
318+
if re.compile(error_regex, re.MULTILINE).search(result_stdout):
270319
raise RuntimeError(
271-
f"Corstone simulation failed, log: \n {result_stdout}\n{result.stderr.decode()}"
320+
f"Corstone simulation failed:\ncmd: {command_args[self.target_board]}\n, log: \n {result_stdout}\n{result.stderr.decode()}"
272321
)
273-
elif "E [" in result_stdout:
274-
logger.error(result_stdout)
275322

276323
tosa_ref_output = np.fromfile(out_path_with_suffix, dtype=np.float32)
277324
output_shape = self.output_node.args[0][0].meta["val"].shape

backends/arm/test/setup_testing.sh

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,30 @@ ethos_u_root_dir=${et_root_dir}/examples/arm/ethos-u-scratch/ethos-u
1313

1414
toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake
1515
et_build_dir=${et_root_dir}/cmake-out
16-
build_test_dir=${et_build_dir}/arm_semihosting_executor_runner
16+
build_root_test_dir=${et_build_dir}/arm_semihosting_executor_runner
1717
fvp_model=FVP_Corstone_SSE-300_Ethos-U55
1818

1919
# Build Arm Baremetal executor_runner in semihosting mode.
2020
# Put in backends/arm/test/res to be used by unit tests.
2121
function build_semihosting_executorch_runner() {
22+
target_board=$1
23+
build_test_dir=${build_root_test_dir}_${target_board}
24+
echo "[${FUNCNAME[0]}] Configuring ${target_board}"
25+
if [[ ${target_board} == "corstone-300" ]]; then
26+
local target_cpu=cortex-m55
27+
elif [[ ${target_board} == "corstone-320" ]]; then
28+
local target_cpu=cortex-m85
29+
else
30+
echo "[${FUNCNAME[0]}] ERROR: Invalid target_board specified!"
31+
exit 1
32+
fi
2233
cd ${et_root_dir}/examples/arm/executor_runner
2334
pwd
2435
mkdir -p ${build_test_dir}
2536
cmake -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake} \
26-
-DTARGET_CPU=cortex-m55 \
37+
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
38+
-DTARGET_CPU=${target_cpu} \
39+
-DTARGET_BOARD=${target_board} \
2740
-DSEMIHOSTING=ON \
2841
-DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${build_test_dir} \
2942
-B ${build_test_dir} \
@@ -40,4 +53,6 @@ function build_semihosting_executorch_runner() {
4053
find ${build_test_dir} -name "arm_executor_runner"
4154
}
4255

43-
build_semihosting_executorch_runner
56+
build_semihosting_executorch_runner corstone-300
57+
58+
build_semihosting_executorch_runner corstone-320

0 commit comments

Comments
 (0)