
Commit 2407b37

Merge branch 'pytorch:main' into op_softmax
2 parents: 673d825 + 6b858f2

178 files changed (+1998 lines, -1354 lines)

.ci/scripts/build_llama_android.sh

Lines changed: 2 additions & 2 deletions
@@ -48,9 +48,9 @@ build_llama_runner() {
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-    -Bcmake-android-out/examples/models/llama2 examples/models/llama2
+    -Bcmake-android-out/examples/models/llama examples/models/llama

-  cmake --build cmake-android-out/examples/models/llama2 -j4 --config Release
+  cmake --build cmake-android-out/examples/models/llama -j4 --config Release
 }
 install_flatc_from_source
 install_executorch_and_backend_lib

.ci/scripts/test_llama.sh

Lines changed: 4 additions & 4 deletions
@@ -125,7 +125,7 @@ cmake_install_executorch_libraries() {

 cmake_build_llama_runner() {
   echo "Building llama runner"
-  dir="examples/models/llama2"
+  dir="examples/models/llama"
   retry cmake \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE=Debug \
@@ -206,7 +206,7 @@ if [[ "${QNN}" == "ON" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
 fi
 # Add dynamically linked library location
-$PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
+$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}

 # Create tokenizer.bin.
 echo "Creating tokenizer.bin"
@@ -219,15 +219,15 @@ echo "Running ${EXPORTED_MODEL_NAME} in portable mode"
 if [[ "${BUILD_TOOL}" == "buck2" ]]; then
   # Run model.
   # shellcheck source=/dev/null
-  $BUCK run examples/models/llama2:main -- ${RUNTIME_ARGS} > result.txt
+  $BUCK run examples/models/llama:main -- ${RUNTIME_ARGS} > result.txt
 elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
   cmake_install_executorch_libraries
   cmake_build_llama_runner
   # Run llama runner
   NOW=$(date +"%H:%M:%S")
   echo "Starting to run llama runner at ${NOW}"
   # shellcheck source=/dev/null
-  cmake-out/examples/models/llama2/llama_main ${RUNTIME_ARGS} > result.txt
+  cmake-out/examples/models/llama/llama_main ${RUNTIME_ARGS} > result.txt
   NOW=$(date +"%H:%M:%S")
   echo "Finished at ${NOW}"
 else

.ci/scripts/test_model.sh

Lines changed: 3 additions & 3 deletions
@@ -75,9 +75,9 @@ run_portable_executor_runner() {
 test_model() {
   if [[ "${MODEL_NAME}" == "llama2" ]]; then
     # Install requirements for export_llama
-    bash examples/models/llama2/install_requirements.sh
-    # Test export_llama script: python3 -m examples.models.llama2.export_llama
-    "${PYTHON_EXECUTABLE}" -m examples.models.llama2.export_llama -c examples/models/llama2/params/demo_rand_params.pth -p examples/models/llama2/params/demo_config.json
+    bash examples/models/llama/install_requirements.sh
+    # Test export_llama script: python3 -m examples.models.llama.export_llama
+    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json
     run_portable_executor_runner
     rm "./${MODEL_NAME}.pte"
   fi

.github/scripts/extract_benchmark_results.py

Lines changed: 48 additions & 26 deletions
@@ -9,7 +9,6 @@
 import logging
 import os
 import re
-import time
 import zipfile
 from argparse import Action, ArgumentParser, Namespace
 from io import BytesIO
@@ -26,12 +25,15 @@

 # iOS-related regexes and variables
 IOS_TEST_SPEC_REGEX = re.compile(
-    r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>\w+)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+),"
+    r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>[\w\+]+)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+),"
 )
 IOS_TEST_NAME_REGEX = re.compile(
-    r"test_(?P<method>forward|load|generate)_(?P<model_name>\w+)_pte.*iOS_(?P<ios_ver>\w+)_iPhone(?P<iphone_ver>\w+)"
+    r"test_(?P<method>forward|load|generate)_(?P<model_name>[\w\+]+)_pte.*iOS_(?P<ios_ver>\w+)_iPhone(?P<iphone_ver>\w+)"
+)
+# The backend name could contain +, i.e. tinyllama_xnnpack+custom+qe_fp32
+IOS_MODEL_NAME_REGEX = re.compile(
+    r"(?P<model>[^_]+)_(?P<backend>[\w\+]+)_(?P<dtype>\w+)"
 )
-IOS_MODEL_NAME_REGEX = re.compile(r"(?P<model>[^_]+)_(?P<backend>\w+)_(?P<dtype>\w+)")


 class ValidateArtifacts(Action):
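
A quick way to see what the widened `[\w\+]+` groups buy: the sketch below runs the two updated patterns against a made-up xcresult log line and model name (both invented, but shaped like the `tinyllama_xnnpack+custom+qe_fp32` example in the comment above).

```python
import re

# Same patterns as in the diff above
IOS_TEST_SPEC_REGEX = re.compile(
    r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>[\w\+]+)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+),"
)
IOS_MODEL_NAME_REGEX = re.compile(r"(?P<model>[^_]+)_(?P<backend>[\w\+]+)_(?P<dtype>\w+)")

# Made-up log line; with the old \w+ test-name group, the "+" signs
# would have prevented any match at all.
line = (
    "Test Case '-[Benchmark test_generate_tinyllama_xnnpack+custom+qe_fp32_pte_iOS_17_5_iPhone15_4]'"
    " measured [Tokens Per Second, t/s] average: 21.5, relative standard deviation: 1.2%"
)
m = IOS_TEST_SPEC_REGEX.search(line)
assert m and m.group("metric") == "Tokens Per Second, t/s"

# model_backend_dtype: backtracking splits on the last "_", so "+" stays in the backend
name = IOS_MODEL_NAME_REGEX.match("tinyllama_xnnpack+custom+qe_fp32")
assert (name.group("model"), name.group("backend"), name.group("dtype")) == (
    "tinyllama", "xnnpack+custom+qe", "fp32"
)
```
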
@@ -159,19 +161,8 @@ def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
     ios_ver = m.group("ios_ver").replace("_", ".")
     iphone_ver = m.group("iphone_ver").replace("_", ".")

-    # NB: This looks brittle, but unless we can return iOS benchmark results in JSON
-    # format by the test, the mapping is needed to match with Android test
-    if method == "load":
-        metric = "model_load_time(ms)"
-    elif method == "forward":
-        metric = (
-            "generate_time(ms)"
-            if "llama" in model_name
-            else "avg_inference_latency(ms)"
-        )
-    elif method == "generate":
-        metric = "token_per_sec"
-
+    # The default backend and quantization dtype if the script couldn't extract
+    # them from the model name
     backend = ""
     quantization = "unknown"

@@ -194,8 +185,9 @@ def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
             "availMem": 0,
             "totalMem": 0,
         },
-        "metric": metric,
+        "method": method,
         # These fields will be populated later by extract_ios_metric
+        "metric": "",
         "actualValue": 0,
         "targetValue": 0,
     }
@@ -210,10 +202,38 @@ def extract_ios_metric(
     """
     Map the metric name from iOS xcresult to the benchmark result
     """
-    if metric_name == "Clock Monotonic Time, s":
-        # The benchmark value is in ms
-        benchmark_result["actualValue"] = metric_value * 1000
-    elif metric_name == "Tokens Per Second, t/s":
+    method = benchmark_result.get("method", "")
+    if not method:
+        return benchmark_result
+
+    # NB: This looks brittle, but unless we can return iOS benchmark results in JSON
+    # format by the test, the mapping is needed to match with Android test
+    if method == "load":
+        if metric_name == "Clock Monotonic Time, s":
+            benchmark_result["metric"] = "model_load_time(ms)"
+            benchmark_result["actualValue"] = metric_value * 1000
+
+        elif metric_name == "Memory Peak Physical, kB":
+            # NB: Showing the value in MB is friendlier IMO
+            benchmark_result["metric"] = "peak_load_mem_usage(mb)"
+            benchmark_result["actualValue"] = metric_value / 1024
+
+    elif method == "forward":
+        if metric_name == "Clock Monotonic Time, s":
+            benchmark_result["metric"] = (
+                "generate_time(ms)"
+                if "llama" in test_name
+                else "avg_inference_latency(ms)"
+            )
+            benchmark_result["actualValue"] = metric_value * 1000
+
+        elif metric_name == "Memory Peak Physical, kB":
+            # NB: Showing the value in MB is friendlier IMO
+            benchmark_result["metric"] = "peak_inference_mem_usage(mb)"
+            benchmark_result["actualValue"] = metric_value / 1024
+
+    elif method == "generate" and metric_name == "Tokens Per Second, t/s":
+        benchmark_result["metric"] = "token_per_sec"
         benchmark_result["actualValue"] = metric_value

     return benchmark_result
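
Both time metrics are stored in milliseconds and both peak-memory metrics in MB, so the mapping above is pure unit scaling. A two-line sanity check with invented measurements:

```python
clock_s = 0.42      # "Clock Monotonic Time, s" as reported by xcresult
peak_kb = 524288.0  # "Memory Peak Physical, kB"

print(clock_s * 1000)  # 420.0 -> model_load_time(ms), generate_time(ms), avg_inference_latency(ms)
print(peak_kb / 1024)  # 512.0 -> peak_load_mem_usage(mb), peak_inference_mem_usage(mb)
```
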
@@ -235,31 +255,33 @@ def extract_ios_benchmark_results(

     with request.urlopen(artifact_s3_url) as data:
         current_test_name = ""
+        current_metric_name = ""
         current_record = {}

         for line in data.read().decode("utf8").splitlines():
             s = IOS_TEST_SPEC_REGEX.search(line)
             if not s:
                 continue

-            test_class = s.group("test_class")
             test_name = s.group("test_name")
             metric_name = s.group("metric")
             metric_value = float(s.group("value"))

-            if test_name != current_test_name:
-                if current_record:
+            if test_name != current_test_name or metric_name != current_metric_name:
+                if current_record and current_record.get("metric", ""):
                     # Save the benchmark result in the same format used by Android
                     benchmark_results.append(current_record.copy())

                 current_test_name = test_name
+                current_metric_name = metric_name
                 current_record = initialize_ios_metadata(current_test_name)

             current_record = extract_ios_metric(
                 current_record, test_name, metric_name, metric_value
             )

-    benchmark_results.append(current_record.copy())
+    if current_record and current_record.get("metric", ""):
+        benchmark_results.append(current_record.copy())

     return benchmark_results
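
The parsing loop now starts a fresh record whenever either the test name or the metric name changes, and only flushes records whose metric was actually resolved. A self-contained toy of that grouping logic, with invented tuples standing in for parsed log lines:

```python
parsed = [
    ("test_load_m1", "Clock Monotonic Time, s", 0.42),
    ("test_load_m1", "Clock Monotonic Time, s", 0.44),      # repeat of the same measurement
    ("test_load_m1", "Memory Peak Physical, kB", 524288.0),
    ("test_generate_m1", "Tokens Per Second, t/s", 21.5),
]

results, current_key, current_record = [], None, None
for test_name, metric_name, value in parsed:
    if (test_name, metric_name) != current_key:
        if current_record is not None:  # the real loop also drops records with no metric
            results.append(current_record)
        current_key = (test_name, metric_name)
        current_record = {"test": test_name, "metric": metric_name}
    current_record["value"] = value  # repeats overwrite: the last measurement wins

if current_record is not None:
    results.append(current_record)

print([r["metric"] for r in results])
# ['Clock Monotonic Time, s', 'Memory Peak Physical, kB', 'Tokens Per Second, t/s']
```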

.github/workflows/android-perf.yml

Lines changed: 1 addition & 1 deletion
@@ -160,7 +160,7 @@ jobs:

          if [[ ${{ matrix.model }} =~ ^stories* ]]; then
            # Install requirements for export_llama
-           PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+           PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
            # Test llama2
            if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then
              DELEGATE_CONFIG="xnnpack+custom+qe"

.github/workflows/apple-perf.yml

Lines changed: 2 additions & 2 deletions
@@ -76,7 +76,7 @@ jobs:
       # on-demand and periodic benchmarking.
       CRON_DEFAULT_MODELS: "stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l"
       CRON_DEFAULT_DEVICES: "apple_iphone_15"
-      CRON_DEFAULT_DELEGATES: "xnnpack"
+      CRON_DEFAULT_DELEGATES: "xnnpack,coreml,mps"
     run: |
       set -ex
       MODELS="${{ inputs.models }}"
@@ -162,7 +162,7 @@ jobs:
         if [[ ${{ matrix.model }} =~ ^stories* ]]; then
           # Install requirements for export_llama
           PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
-            bash examples/models/llama2/install_requirements.sh
+            bash examples/models/llama/install_requirements.sh

           # Test llama2
           if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then

.github/workflows/pull.yml

Lines changed: 9 additions & 3 deletions
@@ -98,6 +98,12 @@ jobs:
          - dtype: bf16
            build-tool: buck2
            mode: portable
+         - dtype: bf16
+           build-tool: cmake
+           mode: custom
+         - dtype: bf16
+           build-tool: buck2
+           mode: custom
       fail-fast: false
     with:
       runner: linux.2xlarge
@@ -117,7 +123,7 @@ jobs:
       # Setup executorch
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
       # Install requirements for export_llama
-      PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+      PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
       # Test llama2
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"

@@ -216,7 +222,7 @@ jobs:
       bash install_requirements.sh --pybind xnnpack

       # install Llava requirements
-      bash examples/models/llama2/install_requirements.sh
+      bash examples/models/llama/install_requirements.sh
       bash examples/models/llava/install_requirements.sh

       # run python unittest
@@ -411,7 +417,7 @@ jobs:
       # Setup executorch
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
       # Install requirements for export_llama
-      PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+      PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
       # Test llama2
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"

.github/workflows/trunk.yml

Lines changed: 6 additions & 4 deletions
@@ -227,6 +227,8 @@ jobs:
        include:
          - dtype: bf16
            mode: portable
+         - dtype: bf16
+           mode: custom
       fail-fast: false
     with:
       runner: macos-m1-stable
@@ -255,7 +257,7 @@ jobs:
       fi

       # Install requirements for export_llama
-      PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh
+      PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
       # Test llama2
       PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M cmake "${DTYPE}" "${MODE}"

@@ -279,7 +281,7 @@ jobs:
       # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"

       # # install Llava requirements
-      # ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh
+      # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
       # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh

       # # run python unittest
@@ -385,7 +387,7 @@ jobs:
       cmake --build cmake-out -j9 --target install --config Release

       echo "Build llama runner"
-      dir="examples/models/llama2"
+      dir="examples/models/llama"
       cmake \
         -DCMAKE_INSTALL_PREFIX=cmake-out \
         -DCMAKE_BUILD_TYPE=Release \
@@ -437,5 +439,5 @@ jobs:

       python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME}

-      cmake-out/examples/models/llama2/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
+      cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
       echo "::endgroup::"

README.md

Lines changed: 2 additions & 2 deletions
@@ -22,10 +22,10 @@ please visit our documentation website [for the latest release](https://pytorch.

 Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin.

-Check out the examples of [Llama](./examples/models/llama2/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.
+Check out the examples of [Llama](./examples/models/llama/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.


-**[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama2/README.md) models via ExecuTorch.
+**[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama/README.md) models via ExecuTorch.

 ## Feedback

backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.cpp

Lines changed: 6 additions & 6 deletions
@@ -253,11 +253,11 @@ bool write_directory_node(InMemoryDirectoryNode* node,
         return false;
     }

-    for (const auto& [_, node]: node->get_items()) {
-        if (node.get()->isDirectory() && !recursive) {
+    for (const auto& [_, node_2]: node->get_items()) {
+        if (node_2.get()->isDirectory() && !recursive) {
             continue;
         }
-        if (!write_node(node.get(), dir_path, recursive, error)) {
+        if (!write_node(node_2.get(), dir_path, recursive, error)) {
             return false;
         }
     }
@@ -383,9 +383,9 @@ FlattenedInMemoryNode::unflatten(const std::vector<FlattenedInMemoryNode>& flatt
     case InMemoryFileSystem::InMemoryNode::Kind::Directory: {
         std::unordered_map<std::string, std::unique_ptr<InMemoryFileSystem::InMemoryNode>> items;
         items.reserve(flattened_node_metadata.child_name_to_indices_map.size());
-        for (const auto& [name, index]: flattened_node_metadata.child_name_to_indices_map) {
-            auto moveIt = std::make_move_iterator(nodes.begin() + index);
-            items[name] = *moveIt;
+        for (const auto& [name_2, index_2]: flattened_node_metadata.child_name_to_indices_map) {
+            auto moveIt = std::make_move_iterator(nodes.begin() + index_2);
+            items[name_2] = *moveIt;
         }
         auto directory_node =
             std::make_unique<InMemoryDirectoryNode>(std::move(name), std::move(attributes), std::move(items));
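
The `_2` suffixes look arbitrary in isolation, but the motivation is visible in the context lines: the old structured bindings shadowed the enclosing `node` parameter in the first hunk, and the `name` local in the second hunk, which is still consumed afterwards by `std::move(name)`. Renaming the loop bindings removes the shadowing.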

backends/qualcomm/_passes/annotate_quant_attrs.py

Lines changed: 24 additions & 4 deletions
@@ -27,9 +27,12 @@ class AnnotateQuantAttrs(ExportPass):
     generated after quantization process.
     """

-    def __init__(self, edge_program: torch.export.ExportedProgram):
+    def __init__(
+        self, edge_program: torch.export.ExportedProgram, skip_advanced_requant: bool
+    ):
         super(AnnotateQuantAttrs, self).__init__()
         self.edge_program = edge_program
+        self.skip_advanced_requant = skip_advanced_requant

     def _annotate_source_nodes(
         self, quant_node: torch.fx.Node, quant_attrs: Dict[str, Any]
@@ -68,9 +71,26 @@ def _annotate_requant(self, n):

         # TODO: Store multiple pairs of requantize attributes when we have an op builder
         # that has multiple outputs that require quant attributes.
-        if q_attrs["dtype"] != dq_attrs["dtype"]:
-            dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING]
-            n.args[0].meta[QCOM_REQUANTIZE] = dq_attrs
+        if self.skip_advanced_requant:
+            if q_attrs["dtype"] != dq_attrs["dtype"]:
+                dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING]
+                n.args[0].meta[QCOM_REQUANTIZE] = dq_attrs
+        else:
+            # When dtype is the same but other specs such as scale and offset are different,
+            # insert requant to improve accuracy.
+            # Users can turn this feature off if any inference speed drop is observed.
+            if any(
+                q_attrs[attr] != dq_attrs[attr]
+                for attr in [
+                    "scale",
+                    "zero_point",
+                    "quant_min",
+                    "quant_max",
+                    "dtype",
+                ]
+            ):
+                dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING]
+                n.args[0].meta[QCOM_REQUANTIZE] = dq_attrs

         # Dequant all the fold_quant parameters back to fp32.
         # If an operation is not supported by QNN and got fallback, it will expect a fp32 param.
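
To make the new behavior concrete, here is a standalone sketch of the comparison with invented attribute values: with advanced requant enabled, a scale or zero-point mismatch alone is enough to mark the node for requantization, whereas the old check only fired on a dtype change.

```python
q_attrs = {"scale": 0.02, "zero_point": 128, "quant_min": 0, "quant_max": 255, "dtype": "uint8"}
dq_attrs = {"scale": 0.05, "zero_point": 120, "quant_min": 0, "quant_max": 255, "dtype": "uint8"}

# Old check: dtype only -> no requant here
print(q_attrs["dtype"] != dq_attrs["dtype"])  # False

# New check: any mismatch across the full quant spec -> requant
print(any(
    q_attrs[attr] != dq_attrs[attr]
    for attr in ["scale", "zero_point", "quant_min", "quant_max", "dtype"]
))  # True, because scale and zero_point differ
```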
