
Commit f201aec

Merge branch 'pytorch:main' into Arm-backend-Test-TOSA,-Ethos-U55-and-Ethos-U85-on-github
2 parents e7d615f + fbee0c8

91 files changed (+2318, -906 lines)

.ci/scripts/test_llama.sh

Lines changed: 8 additions & 0 deletions
@@ -27,6 +27,10 @@ while [[ $# -gt 0 ]]; do
       MODE="$2" # portable or xnnpack+custom or xnnpack+custom+qe
       shift 2
       ;;
+    -pt2e_quantize)
+      PT2E_QUANTIZE="$2"
+      shift 2
+      ;;
     -upload)
       UPLOAD_DIR="$2"
       shift 2
@@ -234,6 +238,10 @@ if [[ "${COREML}" == "ON" ]]; then
 fi
 if [[ "${QNN}" == "ON" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
+  echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
+  if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
+    EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
+  fi
 fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}

.github/workflows/trunk.yml

Lines changed: 36 additions & 0 deletions
@@ -442,3 +442,39 @@ jobs:
 
     cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
     echo "::endgroup::"
+
+
+  test-llama-runner-qnn-linux:
+    name: test-llama-runner-qnn-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      matrix:
+        dtype: [fp32]
+        pt2e_quantize: [qnn_16a16w, qnn_8a8w]
+        mode: [qnn]
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        BUILD_TOOL="cmake"
+        DTYPE=${{ matrix.dtype }}
+        MODE=${{ matrix.mode }}
+        PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+
+        # Setup executorch
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
+        # Install requirements for export_llama
+        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
+        # Test llama2
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"

CONTRIBUTING.md

Lines changed: 9 additions & 4 deletions
@@ -283,10 +283,15 @@ for basics.
    - If the reviewers have requests or questions, follow up with them.
    - The goal of the reviewer is to ensure that the code in the `main` branch of
      the repo is consistent, maintainable, and of high quality.
-1. Once approved, your reviewer will import the PR into Meta's internal system
-   and merge it from there.
-   - If the PR is approved and not merged within a few business days, please
-     comment on the PR to ask about its status.
+1. Once the PR has been approved,
+   - If you have the "write permission" in this repo, you can merge it yourself
+     by clicking the "Squash and merge" button once it is green and all CI
+     signals are passing.
+   - If you don't have "write permission" in this repo, the reviewer will take
+     care of the PR. The reviewer may import the PR into Meta's internal system
+     to validate it against internal CI.
+   - If the PR is approved but not merged within 5 business days, please comment
+     on the PR to ask about its status.
 - Note that if the `main` [CI](#continuous-integration) jobs are broken, we
   will only merge PRs that fix the broken jobs until all critical jobs are
   fixed.

backends/apple/coreml/runtime/delegate/ETCoreMLModelCompiler.mm

Lines changed: 29 additions & 16 deletions
@@ -26,25 +26,38 @@ + (nullable NSURL *)compileModelAtURL:(NSURL *)modelURL
 #else
     __block NSError *localError = nil;
     __block NSURL *result = nil;
-
-    dispatch_semaphore_t sema = dispatch_semaphore_create(0);
-    [MLModel compileModelAtURL:modelURL completionHandler:^(NSURL * _Nullable tempURL, NSError * _Nullable compilationError) {
-        result = [tempURL copy];
-        localError = compilationError;
-        dispatch_semaphore_signal(sema);
-    }];
-
-    long status = dispatch_semaphore_wait(sema, dispatch_time(DISPATCH_TIME_NOW, (int64_t)(maxWaitTimeInSeconds * NSEC_PER_SEC)));
-    if (status != 0) {
+
+    if (@available(iOS 16, macOS 13, watchOS 9, tvOS 16, *)) {
+        dispatch_semaphore_t sema = dispatch_semaphore_create(0);
+        [MLModel compileModelAtURL:modelURL completionHandler:^(NSURL * _Nullable tempURL, NSError * _Nullable compilationError) {
+            result = [tempURL copy];
+            localError = compilationError;
+            dispatch_semaphore_signal(sema);
+        }];
+
+        long status = dispatch_semaphore_wait(sema, dispatch_time(DISPATCH_TIME_NOW, (int64_t)(maxWaitTimeInSeconds * NSEC_PER_SEC)));
+        if (status != 0) {
+            ETCoreMLLogErrorAndSetNSError(error,
+                                          ETCoreMLErrorCompilationFailed,
+                                          "%@: Failed to compile model in %f seconds.",
+                                          NSStringFromClass(ETCoreMLModelCompiler.class),
+                                          maxWaitTimeInSeconds);
+            return nil;
+        }
+    } else {
+        result = [MLModel compileModelAtURL:modelURL error:&localError];
+    }
+
+    if (localError) {
         ETCoreMLLogErrorAndSetNSError(error,
-                                      ETCoreMLErrorCompilationFailed,
-                                      "%@: Failed to compile model in %f seconds.",
-                                      NSStringFromClass(ETCoreMLModelCompiler.class),
-                                      maxWaitTimeInSeconds);
+                                      ETCoreMLErrorCompilationFailed,
+                                      "%@: Failed to compile model, error: %@",
+                                      NSStringFromClass(ETCoreMLModelCompiler.class),
+                                      localError);
         return nil;
+    } else {
+        return result;
     }
-
-    return result;
 #endif
 }
 
backends/apple/coreml/scripts/install_requirements.sh

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party"
 mkdir "$COREML_DIR_PATH/third-party"
 
 echo "${green}ExecuTorch: Cloning coremltools."
-git clone --depth 1 --branch 8.0 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
+git clone --depth 1 --branch 8.1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
 cd $COREMLTOOLS_DIR_PATH
 
 STATUS=$?

backends/apple/coreml/test/test_coreml_partitioner.py

Lines changed: 5 additions & 13 deletions
@@ -71,23 +71,15 @@ def test_vit_skip_conv(self):
             )
         )
 
-        conv_block = ["aten.convolution.default", "executorch_call_delegate"]
-        safe_softmax_block = [
-            "getitem",
-            "getitem",
-            "getitem",
-            "getitem",
-            "aten.any.dim",
-            "executorch_call_delegate",
-        ]
-        final_block = ["getitem"]
-        total = conv_block + 12 * safe_softmax_block + final_block
-
         assert [
             node.target.__name__
             for node in delegated_program_manager.exported_program().graph.nodes
             if node.op == "call_function"
-        ] == total
+        ] == [
+            "aten.convolution.default",
+            "executorch_call_delegate",
+            "getitem",
+        ]
 
     def test_buffer(self):
         embedding_dim = 3

backends/arm/arm_backend.py

Lines changed: 30 additions & 3 deletions
@@ -52,6 +52,7 @@ def __init__(self):
         self.permute_nhwc = False
         self.quantize_io = False
         self.tosa_version = None
+        self.input_order = None
 
     def ethosu_compile_spec(
         self,
@@ -89,7 +90,7 @@ def ethosu_compile_spec(
             self.compiler_flags.append(extra_flags)
 
         base_tosa_version = "TOSA-0.80.0+BI"
-        if "U55" in config:
+        if "u55" in config:
             # Add the Ethos-U55 extension marker
             base_tosa_version += "+u55"
         self.tosa_version = TosaSpecification.create_from_string(base_tosa_version)
@@ -134,6 +135,14 @@ def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder":
         self.quantize_io = quantize_io
         return self
 
+    def set_input_order(self, input_order: str = None) -> "ArmCompileSpecBuilder":
+        """
+        Reorder the inputs coming in. This may be required when inputs > 1.
+        And while using the U55/U85 CompileSpec.
+        """
+        self.input_order = input_order
+        return self
+
     def build(self) -> List[CompileSpec]:
         """
         Generate a list of compile spec objects from the builder
@@ -163,6 +172,13 @@ def build(self) -> List[CompileSpec]:
                 CompileSpec("permute_memory_format", "nhwc".encode())
             )
 
+        if self.input_order:
+            self.compile_spec.append(
+                CompileSpec(
+                    "input_order", " ".join(map(str, self.input_order)).encode()
+                )
+            )
+
         if self.quantize_io:
             self.compile_spec.append(CompileSpec("quantize_io", "True".encode()))
 
@@ -214,13 +230,16 @@ def preprocess(  # noqa: C901
     artifact_path = None
     output_format = ""
     compile_flags = []
+    input_order = []
     for spec in compile_spec:
         if spec.key == "debug_artifact_path":
             artifact_path = spec.value.decode()
         if spec.key == "output_format":
             output_format = spec.value.decode()
         if spec.key == "compile_flags":
             compile_flags.append(spec.value.decode())
+        if spec.key == "input_order":
+            input_order = list(map(int, spec.value.decode().split(",")))
 
     # Check that the output format is set in the compile spec
     if not output_format:
@@ -246,19 +265,27 @@ def preprocess(  # noqa: C901
         )
 
     node_visitors = get_node_visitors(edge_program, tosa_spec)
-
+    input_count = 0
     for node in graph_module.graph.nodes:
         if node.op == "call_function":
             process_call_function(node, tosa_graph, node_visitors, tosa_spec)
         elif node.op == "placeholder":
             process_placeholder(node, tosa_graph, edge_program, tosa_spec)
+            if node.name in edge_program.graph_signature.user_inputs:
+                input_count += 1
         elif node.op == "output":
             process_output(node, tosa_graph)
         else:
            # This will only happen if an unpartitioned graph is passed without
            # any checking of compatibility.
            dbg_fail(node, tosa_graph, artifact_path)
 
+    if len(input_order) > 0:
+        if input_count != len(input_order):
+            raise RuntimeError(
+                "The rank of the input order is not equal to amount of input tensors"
+            )
+
     # TODO: It would be awesome if this dump could somehow be done on top level and not here.
     # Problem is that the desc.json has to be created on the tosa_graph object, which we can't
     # access from top level.
@@ -275,7 +302,7 @@ def preprocess(  # noqa: C901
     # preprocess and some consume TOSA fb directly.
     if output_format == "vela":
         # Emit vela_bin_stream format
-        binary = vela_compile(tosa_graph, compile_flags)
+        binary = vela_compile(tosa_graph, compile_flags, input_order)
     elif output_format == "tosa":
         # Emit TOSA flatbuffer
         binary = bytes(tosa_graph.serialize())
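
A minimal sketch of how the new input_order plumbing fits the builder flow; the Ethos-U config string and values below are illustrative, not taken from this commit. Note that build() space-joins whatever set_input_order() stored while preprocess() splits on commas, so a comma-separated string such as "1,0" round-trips to the list [1, 0]:

    from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder

    # Hypothetical usage: ask the backend to pack the second user input
    # before the first when emitting the Vela binary.
    compile_spec = (
        ArmCompileSpecBuilder()
        .ethosu_compile_spec("ethos-u55-128")  # config string is illustrative
        .set_input_order("1,0")
        .build()
    )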

backends/arm/arm_vela.py

Lines changed: 9 additions & 6 deletions
@@ -17,10 +17,13 @@
 
 # Pack either input or output tensor block, compose the related arrays into
 # per-io structs to simplify runtime use.
-def vela_bin_pack_io(prefix, data):
-    ios = struct.pack("<i", len(data[prefix + "_shape"]))
-    for i in range(len(data[prefix + "_shape"])):
-        io_shape = data[prefix + "_shape"][i]
+def vela_bin_pack_io(prefix, data, shape_order=None):
+    vela_input_shapes = data[prefix + "_shape"]
+
+    order = shape_order if shape_order else range(len(vela_input_shapes))
+    ios = struct.pack("<i", len(vela_input_shapes))
+    for i in order:
+        io_shape = vela_input_shapes[i]
         io_elem_size = data[prefix + "_elem_size"][i]
         io_offset = data[prefix + "_offset"][i]
         io_region = data[prefix + "_region"][i]
@@ -36,7 +39,7 @@ def vela_bin_pack_io(prefix, data):
 # Output via Vela to binary stream for ArmBackendEthosU
 # WARNING: Do not change this without changing VelaBinStream.cpp as that
 # function consumes this format and the two need to align.
-def vela_compile(tosa_graph, args: List[str]):
+def vela_compile(tosa_graph, args: List[str], shape_order=None):
     with tempfile.TemporaryDirectory() as tmpdir:
         tosaname = "out.tosa"
         flatbuffer = tosa_graph.serialize()
@@ -78,7 +81,7 @@ def vela_compile(tosa_graph, args: List[str]):
             bin_blocks["scratch_data"] = b"\x00" * block_length
 
         # Capture inputs and outputs
-        bin_blocks["inputs"] = vela_bin_pack_io("input", data)
+        bin_blocks["inputs"] = vela_bin_pack_io("input", data, shape_order)
         bin_blocks["outputs"] = vela_bin_pack_io("output", data)
 
         bin_blocks["vela_end_stream"] = b""
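
The effect of shape_order is easiest to see in isolation. Below is a toy sketch (made-up data dict, and only the iteration logic of vela_bin_pack_io; the real function also packs elem_size, offset, region, and the shape itself into the binary stream):

    def pack_order(prefix, data, shape_order=None):
        shapes = data[prefix + "_shape"]
        order = shape_order if shape_order else range(len(shapes))
        # Emit the per-io entries in the requested order.
        return [shapes[i] for i in order]

    data = {"input_shape": [[1, 3, 64, 64], [1, 10]]}
    print(pack_order("input", data))          # [[1, 3, 64, 64], [1, 10]]
    print(pack_order("input", data, [1, 0]))  # [[1, 10], [1, 3, 64, 64]]

Note that the io count is still packed as len(vela_input_shapes), so reordering never changes how many structs the runtime expects.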

backends/arm/operator_support/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -5,4 +5,9 @@
 
 # pyre-unsafe
 
-from . import mean_dim_support, tosa_supported_operators, var_correction_support  # noqa
+from . import (  # noqa
+    mean_dim_support,
+    right_shift_support,
+    tosa_supported_operators,
+    var_correction_support,
+)
backends/arm/operator_support/right_shift_support.py

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import logging
+
+import torch.fx as fx
+from executorch.backends.arm.operator_support.tosa_supported_operators import (
+    register_tosa_support_check,
+    SupportedTOSAOperatorCheck,
+)
+from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
+from executorch.exir.dialects._ops import ops as exir_ops
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+
+
+@register_tosa_support_check
+class RightShiftSupported(SupportedTOSAOperatorCheck):
+    targets = [exir_ops.edge.aten.__rshift__.Scalar]
+
+    tosa_specs = [
+        TosaSpecification.create_from_string("TOSA-0.80.0+BI"),
+        TosaSpecification.create_from_string("TOSA-0.80.0+MI"),
+    ]
+
+    def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
+
+        # TODO MLETORCH-525 Remove warning
+        if isinstance(tosa_spec, Tosa_0_80) and tosa_spec.is_U55_subset:
+            logging.warning(f"{node.target} may introduce one-off errors.")
+        return True
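
For context, aten.__rshift__.Scalar is the overload produced when a tensor is right-shifted by a Python int, so a hypothetical module like the one below is what the new check marks as delegable (on the U55 subset it additionally logs the precision warning above):

    import torch

    class RShift(torch.nn.Module):
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # x >> 4 lowers to aten.__rshift__.Scalar during export.
            return x >> 4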

backends/arm/operators/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@
     op_reciprocal,
     op_relu,
     op_repeat,
+    op_rshift,
     op_rsqrt,
     op_select,
     op_sigmoid,
