
Commit f7ec0af

Update base for Update on "Transform model to be able to use Attention Sink"
This PR adds the necessary functions for transforming the model so it can use Attention Sink.

Differential Revision: [D65571289](https://our.internmc.facebook.com/intern/diff/D65571289/)

[ghstack-poisoned]
2 parents b0d0a77 + b8fbc48 commit f7ec0af
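
Attention Sink (Xiao et al., "Efficient Streaming Language Models with Attention Sinks") keeps the first few "sink" tokens in the KV cache alongside a sliding window of the most recent tokens, so generation can run past the trained context length without the cache growing without bound. As a point of reference only, here is a minimal sketch of that eviction policy; it is not the API this PR adds, and the cache layout, `sink_size`, and `window_size` are assumptions:

```python
import torch

def evict_kv_cache(
    k_cache: torch.Tensor,  # assumed shape [batch, n_heads, seq_len, head_dim]
    v_cache: torch.Tensor,
    sink_size: int = 4,
    window_size: int = 1020,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Keep the attention-sink tokens plus a sliding window of recent tokens."""
    seq_len = k_cache.size(2)
    if seq_len <= sink_size + window_size:
        return k_cache, v_cache  # nothing to evict yet
    # Drop the middle of the cache: keep [0, sink_size) and the last window_size.
    keep = torch.cat(
        [torch.arange(sink_size), torch.arange(seq_len - window_size, seq_len)]
    )
    return k_cache[:, :, keep, :], v_cache[:, :, keep, :]
```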

67 files changed: +3840 additions, -494 deletions


.ci/scripts/gather_test_models.py

Lines changed: 11 additions & 9 deletions
@@ -20,16 +20,16 @@
 CUSTOM_RUNNERS = {
     "linux": {
         # This one runs OOM on smaller runner, the root cause is unclear (T163016365)
-        "w2l": "linux.12xlarge",
-        "ic4": "linux.12xlarge",
-        "resnet50": "linux.12xlarge",
-        "llava": "linux.12xlarge",
-        "llama3_2_vision_encoder": "linux.12xlarge",
-        # "llama3_2_text_decoder": "linux.12xlarge",  # TODO: re-enable test when Huy's change is in / model gets smaller.
+        "w2l": "linux.4xlarge.memory",
+        "ic4": "linux.4xlarge.memory",
+        "resnet50": "linux.4xlarge.memory",
+        "llava": "linux.4xlarge.memory",
+        "llama3_2_vision_encoder": "linux.4xlarge.memory",
+        "llama3_2_text_decoder": "linux.4xlarge.memory",
         # This one causes timeout on smaller runner, the root cause is unclear (T161064121)
-        "dl3": "linux.12xlarge",
-        "emformer_join": "linux.12xlarge",
-        "emformer_predict": "linux.12xlarge",
+        "dl3": "linux.4xlarge.memory",
+        "emformer_join": "linux.4xlarge.memory",
+        "emformer_predict": "linux.4xlarge.memory",
     }
 }
@@ -39,10 +39,12 @@
     "linux": {
         "mobilebert": 90,
         "emformer_predict": 360,
+        "llama3_2_text_decoder": 360,
     },
     "macos": {
         "mobilebert": 90,
         "emformer_predict": 360,
+        "llama3_2_text_decoder": 360,
     },
 }

.ci/scripts/setup-macos.sh

Lines changed: 3 additions & 0 deletions
@@ -49,6 +49,9 @@ install_buck() {
 
   rm "${BUCK2}"
   popd
+
+  # Kill all running buck2 daemon for a fresh start
+  buck2 killall || true
 }
 
 function write_sccache_stub() {

.ci/scripts/test_llama.sh

Lines changed: 11 additions & 0 deletions
@@ -27,6 +27,10 @@ while [[ $# -gt 0 ]]; do
       MODE="$2" # portable or xnnpack+custom or xnnpack+custom+qe
       shift 2
       ;;
+    -pt2e_quantize)
+      PT2E_QUANTIZE="$2"
+      shift 2
+      ;;
     -upload)
       UPLOAD_DIR="$2"
       shift 2
@@ -44,6 +48,9 @@ MODE=${MODE:-"xnnpack+custom"}
 # Default UPLOAD_DIR to empty string if not set
 UPLOAD_DIR="${UPLOAD_DIR:-}"
 
+# Default PT2E_QUANTIZE to empty string if not set
+PT2E_QUANTIZE="${PT2E_QUANTIZE:-}"
+
 if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
   echo "Expecting atleast 4 positional arguments"
   echo "Usage: [...]"
@@ -234,6 +241,10 @@ if [[ "${COREML}" == "ON" ]]; then
 fi
 if [[ "${QNN}" == "ON" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
+  echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
+  if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
+    EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
+  fi
 fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
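
For orientation, the new `-pt2e_quantize` flag feeds `export_llama --pt2e_quantize ...`, which runs PyTorch 2 Export (PT2E) quantization before delegation. A hedged sketch of that flow is below; the capture API varies across torch releases, torch's `XNNPACKQuantizer` stands in for the backend-specific QNN quantizer, and a plain loop stands in for the script's wikitext calibration:

```python
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)

def pt2e_quantize(model: torch.nn.Module, example_inputs, calibration_batches):
    # Capture the model as an FX graph (the capture API name varies by torch version).
    graph = torch.export.export_for_training(model, example_inputs).module()
    quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
    prepared = prepare_pt2e(graph, quantizer)  # insert observers
    for batch in calibration_batches:  # calibrate observer ranges
        prepared(*batch)
    return convert_pt2e(prepared)  # fold observers into quantized ops
```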

.github/workflows/apple.yml

Lines changed: 6 additions & 0 deletions
@@ -42,6 +42,8 @@ jobs:
 
   build-demo-ios:
     name: build-demo-ios
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:
@@ -190,6 +192,8 @@ jobs:
       ) done
 
   upload-frameworks-ios:
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     runs-on: ubuntu-22.04
     needs: [build-frameworks-ios, set-version]
     timeout-minutes: 30
@@ -278,6 +282,8 @@ jobs:
 
   build-benchmark-app:
     name: build-benchmark-app
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:

.github/workflows/build-wheels-linux.yml

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ jobs:
       test-infra-ref: main
       with-cuda: disabled
       with-rocm: disabled
+      python-versions: '["3.10", "3.11", "3.12"]'
 
   build:
     needs: generate-matrix

.github/workflows/build-wheels-m1.yml

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ jobs:
       test-infra-ref: main
       with-cuda: disabled
       with-rocm: disabled
+      python-versions: '["3.10", "3.11", "3.12"]'
 
   build:
     needs: generate-matrix

.github/workflows/pull.yml

Lines changed: 4 additions & 2 deletions
@@ -332,7 +332,7 @@ jobs:
       docker-image: executorch-ubuntu-22.04-clang12
 
   unittest-arm:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -368,6 +368,7 @@
     strategy:
       matrix:
         dtype: [fp32]
+        pt2e_quantize: [qnn_16a16w, qnn_8a8w]
         mode: [qnn]
       fail-fast: false
     with:
@@ -384,6 +385,7 @@
         DTYPE=${{ matrix.dtype }}
         BUILD_TOOL="cmake"
         MODE=${{ matrix.mode }}
+        PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
 
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
         PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
@@ -393,7 +395,7 @@
         # Install requirements for export_llama
         PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
         # Test llama2
-        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
 
   test-phi-3-mini-runner-linux:
     name: test-phi-3-mini-runner-linux

.github/workflows/trunk.yml

Lines changed: 40 additions & 2 deletions
@@ -131,7 +131,7 @@ jobs:
 
   test-arm-backend-delegation:
     name: test-arm-backend-delegation
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -157,7 +157,7 @@
 
   test-arm-reference-delegation:
     name: test-arm-reference-delegation
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -351,6 +351,8 @@
         done
 
   test-huggingface-transformers:
+    # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     name: test-huggingface-transformers
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     secrets: inherit
@@ -441,3 +443,39 @@
 
       cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
       echo "::endgroup::"
+
+
+  test-llama-runner-qnn-linux:
+    name: test-llama-runner-qnn-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      matrix:
+        dtype: [fp32]
+        pt2e_quantize: [qnn_16a16w, qnn_8a8w]
+        mode: [qnn]
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        BUILD_TOOL="cmake"
+        DTYPE=${{ matrix.dtype }}
+        MODE=${{ matrix.mode }}
+        PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+
+        # Setup executorch
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
+        # Install requirements for export_llama
+        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
+        # Test llama2
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -64,6 +64,9 @@
 [submodule "third-party/pybind11"]
 	path = third-party/pybind11
 	url = https://github.com/pybind/pybind11.git
+[submodule "backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3"]
+	path = backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3
+	url = https://github.com/foss-xtensa/nnlib-FusionG3/
 [submodule "third-party/ao"]
 	path = third-party/ao
 	url = https://github.com/pytorch/ao.git

backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm

Lines changed: 0 additions & 1 deletion
@@ -151,7 +151,6 @@ - (void)testMV3ProgramDebugging {
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
-    XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
     XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
 }

backends/apple/coreml/runtime/test/ETCoreMLModelProfilerTests.mm

Lines changed: 0 additions & 1 deletion
@@ -146,7 +146,6 @@ - (void)testMV3ProgramProfiling {
     XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
     XCTAssertNotNil(profilingResult[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
     XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
-    XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
     XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
     XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
 };

backends/arm/_passes/cast_int64_pass.py

Lines changed: 32 additions & 11 deletions
@@ -5,8 +5,15 @@
 
 # pyre-unsafe
 
+import logging
+
 import torch
+from executorch.backends.arm._passes.arm_pass_utils import is_param_node
 from executorch.exir.pass_base import ExportPass, PassResult
+from torch._export.utils import is_buffer
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
 
 
 class CastInt64ToInt32Pass(ExportPass):
@@ -18,17 +25,31 @@ def _to_int32(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
             fake_tensor = node.meta["val"]
             if isinstance(fake_tensor, torch._subclasses.fake_tensor.FakeTensor):
-                if node.meta["val"].dtype == torch.int64:
-                    node.meta["val"] = node.meta["val"].to(torch.int32)
-                    buffer_name = (
-                        self.exported_program.graph_signature.inputs_to_buffers[
-                            node.name
-                        ]
-                    )
-                    new_tensor = self.exported_program.state_dict[buffer_name].to(
-                        torch.int32
-                    )
-                    self.exported_program.state_dict[buffer_name] = new_tensor
+                if node.meta["val"].dtype == torch.int64 and is_param_node(
+                    self.exported_program, node
+                ):
+                    if is_buffer(self.exported_program, node):
+                        node.meta["val"] = node.meta["val"].to(torch.int32)
+                        buffer_name = (
+                            self.exported_program.graph_signature.inputs_to_buffers[
+                                node.name
+                            ]
+                        )
+                        buffer = self.exported_program.state_dict[node.name]
+                        logger.warning(
+                            f"Casting buffer {node.name} from torch.int64 to torch.int32"
+                            f" defined in {node.meta['stack_trace']}"
+                        )
+                        if torch.min(buffer) < torch.iinfo(torch.int32).min:
+                            raise RuntimeError(
+                                f"Buffer {node.name} has value < {torch.iinfo(torch.int32).min}"
+                            )
+                        if torch.max(buffer) > torch.iinfo(torch.int32).max:
+                            raise RuntimeError(
+                                f"Buffer {node.name} has value > {torch.iinfo(torch.int32).max}"
+                            )
+                        buffer_int32 = buffer.to(torch.int32)
+                        self.exported_program.state_dict[buffer_name] = buffer_int32
 
     def call(self, graph_module: torch.fx.GraphModule):
         self._to_int32(graph_module)
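
The substance of this change is the guard: the pass now restricts itself to parameter/buffer nodes (`is_param_node`, `is_buffer`) and raises rather than silently corrupting a buffer whose values do not fit in int32. A standalone sketch of the same check, assuming a plain tensor rather than an `ExportedProgram` buffer:

```python
import torch

def checked_int64_to_int32(t: torch.Tensor) -> torch.Tensor:
    # Refuse the downcast when any value would not survive the narrowing,
    # mirroring the RuntimeError guards added in the pass above.
    info = torch.iinfo(torch.int32)
    if torch.min(t) < info.min or torch.max(t) > info.max:
        raise RuntimeError("tensor has values outside the int32 range")
    return t.to(torch.int32)
```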

backends/arm/_passes/scalars_to_attribute_pass.py

Lines changed: 5 additions & 0 deletions
@@ -51,6 +51,11 @@ def call(self, graph_module: GraphModule) -> PassResult:
             if isinstance(arg, Node):
                 new_args.append(arg)
                 continue
+            if isinstance(arg, int) and not torch.is_floating_point(
+                get_first_fake_tensor(n)
+            ):
+                new_args.append(arg)
+                continue
 
             prefix = "_tensor_constant_"
             get_new_attr_name = get_new_attr_name_with_prefix(prefix)
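
The early exit added here leaves integer scalars as plain arguments when the consuming op's tensor input is itself integral; only scalars mixed with floating-point tensors continue on to be lifted into `_tensor_constant_*` attributes. A minimal sketch of that predicate, with hypothetical names:

```python
import torch

def scalar_can_stay(arg, fake_tensor: torch.Tensor) -> bool:
    # Integer scalar combined with an integer tensor: safe to keep as a plain arg.
    return isinstance(arg, int) and not torch.is_floating_point(fake_tensor)
```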
