Skip to content

Commit 92a9de5

Browse files
committed
Update base for Update on "[Executorch][llm] Enable leveraging ring kv cache via module swap"
This allows us to make some of the attention modules to use sliding window kv cache. Will help enable models like gemma3. Differential Revision: [D73891426](https://our.internmc.facebook.com/intern/diff/D73891426/) [ghstack-poisoned]
2 parents fd426c9 + cd3b53d commit 92a9de5

File tree

187 files changed

+6632
-1319
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

187 files changed

+6632
-1319
lines changed

.ci/scripts/build-qnn-sdk.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ set_up_aot() {
3333
cmake .. \
3434
-DCMAKE_INSTALL_PREFIX=$PWD \
3535
-DEXECUTORCH_BUILD_QNN=ON \
36+
-DANDROID_NATIVE_API_LEVEL=30 \
3637
-DQNN_SDK_ROOT=${QNN_SDK_ROOT} \
3738
-DEXECUTORCH_BUILD_DEVTOOLS=ON \
3839
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \

.ci/scripts/test_model.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ test_model_with_coreml() {
222222

223223
DTYPE=float16
224224

225-
"${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" --compute_precision "${DTYPE}"
225+
"${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" --compute_precision "${DTYPE}" --use_partitioner
226226
EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit)
227227

228228
if [ -n "$EXPORTED_MODEL" ]; then

.github/workflows/_link_check.yml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
on:
2+
workflow_call:
3+
inputs:
4+
ref:
5+
type: string
6+
required: true
7+
8+
jobs:
9+
lint-urls:
10+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
11+
with:
12+
runner: linux.2xlarge
13+
docker-image: executorch-ubuntu-22.04-linter
14+
submodules: 'none'
15+
fetch-depth: 0
16+
ref: ${{ inputs.ref }}
17+
timeout: 90
18+
script: |
19+
./scripts/lint_urls.sh $(
20+
[ "${{ github.event_name }}" = "pull_request" ] \
21+
&& git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
22+
|| [ "${{ github.event_name }}" = "push" ] \
23+
&& git diff --name-only ${{ github.event.before }} ${{ github.sha }}
24+
)
25+
26+
lint-xrefs:
27+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
28+
with:
29+
runner: linux.2xlarge
30+
docker-image: executorch-ubuntu-22.04-linter
31+
submodules: 'none'
32+
fetch-depth: 0
33+
ref: ${{ inputs.ref }}
34+
timeout: 90
35+
script: |
36+
./scripts/lint_xrefs.sh $(
37+
[ "${{ github.event_name }}" = "pull_request" ] \
38+
&& git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }} \
39+
|| [ "${{ github.event_name }}" = "push" ] \
40+
&& git diff --name-only ${{ github.event.before }} ${{ github.sha }}
41+
)

.github/workflows/lint.yml

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -64,29 +64,10 @@ jobs:
6464
6565
exit $RC
6666
67-
lint-urls:
68-
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
67+
link-check:
68+
uses: ./.github/workflows/_link_check.yml
6969
with:
70-
runner: linux.2xlarge
71-
docker-image: executorch-ubuntu-22.04-linter
72-
submodules: 'none'
73-
fetch-depth: 0
7470
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
75-
timeout: 90
76-
script: |
77-
./scripts/lint_urls.sh
78-
79-
lint-xrefs:
80-
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
81-
with:
82-
runner: linux.2xlarge
83-
docker-image: executorch-ubuntu-22.04-linter
84-
submodules: 'none'
85-
fetch-depth: 0
86-
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
87-
timeout: 90
88-
script: |
89-
./scripts/lint_xrefs.sh
9071

9172
android-java-format:
9273
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

.github/workflows/nightly.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,9 @@ jobs:
3030
test-infra-ref: main
3131
updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }}
3232
pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }}
33+
34+
link-check:
35+
needs: update-pytorch-commit-hash
36+
uses: ./.github/workflows/_link_check.yml
37+
with:
38+
ref: ${{ github.sha }}

Package.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,9 @@ let package = Package(
7777
name: "\(key)_dependencies",
7878
dependencies: [.target(name: key)],
7979
path: ".Package.swift/\(key)",
80-
linkerSettings:
80+
linkerSettings: [
81+
.linkedLibrary("c++")
82+
] +
8183
(value["frameworks"] as? [String] ?? []).map { .linkedFramework($0) } +
8284
(value["libraries"] as? [String] ?? []).map { .linkedLibrary($0) }
8385
),

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ To get started you can:
5151

5252
- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/stable/getting-started.html) to get things running locally and deploy a model to a device
5353
- Use this [Colab Notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) to start playing around right away
54-
- Jump straight into LLM use cases by following specific instructions for [Llama](examples/models/llama/README.md) and [Llava](examples/models/llava/README.md)
54+
- Jump straight into LLM use cases by following specific instructions for popular open-source models such as [Llama](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), and [Llava](examples/models/llava/README.md)
5555

5656
## Feedback and Engagement
5757

backends/apple/coreml/runtime/delegate/coreml_backend_delegate.mm

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,17 @@
8888
ET_LOG(Error, "%s: DataType=%d is not supported", ETCoreMLStrings.delegateIdentifier.UTF8String, (int)tensor.scalar_type());
8989
return std::nullopt;
9090
}
91-
91+
9292
std::vector<ssize_t> strides(tensor.strides().begin(), tensor.strides().end());
9393
std::vector<size_t> shape(tensor.sizes().begin(), tensor.sizes().end());
94+
95+
// If tensor is rank 0, wrap in rank 1
96+
// See https://github.com/apple/coremltools/blob/8.2/coremltools/converters/mil/frontend/torch/exir_utils.py#L73
97+
if (shape.size() == 0) {
98+
shape.push_back(1);
99+
strides.push_back(1);
100+
}
101+
94102
MultiArray::MemoryLayout layout(dataType.value(), std::move(shape), std::move(strides));
95103
switch (argType) {
96104
case ArgType::Input: {
@@ -233,6 +241,12 @@ ModelLoggingOptions get_logging_options(BackendExecutionContext& context) {
233241
std::array<SizesType, kTensorDimensionLimit> new_shape;
234242
for (size_t i = nInputs; i < nInputs + nOutputs; i++) {
235243
Tensor& t = args[i]->toTensor();
244+
// If t has rank 0, do not resize. delegate_args[i] will have rank 1
245+
// because we resized it in get_multi_array
246+
if (t.dim() == 0) {
247+
continue;
248+
}
249+
236250
int rank = delegate_args[i].layout().rank();
237251
assert (rank <= new_shape.size());
238252
for (int d = 0; d < rank; d++) {

backends/apple/coreml/runtime/test/ETCoreMLModelManagerTests.mm

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
#import <XCTest/XCTest.h>
1616
#import <executorch/runtime/platform/runtime.h>
1717
#import <model_logging_options.h>
18+
#import <multiarray.h>
19+
20+
using namespace executorchcoreml;
1821

1922
@interface ETCoreMLModelManagerTests : XCTestCase
2023

@@ -110,7 +113,7 @@ - (void)testAddModelExecution {
110113
XCTAssertNotNil(inputs);
111114
MLMultiArray *output = [ETCoreMLTestUtils filledMultiArrayWithShape:inputs[0].shape dataType:inputs[0].dataType repeatedValue:@(0) error:&localError];
112115
NSArray<MLMultiArray *> *args = [inputs arrayByAddingObject:output];
113-
XCTAssertTrue([self.modelManager executeModelWithHandle:handle
116+
XCTAssertTrue([self.modelManager executeModelWithHandle:handle
114117
args:args
115118
loggingOptions:executorchcoreml::ModelLoggingOptions()
116119
eventLogger:nullptr
@@ -148,4 +151,77 @@ - (void)testMulModelExecution {
148151
}
149152
}
150153

154+
// See https://github.com/pytorch/executorch/pull/10465
155+
- (void)testAutoreleasepoolError {
156+
NSURL *modelURL = [self.class bundledResourceWithName:@"add_coreml_all" extension:@"bin"];
157+
NSError *localError = nil;
158+
XCTAssertNotNil(modelURL);
159+
160+
NSData *modelData = [NSData dataWithContentsOfURL:modelURL];
161+
MLModelConfiguration *configuration = [[MLModelConfiguration alloc] init];
162+
configuration.computeUnits = MLComputeUnitsAll;
163+
ModelHandle *modelHandle = [self.modelManager loadModelFromAOTData:modelData
164+
configuration:configuration
165+
error:&localError];
166+
XCTAssert(modelHandle);
167+
168+
ETCoreMLModel *model = [self.modelManager modelWithHandle:modelHandle];
169+
XCTAssert(model);
170+
171+
NSArray<MLMultiArray *> *inputArrays =
172+
[ETCoreMLTestUtils inputsForModel:model repeatedValues:@[@(2), @(3)] error:&localError];
173+
XCTAssert(inputArrays);
174+
175+
std::vector<MultiArray> multiArrays;
176+
multiArrays.reserve(inputArrays.count + model.orderedOutputNames.count);
177+
for (MLMultiArray *array in inputArrays) {
178+
auto dataTypeOpt = to_multiarray_data_type(array.dataType);
179+
XCTAssert(dataTypeOpt.has_value());
180+
auto dataType = dataTypeOpt.value();
181+
182+
std::vector<size_t> dims;
183+
for (NSNumber *n in array.shape) {
184+
dims.push_back(n.unsignedLongValue);
185+
}
186+
187+
std::vector<ssize_t> strides(dims.size());
188+
ssize_t currentStride = 1;
189+
for (NSInteger i = dims.size() - 1; i >= 0; --i) {
190+
strides[i] = currentStride;
191+
currentStride *= dims[i];
192+
}
193+
194+
multiArrays.emplace_back(array.dataPointer,
195+
MultiArray::MemoryLayout(dataType, dims, strides));
196+
}
197+
198+
auto inputLayout = multiArrays[0].layout();
199+
size_t bufferSize = inputLayout.num_bytes();
200+
for (NSUInteger i = 0; i < model.orderedOutputNames.count; ++i) {
201+
multiArrays.emplace_back(calloc(1, bufferSize), inputLayout);
202+
}
203+
// corrupt first input shape to force error
204+
{
205+
auto originalLayout = multiArrays[0].layout();
206+
auto corruptedDims = originalLayout.shape();
207+
corruptedDims[0] += 1;
208+
multiArrays[0] = MultiArray(multiArrays[0].data(),
209+
MultiArray::MemoryLayout(originalLayout.dataType(),
210+
corruptedDims,
211+
originalLayout.strides()));
212+
}
213+
214+
BOOL success = [self.modelManager executeModelWithHandle:modelHandle
215+
argsVec:multiArrays
216+
loggingOptions:ModelLoggingOptions()
217+
eventLogger:nullptr
218+
error:&localError];
219+
XCTAssertFalse(success);
220+
XCTAssertNotNil(localError);
221+
222+
for (size_t i = inputArrays.count; i < multiArrays.size(); ++i) {
223+
free(multiArrays[i].data());
224+
}
225+
}
226+
151227
@end

backends/apple/coreml/scripts/install_requirements.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ SCRIPT_DIR_PATH="$(
1212

1313
# TODO(jathu): remove the need to fetch coremltools to build deps for coreml_executor_runner.
1414
# Keep this version in sync with: pyproject.toml
15-
COREMLTOOLS_VERSION="8.2"
15+
COREMLTOOLS_VERSION="8.3"
1616

1717
red=`tput setaf 1`
1818
green=`tput setaf 2`

backends/apple/mps/setup.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,12 @@ cd executorch
7676
## Run the mv3 generated model using the mps_executor_runner
7777

7878
```bash
79-
./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program
79+
./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_float16_bundled.pte --bundled_program
8080
```
8181

8282
- You should see the following results. Note that no output file will be generated in this example:
8383
```
84-
I 00:00:00.003290 executorch:mps_executor_runner.mm:286] Model file mv3_mps_bundled_fp16.pte is loaded.
84+
I 00:00:00.003290 executorch:mps_executor_runner.mm:286] Model file mv3_mps_float16_bundled.pte is loaded.
8585
I 00:00:00.003306 executorch:mps_executor_runner.mm:292] Program methods: 1
8686
I 00:00:00.003308 executorch:mps_executor_runner.mm:294] Running method forward
8787
I 00:00:00.003311 executorch:mps_executor_runner.mm:349] Setting up non-const buffer 1, size 606112.
@@ -118,7 +118,7 @@ python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_
118118
```
119119
2. Run your Program on the ExecuTorch runtime and generate an [ETDump](../../../docs/source/etdump.md).
120120
```
121-
./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs
121+
./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_float16_bundled.pte --bundled_program --dump-outputs
122122
```
123123
3. Create an instance of the Inspector API by passing in the ETDump you have sourced from the runtime along with the optionally generated ETRecord from step 1.
124124
```bash

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
)
4040
from .fuse_batchnorm2d_pass import FuseBatchnorm2DPass # noqa
4141
from .fuse_constant_ops_pass import ComputeConstantOpsAOT, FuseConstantArgsPass # noqa
42+
from .fuse_equal_placeholders_pass import FuseEqualPlaceholdersPass # noqa
4243
from .fuse_quantized_activation_pass import FuseQuantizedActivationPass # noqa
4344
from .insert_rescales_pass import InsertRescalePass # noqa
4445
from .insert_table_ops import InsertTableOpsPass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
FoldAndAnnotateQParamsPass,
4141
FuseBatchnorm2DPass,
4242
FuseConstantArgsPass,
43+
FuseEqualPlaceholdersPass,
4344
FuseQuantizedActivationPass,
4445
InsertRescalePass,
4546
InsertTableOpsPass,
@@ -58,6 +59,9 @@
5859
)
5960

6061
from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
62+
from executorch.backends.transforms.decompose_sdpa import (
63+
DecomposeScaledDotProductAttention,
64+
)
6165
from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform
6266
from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
6367
from executorch.exir import ExportedProgram
@@ -113,6 +117,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
113117
self.add_pass(FuseConstantArgsPass(exported_program))
114118

115119
self.add_pass(InsertTableOpsPass(exported_program))
120+
self.add_pass(FuseEqualPlaceholdersPass(exported_program))
116121
self.add_pass(AnnotateChannelsLastDimOrder())
117122
self.add_pass(InsertRescalePass())
118123

@@ -164,6 +169,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
164169
self.add_pass(FuseViewCopyTransform())
165170
self.add_pass(FuseConstantArgsPass(exported_program))
166171
self.add_pass(InsertTableOpsPass(exported_program))
172+
self.add_pass(FuseEqualPlaceholdersPass(exported_program))
167173
self.add_pass(AnnotateChannelsLastDimOrder())
168174
self.add_pass(InsertRescalePass())
169175

@@ -191,6 +197,7 @@ def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
191197
)
192198

193199
def transform_for_annotation_pipeline(self, graph_module: GraphModule):
200+
self.add_pass(DecomposeScaledDotProductAttention())
194201
self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
195202
self.add_pass(ScalarsToAttributePass())
196203
self.add_pass(DecomposeLayerNormPass())

backends/arm/_passes/decompose_softmax_pass.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
from executorch.exir.pass_base import ExportPass
99

1010
# For BI case
11-
torch_softmax = (torch.ops.aten.softmax.int, torch.ops.aten.log_softmax.int)
11+
torch_softmax = (
12+
torch.ops.aten.softmax.int,
13+
torch.ops.aten._safe_softmax.default,
14+
torch.ops.aten.log_softmax.int,
15+
)
1216
# For MI case
1317
edge_softmax = (
1418
exir_ops.edge.aten._softmax.default,

0 commit comments

Comments
 (0)