Commit 74e10a8

Merge branch 'jz/tt-llama-2' into jz/tt-llama-3

2 parents 80aa6d1 + 4a09ff1

34 files changed: +324 -163 lines

backends/apple/coreml/test/test_coreml_quantizer.py
Lines changed: 2 additions & 2 deletions

@@ -15,12 +15,12 @@
 )
 
 from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
-from torch._export import capture_pre_autograd_graph
 from torch.ao.quantization.quantize_pt2e import (
     convert_pt2e,
     prepare_pt2e,
     prepare_qat_pt2e,
 )
+from torch.export import export_for_training
 
 
 class TestCoreMLQuantizer:
@@ -32,7 +32,7 @@ def quantize_and_compare(
     ) -> None:
         assert quantization_type in {"PTQ", "QAT"}
 
-        pre_autograd_aten_dialect = capture_pre_autograd_graph(model, example_inputs)
+        pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()
 
         quantization_config = LinearQuantizerConfig.from_dict(
             {

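For reviewers unfamiliar with the new API: the recurring change in this commit replaces the deprecated `torch._export.capture_pre_autograd_graph` with `torch.export.export_for_training`, which returns an `ExportedProgram`; calling `.module()` on the result recovers the pre-autograd graph module that the old API returned directly. A minimal standalone sketch of the pattern (`TinyModel` is a hypothetical module used only for illustration):

```python
import torch
from torch.export import export_for_training


class TinyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x)


example_inputs = (torch.randn(2, 4),)

# Old: m = torch._export.capture_pre_autograd_graph(TinyModel(), example_inputs)
# New: export_for_training returns an ExportedProgram; .module() yields the
# GraphModule expected by downstream PT2E quantization APIs.
m = export_for_training(TinyModel(), example_inputs).module()
```
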
backends/apple/mps/test/test_mps_utils.py
Lines changed: 2 additions & 2 deletions

@@ -209,9 +209,9 @@ def lower_module_and_test_output(
 
     expected_output = model(*sample_inputs)
 
-    model = torch._export.capture_pre_autograd_graph(
+    model = torch.export.export_for_training(
         model, sample_inputs, dynamic_shapes=dynamic_shapes
-    )
+    ).module()
 
     edge_program = export_to_edge(
         model,

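This hunk also forwards `dynamic_shapes`, which `export_for_training` accepts in the same shape-spec format as the old API. A self-contained sketch of that usage (the `Scale` module and the `batch` dim are illustrative, not taken from the test):

```python
import torch
from torch.export import Dim, export_for_training


class Scale(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2.0


sample_inputs = (torch.randn(4, 3),)
# Mark dim 0 of `x` as dynamic; the spec carries over unchanged from the old
# capture_pre_autograd_graph call site.
batch = Dim("batch", min=1, max=8)
model = export_for_training(
    Scale(), sample_inputs, dynamic_shapes={"x": {0: batch}}
).module()
```
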
backends/mediatek/quantizer/annotator.py
Lines changed: 3 additions & 3 deletions

@@ -7,8 +7,6 @@
 from typing import Callable, List
 
 import torch
-
-from torch._export import capture_pre_autograd_graph
 from torch._ops import OpOverload
 from torch._subclasses import FakeTensor
 
@@ -17,6 +15,8 @@
     _annotate_input_qspec_map,
     _annotate_output_qspec,
 )
+
+from torch.export import export_for_training
 from torch.fx import Graph, Node
 from torch.fx.passes.utils.matcher_with_name_node_map_utils import (
     SubgraphMatcherWithNameNodeMap,
@@ -159,7 +159,7 @@ def forward(self, x):
             return norm, {}
 
     for pattern_cls in (ExecuTorchPattern, MTKPattern):
-        pattern_gm = capture_pre_autograd_graph(pattern_cls(), (torch.randn(3, 3),))
+        pattern_gm = export_for_training(pattern_cls(), (torch.randn(3, 3),)).module()
         matcher = SubgraphMatcherWithNameNodeMap(
             pattern_gm, ignore_literals=True, remove_overlapping_matches=False
        )

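Context for this change: the annotator exports small pattern modules whose `forward` returns `(output, name_node_map)` and matches them against the target graph. A self-contained sketch of that idiom under the new API (`ReluPattern` and `Target` are hypothetical stand-ins for `ExecuTorchPattern`/`MTKPattern`):

```python
import torch
from torch.export import export_for_training
from torch.fx.passes.utils.matcher_with_name_node_map_utils import (
    SubgraphMatcherWithNameNodeMap,
)


class ReluPattern(torch.nn.Module):
    def forward(self, x: torch.Tensor):
        out = torch.relu(x)
        # The matcher expects (output, name -> node map) from the pattern.
        return out, {"relu_out": out}


class Target(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x + 1)


pattern_gm = export_for_training(ReluPattern(), (torch.randn(3, 3),)).module()
matcher = SubgraphMatcherWithNameNodeMap(
    pattern_gm, ignore_literals=True, remove_overlapping_matches=False
)
target_gm = export_for_training(Target(), (torch.randn(3, 3),)).module()
# Each returned match carries a name_node_map keyed by the names returned above.
matches = matcher.match(target_gm.graph)
```
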
backends/transforms/test/test_duplicate_dynamic_quant_chain.py
Lines changed: 2 additions & 3 deletions

@@ -8,7 +8,6 @@
 import unittest
 
 import torch
-import torch._export as export
 from executorch.backends.transforms.duplicate_dynamic_quant_chain import (
     DuplicateDynamicQuantChainPass,
 )
@@ -59,10 +58,10 @@ def _test_duplicate_chain(
 
         # program capture
        m = copy.deepcopy(m_eager)
-        m = export.capture_pre_autograd_graph(
+        m = torch.export.export_for_training(
            m,
            example_inputs,
-        )
+        ).module()
 
        m = prepare_pt2e(m, quantizer)
        # Calibrate

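The surrounding test exercises the standard PT2E quantization flow, with only the capture step changing. For orientation, a condensed sketch of that flow using the new API (the toy module and XNNPACK quantizer config are illustrative, not the test's exact setup):

```python
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)
from torch.export import export_for_training


class TwoLayer(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc1 = torch.nn.Linear(8, 8)
        self.fc2 = torch.nn.Linear(8, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(torch.relu(self.fc1(x)))


example_inputs = (torch.randn(1, 8),)

# program capture (the step this diff migrates)
m = export_for_training(TwoLayer(), example_inputs).module()

quantizer = XNNPACKQuantizer().set_global(
    get_symmetric_quantization_config(is_dynamic=True)
)
m = prepare_pt2e(m, quantizer)
m(*example_inputs)  # calibrate with representative inputs
m = convert_pt2e(m)
```
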
backends/vulkan/README.md
Lines changed: 39 additions & 26 deletions

@@ -78,36 +78,47 @@ currently in development:
 ## End to End Example
 
 To further understand the features of the Vulkan Delegate and how to use it,
-consider the following end to end example with MobileNet V2.
+consider the following end to end example with a simple single operator model.
 
 ### Compile and lower a model to the Vulkan Delegate
 
 Assuming ExecuTorch has been set up and installed, the following script can be
 used to produce a lowered MobileNet V2 model as `vulkan_mobilenetv2.pte`.
 
+Once ExecuTorch has been set up and installed, the following script can be used
+to generate a simple model and lower it to the Vulkan delegate.
+
 ```
+# Note: this script is the same as the script from the "Setting up ExecuTorch"
+# page, with one minor addition to lower to the Vulkan backend.
 import torch
-import torchvision.models as models
+from torch.export import export
+from executorch.exir import to_edge
 
-from torch.export import export, ExportedProgram
-from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
 from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
-from executorch.exir import EdgeProgramManager, ExecutorchProgramManager, to_edge
-from executorch.exir.backend.backend_api import to_backend
 
-mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval()
-sample_inputs = (torch.randn(1, 3, 224, 224), )
+# Start with a PyTorch model that adds two input tensors (matrices)
+class Add(torch.nn.Module):
+    def __init__(self):
+        super(Add, self).__init__()
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor):
+        return x + y
 
-exported_program: ExportedProgram = export(mobilenet_v2, sample_inputs)
-edge: EdgeProgramManager = to_edge(exported_program)
+# 1. torch.export: Defines the program with the ATen operator set.
+aten_dialect = export(Add(), (torch.ones(1), torch.ones(1)))
 
-# Lower the model to Vulkan backend
-edge = edge.to_backend(VulkanPartitioner())
+# 2. to_edge: Make optimizations for Edge devices
+edge_program = to_edge(aten_dialect)
+# 2.1 Lower to the Vulkan backend
+edge_program = edge_program.to_backend(VulkanPartitioner())
 
-exec_prog = edge.to_executorch()
+# 3. to_executorch: Convert the graph to an ExecuTorch program
+executorch_program = edge_program.to_executorch()
 
-with open("vulkan_mobilenetv2.pte", "wb") as file:
-    exec_prog.write_to_file(file)
+# 4. Save the compiled .pte program
+with open("vk_add.pte", "wb") as file:
+    file.write(executorch_program.buffer)
 ```
 
@@ -122,29 +133,31 @@ will be executed on the GPU.
 
 ::::{note}
-The [Vulkan partitioner code](https://github.com/pytorch/executorch/blob/main/backends/vulkan/partitioner/vulkan_partitioner.py)
-can be inspected to examine which ops are currently implemented in the Vulkan
-delegate.
+The [supported ops list](https://github.com/pytorch/executorch/blob/main/backends/vulkan/partitioner/supported_ops.py)
+in the Vulkan partitioner code can be inspected to examine which ops are
+currently implemented in the Vulkan delegate.
 ::::
 
 ### Build Vulkan Delegate libraries
 
 The easiest way to build and test the Vulkan Delegate is to build for Android
 and test on a local Android device. Android devices have built in support for
-Vulkan, and the Android NDK ships with a GLSL compiler, which is needed to
+Vulkan, and the Android NDK ships with a GLSL compiler which is needed to
 compile the Vulkan Compute Library's GLSL compute shaders.
 
 The Vulkan Delegate libraries can be built by setting `-DEXECUTORCH_BUILD_VULKAN=ON`
 when building with CMake.
 
-First, make sure that you have the Android NDK installed - Android NDK 26.3.11579264 is
-recommended. The Android SDK should also be installed so that you have access
-to `adb`.
+First, make sure that you have the Android NDK installed; any NDK version past
+NDK r19c should work. Note that the examples in this doc have been validated with
+NDK r25. The Android SDK should also be installed so that you have access to `adb`.
+
+The instructions in this page assume that the following environment variables
+are set.
 
 ```shell
-# Recommended version is Android NDK 26.3.11579264.
 export ANDROID_NDK=<path_to_ndk>
-# Select an appropriate Android ABI
+# Select the appropriate Android ABI for your device
 export ANDROID_ABI=arm64-v8a
 # All subsequent commands should be performed from ExecuTorch repo root
 cd <path_to_executorch_root>
@@ -183,10 +196,10 @@ GPU!
 cmake --build cmake-android-out --target vulkan_executor_runner -j32
 
 # Push model to device
-adb push vulkan_mobilenetv2.pte /data/local/tmp/vulkan_mobilenetv2.pte
+adb push vk_add.pte /data/local/tmp/vk_add.pte
 # Push binary to device
 adb push cmake-android-out/backends/vulkan/vulkan_executor_runner /data/local/tmp/runner_bin
 
 # Run the model
-adb shell /data/local/tmp/runner_bin --model_path /data/local/tmp/vulkan_mobilenetv2.pte
+adb shell /data/local/tmp/runner_bin --model_path /data/local/tmp/vk_add.pte
 ```

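One way to sanity-check the lowering in the README's example: delegated subgraphs appear as `executorch_call_delegate` calls in the edge program's graph, so printing the graph module shows what the Vulkan partitioner claimed. A self-contained sketch reusing the `Add` model from the script above:

```python
import torch
from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
from executorch.exir import to_edge
from torch.export import export


class Add(torch.nn.Module):
    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return x + y


edge_program = to_edge(export(Add(), (torch.ones(1), torch.ones(1))))
edge_program = edge_program.to_backend(VulkanPartitioner())

# Delegated subgraphs appear as executorch_call_delegate calls; any ops the
# partitioner skipped remain as edge ops and run on the portable CPU kernels.
print(edge_program.exported_program().graph_module.code)
```
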
backends/vulkan/docs/android_demo.md
Lines changed: 40 additions & 63 deletions

@@ -7,13 +7,13 @@ is a native GPU delegate for ExecuTorch.
 ::::{grid} 2
 :::{grid-item-card} What you will learn in this tutorial:
 :class-card: card-content
-* How to export the Stories 110M parameter model with partial GPU delegation
+* How to export the Llama3.2-1B parameter model with partial GPU delegation
 * How to execute the partially delegated model on Android
 :::
 :::{grid-item-card} Prerequisites:
 :class-card: card-prerequisites
 * Follow [**Setting up ExecuTorch**](./getting-started-setup.md)
-* Follow [**Setting up the ExecuTorch LLaMA Android Demo App**](./llm/llama-demo-android.md)
+* It is also recommended that you read through [**ExecuTorch Vulkan Delegate**](./native-delegates-executorch-vulkan-delegate.md) and follow the example in that page
 :::
 ::::
 
@@ -23,65 +23,55 @@ Note that all the steps below should be performed from the ExecuTorch repository
 root directory, and assumes that you have gone through the steps of setting up
 ExecuTorch.
 
-You should also refer to the **Prerequisites** section of the [**Setting up the ExecuTorch LLaMA Android Demo App**](./llm/llama-demo-android.md)
-Tutorial in order to install the specified versions of the Android NDK and the
-Android SDK.
+It is also assumed that the Android NDK and Android SDK are installed, and the
+following environment variables are set.
 
 ```shell
-# Recommended version is Android NDK 26.3.11579264.
 export ANDROID_NDK=<path_to_ndk>
-# Select an appropriate Android ABI
+# Select an appropriate Android ABI for your device
 export ANDROID_ABI=arm64-v8a
 # All subsequent commands should be performed from ExecuTorch repo root
 cd <path_to_executorch_root>
 # Make sure adb works
 adb --version
 ```
 
-## Lowering the Stories 110M model to Vulkan
+## Lowering the Llama3.2-1B model to Vulkan
 
 ::::{note}
 The resultant model will only be partially delegated to the Vulkan backend. In
 particular, only binary arithmetic operators (`aten.add`, `aten.sub`,
-`aten.mul`, `aten.div`) and the matrix multiplication operator (`aten.mm`) will
-be executed on the GPU via the Vulkan delegate. The rest of the model will be
-executed using Portable operators. This is because the Vulkan delegate is still
-early in development and currently has limited operator coverage.
-::::
-
-First, download `stories110M.pt` and `tokenizer.model` from Github:
-
-```shell
-wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
-wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
-```
-
-Next, create the params file:
+`aten.mul`, `aten.div`), matrix multiplication operators (`aten.mm`, `aten.bmm`),
+and linear layers (`aten.linear`) will be executed on the GPU via the Vulkan
+delegate. The rest of the model will be executed using Portable operators.
 
-```shell
-echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
-```
-
-Then, create a tokenizer binary file:
+Operator support for LLaMA models is currently in active development; please
+check out the `main` branch of the ExecuTorch repo for the latest capabilities.
+::::
 
-```shell
-python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
-```
+First, obtain the `consolidated.00.pth`, `params.json` and `tokenizer.model`
+files for the `Llama3.2-1B` model from the [Llama website](https://www.llama.com/llama-downloads/).
 
-Finally, export the `stories110M.pt` file into an ExecuTorch program:
+Once the files have been downloaded, the `export_llama` script can be used to
+partially lower the Llama model to Vulkan.
 
 ```shell
-python -m examples.models.llama2.export_llama -c stories110M.pt -p params.json --vulkan
+# The files will usually be downloaded to ~/.llama
+python -m examples.models.llama2.export_llama \
+  --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \
+  -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \
+  -p ~/.llama/checkpoints/Llama3.2-1B/params.json \
+  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
 ```
 
-A `vulkan_llama2.pte` file should have been created as a result of the last step.
+A `vulkan_llama2.pte` file should have been created as a result of running the
+script.
 
 Push the tokenizer binary and `vulkan_llama2.pte` onto your Android device:
 
 ```shell
-adb mkdir /data/local/tmp/llama/
-adb push tokenizer.bin /data/local/tmp/llama/
-adb push vulkan_llama2.pte /data/local/tmp/llama/
+adb push ~/.llama/tokenizer.model /data/local/tmp/
+adb push vulkan_llama2.pte /data/local/tmp/
 ```
 
 ## Build and Run the LLaMA runner binary on Android
@@ -98,7 +88,8 @@ binary using the Android NDK toolchain.
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_VULKAN=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
    -DPYTHON_EXECUTABLE=python \
    -Bcmake-android-out && \
  cmake --build cmake-android-out -j16 --target install)
@@ -108,42 +99,28 @@ binary using the Android NDK toolchain.
 cmake examples/models/llama2 \
    -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
    -DANDROID_ABI=$ANDROID_ABI \
+    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
    -DCMAKE_INSTALL_PREFIX=cmake-android-out \
    -DPYTHON_EXECUTABLE=python \
    -Bcmake-android-out/examples/models/llama2 && \
 cmake --build cmake-android-out/examples/models/llama2 -j16)
 ```
 
-Finally, push and run the llama runner binary on your Android device.
+Finally, push and run the llama runner binary on your Android device. Note that
+your device must have sufficient GPU memory to execute the model.
 
 ```shell
 adb push cmake-android-out/examples/models/llama2/llama_main /data/local/tmp/llama_main
 
 adb shell /data/local/tmp/llama_main \
-    --model_path=/data/local/tmp/llama/vulkan_llama2.pte \
-    --tokenizer_path=/data/local/tmp/llama/tokenizer.bin \
-    --prompt "hi" \--temperature=0
-```
-
-The following output will be produced:
-
-```
-hippo named Hippy lived in a big pond. Hippy was a very happy hippo. He liked to play...
-```
-
-## Running with the LLaMA Android Demo App
-
-It is also possible to run the partially delegated Vulkan model inside the LLaMA
-Android demo app.
-
-First, make some modifications to the Android app setup script to make sure that
-the Vulkan backend is built when building and installing ExecuTorch libraries:
-
-```shell
-# Run from executorch root directory. You can also edit this in a code editor
-sed -i 's/-DEXECUTORCH_BUILD_XNNPACK=ON/-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_VULKAN=ON/g' examples/demo-apps/android/LlamaDemo/setup.sh
+    --model_path=/data/local/tmp/vulkan_llama2.pte \
+    --tokenizer_path=/data/local/tmp/tokenizer.model \
+    --prompt "Hello"
 ```
 
-Then, Follow the instructions at [**Setting up the ExecuTorch LLaMA Android Demo App**](./llm/llama-demo-android.md)
-to build and run the demo application on your Android device. Once the app
-starts up, you can load and run the `vulkan_llama2.pte` model with the app.
+Note that currently model inference will be very slow due to the high amount of
+delegate blobs in the lowered graph, which requires a transfer to and from the
+GPU for each subgraph. Performance is expected to improve drastically as more
+of the model can be lowered to the Vulkan delegate, and as techniques such as
+quantization are supported.
