Commit 8c9f7d8

Update on "[ET-VK][10/n] copy node, aten.repeat"

1. Introduce a `CopyNode` for generic copy-with-offset operations.
2. Support `aten.repeat` on all dimensions.
   2.1. Use `CopyNode` where possible.
   2.2. Add a specialized `repeat_channel` shader to handle channel packing.
3. Update codegen to support operations that have only the `Methods` variant; these need a new route to trigger the dispatch.

Differential Revision: [D56499329](https://our.internmc.facebook.com/intern/diff/D56499329/)

[ghstack-poisoned]
2 parents 2130e97 + 21ebcf7 · commit 8c9f7d8
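
For reference, `aten.repeat` tiles its input along every dimension: with input sizes (n, c, h, w) and repeats (n_r, c_r, h_r, w_r), the output has sizes (n * n_r, c * c_r, h * h_r, w * w_r). A minimal PyTorch sketch of these semantics (illustrative only, not part of the commit):

import torch

x = torch.arange(6).reshape(1, 2, 3)  # sizes (1, 2, 3)
y = x.repeat(2, 3, 1)                 # repeat dim 0 twice, dim 1 three times
print(y.shape)                        # torch.Size([2, 6, 3])
assert torch.equal(y[0, 0:2, :], x[0])  # each tile is a copy of the input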

File tree

14 files changed, +171 -144 lines

.github/workflows/android.yml

Lines changed: 2 additions & 13 deletions
@@ -48,23 +48,12 @@ jobs:
       # Build Android demo app
       bash build/test_android_ci.sh
 
-      # Strip libraries for uploda
-      strip cmake-out-android-arm64-v8a/lib/*.a cmake-out-android-arm64-v8a/extension/android/*.so
-      strip cmake-out-android-x86_64/lib/*.a cmake-out-android-x86_64/extension/android/*.so
-
       mkdir -p artifacts-to-be-uploaded
-      mkdir -p artifacts-to-be-uploaded/arm64-v8a/
-      mkdir -p artifacts-to-be-uploaded/x86_64/
-      # Copy the jar to S3
-      cp extension/android/build/libs/executorch.jar artifacts-to-be-uploaded/
       # Copy the app and its test suite to S3
       cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk artifacts-to-be-uploaded/
       cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk artifacts-to-be-uploaded/
-      # Also copy the libraries
-      cp cmake-out-android-arm64-v8a/lib/*.a artifacts-to-be-uploaded/arm64-v8a/
-      cp cmake-out-android-arm64-v8a/extension/android/*.so artifacts-to-be-uploaded/arm64-v8a/
-      cp cmake-out-android-x86_64/lib/*.a artifacts-to-be-uploaded/x86_64/
-      cp cmake-out-android-x86_64/extension/android/*.so artifacts-to-be-uploaded/x86_64/
+      # Also copy the share libraries
+      cp cmake-out-android/lib/*.a artifacts-to-be-uploaded/
 
   # Upload the app and its test suite to S3 so that they can be downloaded by the test job
   upload-artifacts:

backends/vulkan/runtime/api/Tensor.h

Lines changed: 2 additions & 2 deletions
@@ -220,8 +220,8 @@ class vTensor final {
   */
  const api::BufferBindInfo texture_limits_ubo();
 
-  inline const vTensor::TextureLimits texture_limits() const {
-    return texture_limits_;
+  inline const api::utils::ivec3 texture_limits() const {
+    return texture_limits_.limits;
   }
 
  inline size_t numel() const {

backends/vulkan/runtime/graph/ops/glsl/repeat_channel.glsl

Lines changed: 1 addition & 4 deletions
@@ -20,14 +20,12 @@ layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict
 layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 
 layout(set = 0, binding = 2) uniform PRECISION restrict RepeatArgs {
-  // With input of size (n, c_i, h, w) and repeat r
+  // With input_size (n, c_i, h, w) and repeat r
   // out_size == (n, c_i * r, h, w)
   ivec4 out_sizes;
   ivec4 in_sizes;
 };
 
-
-
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 layout(constant_id = 3) const int packed_dim = C_DIM;

@@ -58,4 +56,3 @@ void main() {
 
   imageStore(image_out, out_pos, v);
 }
-
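
The uniform block above documents the shader contract: with input_size (n, c_i, h, w) and repeat r, out_size == (n, c_i * r, h, w). The reason a dedicated shader exists at all is channel packing: four consecutive channels share one texel, and output channel c_out reads input channel c_out % c_i, which is in general misaligned with texel boundaries. A NumPy sketch of the reference semantics (hypothetical helper name, not code from this commit):

import numpy as np

def repeat_channel_ref(x: np.ndarray, r: int) -> np.ndarray:
    # With input_size (n, c_i, h, w) and repeat r, out_size == (n, c_i * r, h, w).
    n, c_i, h, w = x.shape
    out = np.empty((n, c_i * r, h, w), dtype=x.dtype)
    for c_out in range(c_i * r):
        # Each output channel copies input channel c_out % c_i; when c_i is not
        # a multiple of 4, this modulo crosses packed-texel boundaries on the GPU,
        # which is what the shader has to reason about texel by texel.
        out[:, c_out] = x[:, c_out % c_i]
    return out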

backends/vulkan/runtime/graph/ops/glsl/repeat_channel.yaml

Lines changed: 0 additions & 1 deletion
@@ -8,4 +8,3 @@ repeat_channel:
       - VALUE: float
   shader_variants:
     - NAME: repeat_channel
-

backends/vulkan/runtime/graph/ops/impl/Copy.h

Lines changed: 0 additions & 7 deletions
@@ -11,13 +11,6 @@
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
 #include <executorch/backends/vulkan/runtime/api/api.h>
-#include <executorch/backends/vulkan/runtime/graph/Logging.h>
-
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
-
-#include <iostream>
 
 namespace vkcompute {
 

backends/vulkan/runtime/graph/ops/impl/Repeat.cpp

Lines changed: 7 additions & 12 deletions
@@ -8,18 +8,13 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
-#include <executorch/backends/vulkan/runtime/api/api.h>
-#include <executorch/backends/vulkan/runtime/graph/Logging.h>
-
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Copy.h>
 
-#include <iostream>
-
 namespace vkcompute {
 
 namespace {

@@ -137,12 +132,12 @@ void add_repeat_node(
   // After expanding a dimension, we will update the "running_range" since we
   // will need to copy the "expanded" area.
 
-  api::utils::ivec3 running_range = t_in->texture_limits().limits;
+  api::utils::ivec3 running_range = t_in->texture_limits();
 
   const std::vector<int64_t>& in_sizes = t_in->sizes();
 
-  // We use channel packing, repeating the channel dimension is the most
-  // complicated and time-consuming, since we need to reason over misaligned
+  // Since we use channel packing, repeating the channel dimension is the most
+  // complicated and time-consuming, as we need to reason over misaligned
   // channels. Hence we expand it first to minimize cost. Also, in this first
   // dimension, we copy over the input texure to the output. In subsequent
   // dimensions, we read and write from the same tensor.

@@ -159,12 +154,12 @@ void add_repeat_node(
     add_repeat_channel_node(graph, in, channel_repeat, out, running_range);
   }
 
+  // TODO: refactor width, height, and batch into a common helper function.
   // Width
   if (int64_t width_repeat = dim_at<Dim4D::Width>(repeats); width_repeat > 1) {
     api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
-    // api::utils::ivec3 range = t_in->texture_limits().limits;
 
-    for (int i = 1; i < width_repeat; i++) {
+    for (int i = 1; i < width_repeat; ++i) {
       api::utils::ivec3 dst_offset = api::utils::make_ivec3(
           {i * dim_at<Dim4D::Width>(in_sizes), 0, 0}, false);

@@ -180,7 +175,7 @@ void add_repeat_node(
       height_repeat > 1) {
     api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
 
-    for (int i = 1; i < height_repeat; i++) {
+    for (int i = 1; i < height_repeat; ++i) {
       api::utils::ivec3 dst_offset = api::utils::make_ivec3(
           {0, i * dim_at<Dim4D::Height>(in_sizes), 0}, false);

@@ -195,7 +190,7 @@ void add_repeat_node(
   if (int64_t batch_repeat = dim_at<Dim4D::Batch>(repeats); batch_repeat > 1) {
     api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
 
-    for (int i = 1; i < batch_repeat; i++) {
+    for (int i = 1; i < batch_repeat; ++i) {
       api::utils::ivec3 dst_offset =
           api::utils::make_ivec3({0, 0, i * running_range.data[2]}, false);
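
The control flow of `add_repeat_node` above, restated as a NumPy reference (an illustrative sketch following the commit's description, not the actual implementation): expand the packing-sensitive channel dimension first with the specialized shader, then tile width, height, and batch with plain offset copies, growing the running range after each step.

import numpy as np

def repeat_ref(x: np.ndarray, repeats: tuple) -> np.ndarray:
    # x has sizes (n, c, h, w); repeats holds (batch, channel, height, width) factors.
    n_r, c_r, h_r, w_r = repeats
    out = np.tile(x, (1, c_r, 1, 1))    # channel first: specialized repeat_channel shader
    out = np.tile(out, (1, 1, 1, w_r))  # width:  CopyNode copies at dst offset i * in_width
    out = np.tile(out, (1, 1, h_r, 1))  # height: CopyNode copies at dst offset i * in_height
    out = np.tile(out, (n_r, 1, 1, 1))  # batch:  CopyNode copies at dst offset i * running_range.z
    return out

Because tiling the four axes commutes, the order only matters for cost on the GPU; doing channels first keeps the expensive misaligned copies as small as possible.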
201196

backends/vulkan/test/op_tests/utils/codegen.py

Lines changed: 1 addition & 5 deletions
@@ -33,11 +33,7 @@
 
 from torchgen.gen import generate_static_dispatch_backend_call, translate_args
 
-from torchgen.gen_aoti_c_shim import (
-    gen_aoti_c_shim,
-    gen_static_dispatch_backend_call_signature,
-    get_backend_index_for_aoti,
-)
+from torchgen.gen_aoti_c_shim import gen_static_dispatch_backend_call_signature
 from torchgen.model import NativeFunction, Variant
 
 ##################################
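
Context for the codegen change above: `aten::repeat` is a method-variant-only operator (`variants: method` in `native_functions.yaml`), so the generated test harness cannot call a free function such as `at::repeat` and must dispatch through the `Tensor` method instead, which is the new route the commit message mentions. The same distinction is visible in eager PyTorch:

import torch

x = torch.ones(1, 2, 3)
y = x.repeat(2, 3, 1)          # OK: repeat exists as a Tensor method
# torch.repeat(x, (2, 3, 1))   # AttributeError: no function variant is generated
print(y.shape)                 # torch.Size([2, 6, 3])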

build/test_android_ci.sh

Lines changed: 6 additions & 9 deletions
@@ -8,20 +8,18 @@
 set -ex
 
 # https://github.com/pytorch/executorch/tree/main/examples/demo-apps/android/ExecuTorchDemo
-export_model() {
+build_executorch() {
   MODEL_NAME=dl3
   # Delegating DeepLab v3 to XNNPACK backend
   python -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate
 
   ASSETS_DIR=examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/
   mkdir -p "${ASSETS_DIR}"
   cp "${MODEL_NAME}_xnnpack_fp32.pte" "${ASSETS_DIR}"
-}
 
-build_android_native_library() {
-  pushd examples/demo-apps/android/LlamaDemo
-  CMAKE_OUT="cmake-out-android-$1" ANDROID_NDK=/opt/ndk ANDROID_ABI="$1" ./gradlew setup
-  popd
+  rm -rf cmake-out && mkdir cmake-out
+  ANDROID_NDK=/opt/ndk BUCK2=$(which buck2) FLATC=$(which flatc) ANDROID_ABI=arm64-v8a \
+    bash examples/demo-apps/android/ExecuTorchDemo/setup.sh
 }
 
 build_android_demo_app() {

@@ -32,13 +30,12 @@ build_android_demo_app() {
 
 build_android_llama_demo_app() {
   pushd examples/demo-apps/android/LlamaDemo
+  ANDROID_NDK=/opt/ndk ANDROID_ABI=arm64-v8a ./gradlew setup
   ANDROID_HOME=/opt/android/sdk ./gradlew build
   ANDROID_HOME=/opt/android/sdk ./gradlew assembleAndroidTest
   popd
 }
 
-build_android_native_library arm64-v8a
-build_android_native_library x86_64
-export_model
+build_executorch
 build_android_demo_app
 build_android_llama_demo_app
