
Commit cbd4958
Update on "[ExecuTorch] Simplify function pointers for apply_ternary_elementwise_fn"
Cleaning up some of the required boilerplate. I updated op_clamp and op_where, but left op_where unoptimized for size/build time; ideal usage that optimizes for size/build time looks like op_clamp.

Differential Revision: [D63790004](https://our.internmc.facebook.com/intern/diff/D63790004/)

[ghstack-poisoned]
2 parents: b8380d5 + 85ceab0
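The headline change is the usual function-pointer-to-functor cleanup: instead of threading several raw function pointers through the elementwise helper, callers pass a single templated lambda. A rough sketch of the pattern only — simplified, hypothetical signatures, not the actual ExecuTorch internals:

#include <cstddef>

// Sketch: the helper takes one templated functor instead of separate
// raw function pointers (hypothetical signature, for illustration only).
template <typename CTYPE, typename Op>
void apply_ternary_elementwise_fn(
    const Op& compute_fun,
    const CTYPE* a,
    const CTYPE* b,
    const CTYPE* c,
    CTYPE* out,
    size_t numel) {
  for (size_t i = 0; i < numel; ++i) {
    out[i] = compute_fun(a[i], b[i], c[i]);
  }
}

// A clamp-style caller then supplies a single lambda:
//   apply_ternary_elementwise_fn<float>(
//       [](float v, float lo, float hi) {
//         return v < lo ? lo : (v > hi ? hi : v);
//       },
//       in, lo_buf, hi_buf, out, numel);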

File tree: 74 files changed (+2296 / -1617 lines)


.ci/docker/ci_commit_pins/pytorch.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-aec9b2ab77389967ef39bb9c10662fd0fe3e185a
+5ba404f68775bb06a1125a100687f86b6d6de6a8

.github/workflows/android-perf.yml

Lines changed: 3 additions & 7 deletions
@@ -176,8 +176,8 @@ jobs:
           fi
           echo "::endgroup::"
 
-  build-llm-demo:
-    name: build-llm-demo
+  build-benchmark-app:
+    name: build-benchmark-app
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     needs: set-parameters
     with:
@@ -211,7 +211,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
     needs:
       - set-parameters
-      - build-llm-demo
+      - build-benchmark-app
      - export-models
     strategy:
       matrix:
@@ -228,10 +228,6 @@
       # This is the ARN of ExecuTorch project on AWS
       project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6
       device-pool-arn: ${{ matrix.device }}
-      # Uploaded to S3 from the previous job, the name of the app comes from the project itself.
-      # Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer.
-      # It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only
-      # one app+flavor that could load and run the model.
       android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk
       android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk
       # NB: Need to set the default spec here so that it works for periodic too

backends/apple/mps/setup.md

Lines changed: 2 additions & 2 deletions
@@ -111,12 +111,12 @@ python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp
 ```
 
 ### Profiling:
-1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) while you're exporting your model.
+1. [Optional] Generate an [ETRecord](./etrecord.rst) while you're exporting your model.
 ```bash
 cd executorch
 python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_etrecord -b
 ```
-2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./sdk-etdump.md).
+2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./etdump.md).
 ```
 ./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs
 ```

backends/arm/test/common.py

Lines changed: 8 additions & 1 deletion
@@ -6,6 +6,7 @@
 
 import logging
 import os
+import platform
 import shutil
 import subprocess
 import sys
@@ -57,11 +58,17 @@ def pytest_collection_modifyitems(config, items):
 
 
 def load_libquantized_ops_aot_lib():
+    so_ext = {
+        "Darwin": "dylib",
+        "Linux": "so",
+        "Windows": "dll",
+    }.get(platform.system(), None)
+
     find_lib_cmd = [
         "find",
         "cmake-out-aot-lib",
         "-name",
-        "libquantized_ops_aot_lib.so",
+        f"libquantized_ops_aot_lib.{so_ext}",
     ]
     res = subprocess.run(find_lib_cmd, capture_output=True)
     if res.returncode == 0:

backends/arm/test/ops/test_conv_combos.py

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,8 @@
 
 from typing import Tuple
 
+import pytest
+
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
@@ -311,6 +313,8 @@ def test_block_bottleneck_residual_tosa_MI(self):
         model = ComboBlockBottleneckResidual()
         self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs())
 
+    # TODO: Investigate flakyness (MLTORCH-307)
+    @pytest.mark.flaky(reruns=3)
     def test_block_bottleneck_residual_tosa_BI(self):
         model = ComboBlockBottleneckResidual()
         self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs())

backends/arm/test/ops/test_split.py

Lines changed: 4 additions & 1 deletion
@@ -124,8 +124,11 @@ def test_split_with_sizes_tosa_MI(self, test_data: test_data_t):
         self._test_split_tosa_MI_pipeline(self.SplitWithSizes(), test_data)
 
     @parameterized.expand(Split.test_data)
-    def test_split_n_out_tosa_MI(self, test_data: test_data_t):
+    def test_split_one_out_tosa_MI(self, test_data: test_data_t):
         self._test_split_tosa_MI_pipeline(self.SplitSingleOut(), test_data)
+
+    @parameterized.expand(Split.test_data)
+    def test_split_two_out_tosa_MI(self, test_data: test_data_t):
         self._test_split_tosa_MI_pipeline(self.SplitTwoOut(), test_data)
 
     @parameterized.expand(Split.test_data)

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 10 additions & 0 deletions
@@ -300,6 +300,11 @@ ValueRef ComputeGraph::add_tensor_view(const ValueRef vref) {
   const vTensorPtr t = get_tensor(vref);
   ValueRef idx(static_cast<int>(values_.size()));
   values_.emplace_back(api::vTensor(*t));
+  for (SharedObject& sobj : shared_objects_) {
+    if (sobj.has_user(vref)) {
+      sobj.add_user(this, idx);
+    }
+  }
   return idx;
 }
 
@@ -311,6 +316,11 @@ ValueRef ComputeGraph::add_tensor_view(
   const vTensorPtr t = get_tensor(vref);
   ValueRef idx(static_cast<int>(values_.size()));
   values_.emplace_back(api::vTensor(*t, sizes, strides, offset_numel));
+  for (SharedObject& sobj : shared_objects_) {
+    if (sobj.has_user(vref)) {
+      sobj.add_user(this, idx);
+    }
+  }
   return idx;
 }
 

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 13 additions & 0 deletions
@@ -378,6 +378,19 @@ class ComputeGraph final {
     return values_.at(idx).toString();
   }
 
+  template <
+      typename T,
+      typename std::enable_if<
+          std::is_integral<T>::value && std::is_signed<T>::value,
+          int>::type = 0>
+  T extract_whcn_dim(const ValueRef idx, const int64_t ndim) {
+    T dim = extract_scalar<T>(idx);
+    // Normalize dim to account for negative indexing
+    dim = (dim % ndim + ndim) % ndim;
+    // Assume original value is NCHW ordering, obtain the WHCN ordering
+    return ndim - 1 - dim;
+  }
+
   //
   // Utility functions
   //
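The normalization in extract_whcn_dim is plain modular arithmetic followed by an index flip. A minimal standalone sketch of the same math, detached from ValueRef/ComputeGraph and shown only for illustration:

#include <cassert>
#include <cstdint>

// Same arithmetic as extract_whcn_dim above, on plain integers.
int64_t whcn_dim(int64_t dim, int64_t ndim) {
  dim = (dim % ndim + ndim) % ndim;  // fold negative dims into [0, ndim)
  return ndim - 1 - dim;             // NCHW index -> WHCN index
}

int main() {
  // 4-D NCHW tensor: N=0, C=1, H=2, W=3.
  assert(whcn_dim(0, 4) == 3);   // N becomes the outermost WHCN dim
  assert(whcn_dim(3, 4) == 0);   // W becomes the innermost WHCN dim
  assert(whcn_dim(-1, 4) == 0);  // -1 normalizes to 3, then flips to 0
  return 0;
}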

backends/vulkan/runtime/graph/containers/SharedObject.cpp

Lines changed: 4 additions & 0 deletions
@@ -12,6 +12,10 @@
 
 namespace vkcompute {
 
+bool SharedObject::has_user(const ValueRef idx) const {
+  return std::find(users.begin(), users.end(), idx) != users.end();
+}
+
 void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) {
   vTensorPtr t = graph->get_tensor(idx);
 

backends/vulkan/runtime/graph/containers/SharedObject.h

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ struct SharedObject {
   std::vector<ValueRef> users;
   vkapi::Allocation allocation;
 
+  bool has_user(const ValueRef idx) const;
   void add_user(ComputeGraph* const graph, const ValueRef idx);
   void allocate(ComputeGraph* const graph);
   void bind_users(ComputeGraph* const graph);
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
+
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)}
+${layout_declare_ubo(B, "ivec3", "tin_limits")}
+${layout_declare_ubo(B, "ivec4", "tin_axis_map")}
+${layout_declare_ubo(B, "ivec4", "tout_axis_map")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int nrepeats = 1;
+layout(constant_id = 4) const int repeat_dim = 1;
+
+#include "indexing_utils.h"
+
+void main() {
+  const ivec3 tin_lpos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(tin_lpos, tin_limits))) {
+    return;
+  }
+
+  const VEC4_T intex = load_texel_lpos(tin, tin_lpos, tin_axis_map);
+
+  ivec3 tout_lpos = tin_lpos;
+  tout_lpos[repeat_dim] *= nrepeats;
+
+  for (int i = 0; i < nrepeats; ++i, tout_lpos[repeat_dim]++) {
+    write_texel_lpos(tout, tout_lpos, intex, tout_axis_map);
+  }
+}
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+repeat_interleave:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: texture3d
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: repeat_interleave
Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+void resize_repeat_interleave_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  (void)extra_args;
+  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
+  vTensorPtr in = graph->get_tensor(args[1].refs[0]);
+
+  const int64_t nrepeats = graph->extract_scalar<int64_t>(extra_args[0]);
+  int64_t repeat_dim = graph->extract_scalar<int64_t>(extra_args[1]);
+
+  std::vector<int64_t> new_sizes = in->sizes();
+  repeat_dim = normalize(repeat_dim, new_sizes.size());
+  new_sizes.at(repeat_dim) *= nrepeats;
+
+  out->virtual_resize(new_sizes);
+}
+
+void add_repeat_interleave_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef num_repeats,
+    const ValueRef dim,
+    const ValueRef out) {
+  const int32_t nrepeats = graph.extract_scalar<int32_t>(num_repeats);
+  const int32_t repeat_dim =
+      graph.extract_whcn_dim<int32_t>(dim, graph.dim_of(in));
+
+  VK_CHECK_COND(repeat_dim != graph.packed_dim_of(out));
+  VK_CHECK_COND(repeat_dim != graph.packed_dim_of(in));
+
+  std::string kernel_name = "repeat_interleave";
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  const utils::uvec3 global_wg_size = graph.logical_limits_of(in);
+  const utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size);
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      // Shader
+      VK_KERNEL_FROM_STR(kernel_name),
+      // Workgroup sizes
+      global_wg_size,
+      local_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::MemoryAccessType::WRITE},
+       {in, vkapi::MemoryAccessType::READ}},
+      // Parameter buffers
+      {graph.logical_limits_ubo(in),
+       graph.axis_map_ubo(in),
+       graph.axis_map_ubo(out)},
+      // Specialization Constants
+      {nrepeats, repeat_dim},
+      // Resizing Logic
+      resize_repeat_interleave_node,
+      {num_repeats, dim}));
+}
+
+void repeat_interleave(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  int args_i = 0;
+  const ValueRef in = args[args_i++];
+  const ValueRef num_repeats = args[args_i++];
+  const ValueRef dim = args[args_i++];
+  const ValueRef output_size = args[args_i++];
+  const ValueRef out = args[args_i++];
+
+  // Output size is not used in the kernel
+  (void)output_size;
+
+  add_repeat_interleave_node(graph, in, num_repeats, dim, out);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten.repeat_interleave.self_int, repeat_interleave);
+}
+
+} // namespace vkcompute
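The shader and the op above write each input texel nrepeats times along repeat_dim. For reference, the same semantics on a flat array, as a minimal sketch (plain C++, one contiguous dimension assumed; not the Vulkan implementation):

#include <vector>

// Reference semantics of repeat_interleave along one contiguous dimension.
std::vector<float> repeat_interleave_inner(
    const std::vector<float>& in, int nrepeats) {
  std::vector<float> out;
  out.reserve(in.size() * nrepeats);
  for (float v : in) {
    for (int r = 0; r < nrepeats; ++r) {
      out.push_back(v);  // each element appears nrepeats times, in order
    }
  }
  return out;
}

// Example: {1, 2, 3} with nrepeats = 2 -> {1, 1, 2, 2, 3, 3}.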
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/api/api.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+namespace vkcompute {
+
+void add_repeat_interleave_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef num_repeats,
+    const ValueRef dim,
+    const ValueRef out);
+
+} // namespace vkcompute

backends/vulkan/runtime/vk_api/memory/Buffer.h

Lines changed: 3 additions & 1 deletion
@@ -161,7 +161,9 @@ class VulkanBuffer final {
 
   inline void bind_allocation(const Allocation& memory) {
     VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!");
-    VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_));
+    if (!is_copy_) {
+      VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_));
+    }
     memory_.allocation = memory.allocation;
   }
 
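The is_copy_ guard appears to keep a copied buffer from re-binding device memory that its source buffer already bound; the copy only records the allocation handle. A minimal standalone sketch of that ownership split (hypothetical names, not the Vulkan Memory Allocator API):

#include <cassert>

struct Allocation {
  int handle = 0;
};

class Buffer {
 public:
  explicit Buffer(bool is_copy) : is_copy_(is_copy) {}

  void bind_allocation(const Allocation& memory) {
    assert(memory_.handle == 0 && "Cannot bind an already bound allocation!");
    if (!is_copy_) {
      // Only the owning buffer binds device memory; a copy aliases
      // the owner's existing binding.
      bind_device_memory(memory);
    }
    memory_ = memory;  // owner and copy both record the allocation
  }

 private:
  static void bind_device_memory(const Allocation&) {
    // stands in for vmaBindBufferMemory(...) in the real code
  }

  bool is_copy_;
  Allocation memory_;
};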