Skip to content

Commit 789215c

Browse files
committed
[ET-VK][11/n] copy_channel_offsets node
1. Add a node `copy_channel_offsets` specifically for copying along the channel dimension, it needs extra attention at the boundaries due to channel packing. 1.1. `copy_channel_offsets` will be useful for `aten.cat` and `aten.split`. 2. Create `etvk.*` operators to facilitate testing. Add test case for both `copy_offset` and `copy_channel_offset`. Differential Revision: [D56554426](https://our.internmc.facebook.com/intern/diff/D56554426/) ghstack-source-id: 223879986 Pull Request resolved: #3351
1 parent 3341c10 commit 789215c

File tree

6 files changed

+555
-12
lines changed

6 files changed

+555
-12
lines changed
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
#define PRECISION ${PRECISION}
12+
13+
#define VEC4_T ${texel_type(DTYPE)}
14+
15+
layout(std430) buffer;
16+
17+
#include "indexing_utils.h"
18+
19+
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
20+
layout(set = 0, binding = 1) uniform PRECISION sampler3D existing_out;
21+
layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in;
22+
23+
layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
24+
ivec3 out_limits;
25+
};
26+
27+
layout(set = 0, binding = 4) uniform PRECISION restrict InLimits {
28+
ivec3 in_limits;
29+
};
30+
31+
layout(set = 0, binding = 5) uniform PRECISION restrict CopyArgs {
32+
ivec4 out_sizes;
33+
ivec4 in_sizes;
34+
// Analogous to the range variable in copy. It defines the # of channels being
35+
// copied.
36+
int channel_range;
37+
int src_channel_offset;
38+
int dst_channel_offset;
39+
int unused;
40+
// Operates on (x, y, z) extents.
41+
ivec3 range;
42+
int unused1;
43+
ivec3 dst_offset;
44+
int unused2;
45+
};
46+
47+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
48+
49+
layout(constant_id = 3) const int packed_dim = C_DIM;
50+
51+
void main() {
  // Each invocation handles one output texel; pos lies in [0, range) on every
  // axis, anything outside is discarded.
  const ivec3 pos = ivec3(gl_GlobalInvocationID);
  if (any(greaterThanEqual(pos, range))) {
    return;
  }

  // Shift into the destination region of the output texture.
  const ivec3 out_pos = pos + dst_offset;
  const ivec4 out_whcn = to_tensor_idx(out_pos, out_sizes, packed_dim);

  // Start from the texel currently stored at the destination so that lanes
  // outside the copied channel range keep their existing values.
  VEC4_T texel = VEC4_T(texelFetch(existing_out, out_pos, 0));

  for (int lane = 0; lane < 4; lane++) {
    // Channel index of this lane relative to the start of the copied range.
    const int rel_channel = out_whcn.z - dst_channel_offset + lane;

    // Partial updates at either end of the range: a negative index lies
    // before the beginning of the copied channels, an index >= channel_range
    // lies past the end. In both cases the existing lane value is kept.
    if (rel_channel >= 0 && rel_channel < channel_range) {
      // Readjust for the source offset to locate the channel in the input.
      ivec4 in_whcn = out_whcn;
      in_whcn.z = rel_channel + src_channel_offset;

      const ivec4 in_elem_pos =
          to_texture_elem_pos(in_whcn, in_sizes, packed_dim);
      texel[lane] =
          VEC4_T(texelFetch(image_in, in_elem_pos.xyz, 0))[in_elem_pos.w];
    }
  }

  imageStore(image_out, out_pos, texel);
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
copy_channel_offset:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
generate_variant_forall:
6+
DTYPE:
7+
- VALUE: half
8+
- VALUE: float
9+
shader_variants:
10+
- NAME: copy_channel_offset

backends/vulkan/runtime/graph/ops/impl/Copy.cpp

Lines changed: 184 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,38 +8,42 @@
88

99
#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
1010

11+
#include <executorch/backends/vulkan/runtime/api/api.h>
12+
#include <executorch/backends/vulkan/runtime/graph/Logging.h>
13+
14+
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
1115
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
1216
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
1317
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
1418

1519
namespace vkcompute {
1620

21+
using api::utils::ivec3;
22+
using api::utils::uvec3;
23+
1724
void add_copy_offset_node(
1825
ComputeGraph& graph,
1926
const ValueRef in,
20-
const api::utils::ivec3& range,
21-
const api::utils::ivec3& src_offset,
22-
const api::utils::ivec3& dst_offset,
27+
const ivec3& range,
28+
const ivec3& src_offset,
29+
const ivec3& dst_offset,
2330
const ValueRef out) {
2431
vTensorPtr t_in = graph.get_tensor(in);
2532
vTensorPtr t_out = graph.get_tensor(out);
2633

27-
VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
28-
VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));
29-
3034
std::string kernel_name = "copy_offset";
3135
kernel_name.reserve(kShaderNameReserve);
3236
add_dtype_suffix(kernel_name, *t_out);
3337

34-
api::utils::uvec3 global_size = api::utils::make_uvec3(range);
35-
api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
38+
uvec3 global_size = api::utils::make_uvec3(range);
39+
uvec3 local_size = adaptive_work_group_size(global_size);
3640

3741
const struct Block final {
38-
api::utils::ivec3 range;
42+
ivec3 range;
3943
int32_t unused0;
40-
api::utils::ivec3 src_offset;
44+
ivec3 src_offset;
4145
int32_t unused1;
42-
api::utils::ivec3 dst_offset;
46+
ivec3 dst_offset;
4347
int32_t unused2;
4448
} offset_params{
4549
range,
@@ -58,7 +62,10 @@ void add_copy_offset_node(
5862
global_size,
5963
local_size,
6064
// Inputs and Outputs
61-
{{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
65+
{
66+
{out, api::MemoryAccessType::WRITE},
67+
{in, api::MemoryAccessType::READ},
68+
},
6269
// Parameter buffers
6370
{t_out->texture_limits_ubo(),
6471
t_in->texture_limits_ubo(),
@@ -67,4 +74,169 @@ void add_copy_offset_node(
6774
{}));
6875
}
6976

77+
// Appends the execute nodes that perform
//   out[:, dst_channel_offset : dst_channel_offset + channel_range, :, :] =
//    in[:, src_channel_offset : src_channel_offset + channel_range, :, :]
// using the "copy_channel_offset" shader.
//
// Both tensors must be channels-packed and have at least 3 dimensions. All
// three integer arguments are in tensor (channel) coordinates and must be
// non-negative; the source and destination channel ranges must fit inside the
// respective tensors. Any violated precondition trips VK_CHECK_COND.
//
// One node is emitted per batch of the input tensor. Each node covers the full
// width/height extent and every destination texel (z slice) touched by the
// channel range. `out` is bound both as the write target and as a read input
// so the shader can preserve lanes that fall outside the copied range.
void add_copy_channel_offset_node(
    ComputeGraph& graph,
    const ValueRef in,
    int32_t channel_range,
    int32_t src_channel_offset,
    int32_t dst_channel_offset,
    const ValueRef out) {
  vTensorPtr t_in = graph.get_tensor(in);
  vTensorPtr t_out = graph.get_tensor(out);

  // Likely need to prepad these numbers.
  const std::vector<int64_t> in_sizes = t_in->sizes();
  // The destination sizes must come from the output tensor: they drive the
  // destination bound check, the per-batch z stride of the output texture and
  // the out_sizes handed to the shader.
  const std::vector<int64_t> out_sizes = t_out->sizes();

  VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
  VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));

  // NOTE: This function should be able to support 1d and 2d tensors when
  // range=1, src_offset=dst_offset=1.
  VK_CHECK_COND(t_in->dim() >= 3, "Src dim should be at least 3");
  VK_CHECK_COND(t_out->dim() >= 3, "Dst dim should be at least 3");

  // Validate the signs first so the sums below are only evaluated on
  // non-negative operands.
  VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative");
  VK_CHECK_COND(
      src_channel_offset >= 0, "Src channel offset must be non-negative");
  VK_CHECK_COND(
      dst_channel_offset >= 0, "Dst channel offset must be non-negative");

  VK_CHECK_COND(
      dim_at<Dim4D::Channel>(in_sizes) >= src_channel_offset + channel_range,
      "Source channel and range should be less than or equal to input tensor's channel size");
  VK_CHECK_COND(
      dim_at<Dim4D::Channel>(out_sizes) >= dst_channel_offset + channel_range,
      "Dst channel offset and range should be less than or equal to output tensor's channel size");

  std::string kernel_name = "copy_channel_offset";
  kernel_name.reserve(kShaderNameReserve);
  add_dtype_suffix(kernel_name, *t_out);

  const int32_t out_channels = dim_at<Dim4D::Channel>(out_sizes);
  // Number of z slices (texels along the packed channel axis) that a single
  // batch occupies in the output texture.
  const int32_t out_z_per_batch = api::utils::div_up(out_channels, 4);

  // Mapping the tensor NCHW coordinates into texture XYZ coordinates: first
  // and last destination z slice touched by the copied channel range. These
  // do not depend on the batch index.
  const int32_t dst_first_z = dst_channel_offset / 4;
  const int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4;

  // We copy the entire width and height dimension. For the batch dimension,
  // the global_size variable specifies the range. The shader combines it with
  // the dst_offset to get the actual coordinate.
  const uvec3 global_size{
      dim_at<Dim4D::Width>(in_sizes),
      dim_at<Dim4D::Height>(in_sizes),
      api::utils::safe_downcast<uint32_t>(dst_last_z - dst_first_z + 1)};
  const uvec3 local_size = adaptive_work_group_size(global_size);

  for (int batch_idx = 0; batch_idx < dim_at<Dim4D::Batch>(in_sizes);
       batch_idx++) {
    const ivec3 dst_offset{
        0, 0, dst_first_z + batch_idx * out_z_per_batch};

    // Field order and the explicit padding ints mirror the CopyArgs uniform
    // block declared in the copy_channel_offset shader.
    const struct Block final {
      api::utils::ivec4 out_sizes;
      api::utils::ivec4 in_sizes;
      int32_t channel_range;
      int32_t src_channel_offset;
      int32_t dst_channel_offset;
      int32_t unused;
      ivec3 range;
      int32_t unused1;
      ivec3 dst_offset;
      int32_t unused2;
    } channel_offset_params{
        api::utils::make_whcn_ivec4(out_sizes),
        api::utils::make_whcn_ivec4(in_sizes),
        channel_range,
        src_channel_offset,
        dst_channel_offset,
        0,
        api::utils::make_ivec3(global_size),
        0,
        dst_offset,
        0,
    };

    graph.execute_nodes().emplace_back(new ExecuteNode(
        graph,
        VK_KERNEL_FROM_STR(kernel_name),
        global_size,
        local_size,
        // Inputs and Outputs
        {
            {out, api::MemoryAccessType::WRITE},
            {out, api::MemoryAccessType::READ},
            {in, api::MemoryAccessType::READ},
        },
        // Parameter buffers
        {t_out->texture_limits_ubo(),
         t_in->texture_limits_ubo(),
         graph.create_params_buffer(channel_offset_params)},
        // Specialization Constants
        {}));
  }
}
196+
197+
// Overload used by the operator registry: range and both offsets arrive as
// graph values holding int lists. Each list is converted to an ivec3 and the
// call is forwarded to the ivec3-based add_copy_offset_node.
void add_copy_offset_node(
    ComputeGraph& graph,
    ValueRef in,
    ValueRef range_ref,
    ValueRef src_offset_ref,
    ValueRef dst_offset_ref,
    ValueRef out) {
  // Reads the int list stored behind `ref` and packs it into an ivec3.
  const auto as_ivec3 = [&graph](const ValueRef ref) -> ivec3 {
    return api::utils::make_ivec3(*graph.get_int_list(ref));
  };

  const ivec3 extent = as_ivec3(range_ref);
  const ivec3 src_origin = as_ivec3(src_offset_ref);
  const ivec3 dst_origin = as_ivec3(dst_offset_ref);

  add_copy_offset_node(graph, in, extent, src_origin, dst_origin, out);
}
212+
213+
// Operator entry point for etvk.copy_offset.
// Expected argument layout: [in, range, src_offset, dst_offset, out].
void copy_offset(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  const ValueRef in = args[0];
  const ValueRef range_ref = args[1];
  const ValueRef src_offset_ref = args[2];
  const ValueRef dst_offset_ref = args[3];
  const ValueRef out = args[4];

  add_copy_offset_node(
      graph, in, range_ref, src_offset_ref, dst_offset_ref, out);
}
216+
217+
// Operator entry point for etvk.copy_channel_offset.
// Expected argument layout:
//   [in, channel_range, src_channel_offset, dst_channel_offset, out].
//
// The three scalars are extracted as int64_t but add_copy_channel_offset_node
// takes int32_t. They are narrowed with a checked cast so an out-of-range
// value is reported instead of being silently truncated into a different
// (possibly valid-looking) channel index.
void copy_channel_offset(
    ComputeGraph& graph,
    const std::vector<ValueRef>& args) {
  const ValueRef in = args[0];
  const ValueRef channel_range_ref = args[1];
  const ValueRef src_channel_offset_ref = args[2];
  const ValueRef dst_channel_offset_ref = args[3];
  const ValueRef out = args[4];

  const auto channel_range = api::utils::safe_downcast<int32_t>(
      graph.extract_scalar<int64_t>(channel_range_ref));
  const auto src_channel_offset = api::utils::safe_downcast<int32_t>(
      graph.extract_scalar<int64_t>(src_channel_offset_ref));
  const auto dst_channel_offset = api::utils::safe_downcast<int32_t>(
      graph.extract_scalar<int64_t>(dst_channel_offset_ref));

  add_copy_channel_offset_node(
      graph, in, channel_range, src_channel_offset, dst_channel_offset, out);
}
235+
236+
// Registers the etvk.* operators defined in this file. Per the commit
// description these exist to facilitate testing of the copy nodes; each entry
// maps an operator name to its (graph, args) entry point.
REGISTER_OPERATORS {
237+
// VK_REGISTER_OP(aten.clone.default, add_test_node);
238+
VK_REGISTER_OP(etvk.copy_offset, copy_offset);
239+
VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset);
240+
}
241+
70242
} // namespace vkcompute

backends/vulkan/runtime/graph/ops/impl/Copy.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@
1414

1515
namespace vkcompute {
1616

17+
// add_copy_offset_node resembles the vkCmdCopyImage command. It copies the
18+
// texture extents specified by the range, src_offset, and dst_offset (all are
19+
// in texture coordinates (x, y, z)) from the input image to the output image.
20+
//
21+
// It is possible to have input and output to point to the same image
22+
// object. But when the source range and destination range overlap, the behavior
23+
// is undefined.
1724
void add_copy_offset_node(
1825
ComputeGraph& graph,
1926
const ValueRef in,
@@ -22,4 +29,25 @@ void add_copy_offset_node(
2229
const api::utils::ivec3& dst_offset,
2330
const ValueRef out);
2431

32+
// add_copy_channel_offset_node behaves similarly to add_copy_offset_node, except that it
33+
// works on the channel dimensions of the tensor (up to 4 dimensions in NCHW).
34+
// The range and offset arguments are in the tensor coordinate. It assumes the
35+
// underlying texture is channel-packed.
36+
//
37+
// This function is a specialized implementation for copying
38+
// channel-packed values. The complication is that when reading / writing the
39+
// channel dimension on indices that are not aligned to packing, we need to
40+
// be careful about the boundaries.
41+
//
42+
// It achieves the following:
43+
// out[:, dst_channel_offset:dst_channel_offset + channel_range, :, :] =
44+
// in [:, src_channel_offset:src_channel_offset + channel_range, :, :]
45+
void add_copy_channel_offset_node(
46+
ComputeGraph& graph,
47+
const ValueRef in,
48+
int32_t channel_range,
49+
int32_t src_channel_offset,
50+
int32_t dst_channel_offset,
51+
const ValueRef out);
52+
2553
} // namespace vkcompute

0 commit comments

Comments
 (0)