Skip to content

Commit a40d989

Browse files
committed
[ET-VK][11/n] copy_channel_offsets node
Pull Request resolved: #3351 1. Add a node `copy_channel_offsets` specifically for copying along the channel dimension; it needs extra attention at the boundaries due to channel packing. 1.1. `copy_channel_offsets` will be useful for `aten.cat` and `aten.split`. 2. Create `etvk.*` operators to facilitate testing. Add test cases for both `copy_offset` and `copy_channel_offset`. Differential Revision: [D56554426](https://our.internmc.facebook.com/intern/diff/D56554426/) ghstack-source-id: 224194777
1 parent e5471a5 commit a40d989

File tree

7 files changed

+578
-30
lines changed

7 files changed

+578
-30
lines changed
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
#define PRECISION ${PRECISION}
12+
13+
#define VEC4_T ${texel_type(DTYPE)}
14+
15+
layout(std430) buffer;
16+
17+
#include "indexing_utils.h"
18+
19+
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
20+
layout(set = 0, binding = 1) uniform PRECISION sampler3D existing_out;
21+
layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in;
22+
23+
layout(set = 0, binding = 3) uniform PRECISION restrict CopyArgs {
24+
ivec4 out_sizes;
25+
ivec4 in_sizes;
26+
// Analogous to the range variable in copy. It defines the number of channels
27+
// being copied.
28+
int channel_range;
29+
int src_channel_offset;
30+
int dst_channel_offset;
31+
int unused;
32+
// Operates on (x, y, z) extents.
33+
ivec3 range;
34+
int unused1;
35+
ivec3 dst_offset;
36+
int unused2;
37+
};
38+
39+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
40+
41+
layout(constant_id = 3) const int packed_dim = C_DIM;
42+
43+
void main() {
  // The dispatch covers only the copied region, which is usually smaller than
  // the destination texture extent, so invocations are bounded by `range`
  // rather than by the output texture limits.
  const ivec3 pos = ivec3(gl_GlobalInvocationID);
  if (any(greaterThanEqual(pos, range))) {
    return;
  }

  const ivec3 out_pos = pos + dst_offset;
  const ivec4 out_whcn = to_tensor_idx(out_pos, out_sizes, packed_dim);

  // Start from the texel currently stored in the destination so that
  // components outside the copied channel window keep their values.
  VEC4_T texel = VEC4_T(texelFetch(existing_out, out_pos, 0));

  for (int comp = 0; comp < 4; ++comp) {
    // Channel index of this component relative to the start of the copied
    // window in the destination.
    const int rel_channel = out_whcn.z - dst_channel_offset + comp;

    // Handle the partial update at the beginning (and end) of the channel
    // window in an existing tensor: only components whose relative channel
    // lies inside [0, channel_range) are overwritten.
    if (rel_channel >= 0 && rel_channel < channel_range) {
      ivec4 in_whcn = out_whcn;
      // Readjust for the source offset.
      in_whcn.z = rel_channel + src_channel_offset;

      const ivec4 in_elem_pos =
          to_texture_elem_pos(in_whcn, in_sizes, packed_dim);
      texel[comp] =
          VEC4_T(texelFetch(image_in, in_elem_pos.xyz, 0))[in_elem_pos.w];
    }
  }

  imageStore(image_out, out_pos, texel);
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Codegen settings for the copy_channel_offset shader template. DTYPE and NDIM
# are the names substituted into the template's ${...} placeholders.
copy_channel_offset:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
# NOTE(review): presumably one shader variant is generated per DTYPE value
# listed below -- confirm against the shader codegen script.
generate_variant_forall:
6+
DTYPE:
7+
- VALUE: half
8+
- VALUE: float
9+
shader_variants:
10+
- NAME: copy_channel_offset

backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,26 +10,12 @@
1010

1111
#define PRECISION ${PRECISION}
1212

13-
#define VEC4_T ${texel_type(DTYPE)}
14-
1513
layout(std430) buffer;
1614

17-
#include "indexing_utils.h"
18-
1915
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
2016
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
2117

22-
layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
23-
ivec3 out_limits;
24-
};
25-
26-
layout(set = 0, binding = 3) uniform PRECISION restrict InLimits {
27-
ivec3 in_limits;
28-
};
29-
30-
31-
32-
layout(set = 0, binding = 4) uniform PRECISION restrict CopyArgs {
18+
layout(set = 0, binding = 2) uniform PRECISION restrict CopyArgs {
3319
ivec3 range;
3420
int unused0;
3521
ivec3 src_offset;

backends/vulkan/runtime/graph/ops/impl/Copy.cpp

Lines changed: 172 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,38 +8,42 @@
88

99
#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
1010

11+
#include <executorch/backends/vulkan/runtime/api/api.h>
12+
#include <executorch/backends/vulkan/runtime/graph/Logging.h>
13+
14+
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
1115
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
1216
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
1317
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
1418

1519
namespace vkcompute {
1620

21+
using api::utils::ivec3;
22+
using api::utils::uvec3;
23+
1724
void add_copy_offset_node(
1825
ComputeGraph& graph,
1926
const ValueRef in,
20-
const api::utils::ivec3& range,
21-
const api::utils::ivec3& src_offset,
22-
const api::utils::ivec3& dst_offset,
27+
const ivec3& range,
28+
const ivec3& src_offset,
29+
const ivec3& dst_offset,
2330
const ValueRef out) {
2431
vTensorPtr t_in = graph.get_tensor(in);
2532
vTensorPtr t_out = graph.get_tensor(out);
2633

27-
VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
28-
VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));
29-
3034
std::string kernel_name = "copy_offset";
3135
kernel_name.reserve(kShaderNameReserve);
3236
add_dtype_suffix(kernel_name, *t_out);
3337

34-
api::utils::uvec3 global_size = api::utils::make_uvec3(range);
35-
api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
38+
uvec3 global_size = api::utils::make_uvec3(range);
39+
uvec3 local_size = adaptive_work_group_size(global_size);
3640

3741
const struct Block final {
38-
api::utils::ivec3 range;
42+
ivec3 range;
3943
int32_t unused0;
40-
api::utils::ivec3 src_offset;
44+
ivec3 src_offset;
4145
int32_t unused1;
42-
api::utils::ivec3 dst_offset;
46+
ivec3 dst_offset;
4347
int32_t unused2;
4448
} offset_params{
4549
range,
@@ -58,13 +62,166 @@ void add_copy_offset_node(
5862
global_size,
5963
local_size,
6064
// Inputs and Outputs
61-
{{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
65+
{
66+
{out, api::MemoryAccessType::WRITE},
67+
{in, api::MemoryAccessType::READ},
68+
},
6269
// Parameter buffers
63-
{t_out->texture_limits_ubo(),
64-
t_in->texture_limits_ubo(),
65-
graph.create_params_buffer(offset_params)},
70+
{graph.create_params_buffer(offset_params)},
6671
// Specialization Constants
6772
{}));
6873
}
6974

75+
// Copies `channel_range` channels from `in`, starting at `src_channel_offset`,
// into `out`, starting at `dst_channel_offset`:
//
//   out[:, dst_channel_offset : dst_channel_offset + channel_range, :, :] =
//    in[:, src_channel_offset : src_channel_offset + channel_range, :, :]
//
// Both tensors must be channels-packed and have at least 3 dimensions; the
// offsets and the range must be non-negative and must fit inside the channel
// dimension of the respective tensor. Violations are reported through
// VK_CHECK_COND.
//
// One `copy_channel_offset` dispatch is appended to the graph per batch of
// `in`. `out` is bound both as the write target and as a read input, so the
// shader can carry over the components of a destination texel that lie
// outside the copied channel window.
void add_copy_channel_offset_node(
    ComputeGraph& graph,
    const ValueRef in,
    int32_t channel_range,
    int32_t src_channel_offset,
    int32_t dst_channel_offset,
    const ValueRef out) {
  vTensorPtr t_in = graph.get_tensor(in);
  vTensorPtr t_out = graph.get_tensor(out);

  // Likely need to prepad these numbers.
  std::vector<int64_t> in_sizes = t_in->sizes();
  std::vector<int64_t> out_sizes = t_out->sizes();

  VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
  VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));

  // NOTE: This function should be able to support 1d and 2d tensors when
  // range=1, src_offset=dst_offset=1.
  VK_CHECK_COND(t_in->dim() >= 3, "Src dim should be at least 3");
  VK_CHECK_COND(t_out->dim() >= 3, "Dst dim should be at least 3");

  // Reject negative arguments before they are used in the bounds checks below.
  VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative");
  VK_CHECK_COND(
      src_channel_offset >= 0, "Src channel offset must be non-negative");
  VK_CHECK_COND(
      dst_channel_offset >= 0, "Dst channel offset must be non-negative");

  VK_CHECK_COND(
      dim_at<Dim4D::Channel>(in_sizes) >= src_channel_offset + channel_range,
      "Source channel plus range should be less than or equal to input tensor's channel size");
  VK_CHECK_COND(
      dim_at<Dim4D::Channel>(out_sizes) >= dst_channel_offset + channel_range,
      "Destination channel plus range should be less than or equal to output tensor's channel size");

  std::string kernel_name = "copy_channel_offset";
  kernel_name.reserve(kShaderNameReserve);
  add_dtype_suffix(kernel_name, *t_out);

  int32_t out_channels = dim_at<Dim4D::Channel>(out_sizes);

  // Mapping the tensor NCHW coordinates into texture XYZ coordinates. With
  // channel packing, 4 consecutive channels share one texel along z, so the
  // copy touches destination texels dst_first_z..dst_last_z (inclusive) within
  // each batch. None of this depends on the batch index.
  const int32_t dst_first_z = dst_channel_offset / 4;
  const int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4;

  // Number of z texels occupied by one batch of the output tensor.
  const auto out_z_per_batch = api::utils::div_up(out_channels, 4);

  // We copy the entire width and height dimension. For the channel dimension,
  // we use the z-dimension of the global_size to specify the texture range.
  // The shader combines the global invocation id and the dst_offset to get
  // the actual coordinate.
  uvec3 global_size{
      dim_at<Dim4D::Width>(in_sizes),
      dim_at<Dim4D::Height>(in_sizes),
      api::utils::safe_downcast<uint32_t>(dst_last_z - dst_first_z + 1)};

  uvec3 local_size = adaptive_work_group_size(global_size);

  const auto batch_count = dim_at<Dim4D::Batch>(in_sizes);

  // Copy one batch at a time.
  for (int batch_idx = 0; batch_idx < batch_count; batch_idx++) {
    // Only the z origin of the destination region changes between batches.
    ivec3 dst_offset{0, 0, dst_first_z + batch_idx * out_z_per_batch};

    // Field order and the padding ints mirror the `CopyArgs` uniform block of
    // the copy_channel_offset shader.
    const struct Block final {
      api::utils::ivec4 out_sizes;
      api::utils::ivec4 in_sizes;
      int32_t channel_range;
      int32_t src_channel_offset;
      int32_t dst_channel_offset;
      int32_t unused;
      ivec3 range;
      int32_t unused1;
      ivec3 dst_offset;
      int32_t unused2;

    } channel_offset_params{
        api::utils::make_whcn_ivec4(out_sizes),
        api::utils::make_whcn_ivec4(in_sizes),
        channel_range,
        src_channel_offset,
        dst_channel_offset,
        0,
        api::utils::make_ivec3(global_size),
        0,
        dst_offset,
        0,
    };

    graph.execute_nodes().emplace_back(new ExecuteNode(
        graph,
        VK_KERNEL_FROM_STR(kernel_name),
        global_size,
        local_size,
        // Inputs and Outputs
        {
            {out, api::MemoryAccessType::WRITE},
            {out, api::MemoryAccessType::READ},
            {in, api::MemoryAccessType::READ},
        },
        // Parameter buffers
        {graph.create_params_buffer(channel_offset_params)},
        // Specialization Constants
        {}));
  }
}
182+
183+
// Overload taking graph value references: `range_ref`, `src_offset_ref` and
// `dst_offset_ref` are read as int lists, converted to ivec3, and forwarded
// to the ivec3 overload of add_copy_offset_node.
void add_copy_offset_node(
    ComputeGraph& graph,
    ValueRef in,
    ValueRef range_ref,
    ValueRef src_offset_ref,
    ValueRef dst_offset_ref,
    ValueRef out) {
  // Reads the int list stored at `ref` and packs it into an ivec3.
  const auto as_ivec3 = [&graph](const ValueRef ref) -> ivec3 {
    return api::utils::make_ivec3(*graph.get_int_list(ref));
  };

  const ivec3 copy_extent = as_ivec3(range_ref);
  const ivec3 src_origin = as_ivec3(src_offset_ref);
  const ivec3 dst_origin = as_ivec3(dst_offset_ref);

  add_copy_offset_node(graph, in, copy_extent, src_origin, dst_origin, out);
}
198+
199+
// Operator entry point for `etvk.copy_offset`. Expects `args` to hold, in
// order: input tensor, range list, source offset list, destination offset
// list, output tensor.
void copy_offset(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  const ValueRef in = args[0];
  const ValueRef range_ref = args[1];
  const ValueRef src_offset_ref = args[2];
  const ValueRef dst_offset_ref = args[3];
  const ValueRef out = args[4];

  add_copy_offset_node(
      graph, in, range_ref, src_offset_ref, dst_offset_ref, out);
}
202+
203+
// Operator entry point for `etvk.copy_channel_offset`. Expects `args` to hold,
// in order: input tensor, channel range, source channel offset, destination
// channel offset, output tensor.
//
// The three scalars are extracted as int64_t, but
// add_copy_channel_offset_node takes int32_t. They are narrowed with
// safe_downcast rather than by implicit conversion, so an out-of-range value
// is not silently truncated.
void copy_channel_offset(
    ComputeGraph& graph,
    const std::vector<ValueRef>& args) {
  ValueRef in = args[0];
  ValueRef channel_range_ref = args[1];
  ValueRef src_channel_offset_ref = args[2];
  ValueRef dst_channel_offset_ref = args[3];
  ValueRef out = args[4];

  const auto channel_range = api::utils::safe_downcast<int32_t>(
      graph.extract_scalar<int64_t>(channel_range_ref));
  const auto src_channel_offset = api::utils::safe_downcast<int32_t>(
      graph.extract_scalar<int64_t>(src_channel_offset_ref));
  const auto dst_channel_offset = api::utils::safe_downcast<int32_t>(
      graph.extract_scalar<int64_t>(dst_channel_offset_ref));

  add_copy_channel_offset_node(
      graph, in, channel_range, src_channel_offset, dst_channel_offset, out);
}
221+
222+
// Registers the two entry points defined in this file under `etvk.*` operator
// names. Per the commit message, the `etvk.*` operators exist to facilitate
// testing.
REGISTER_OPERATORS {
223+
VK_REGISTER_OP(etvk.copy_offset, copy_offset);
224+
VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset);
225+
}
226+
70227
} // namespace vkcompute

backends/vulkan/runtime/graph/ops/impl/Copy.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@
1414

1515
namespace vkcompute {
1616

17+
// add_copy_offset_node resembles the vkCmdCopyImage command. It copies the
18+
// texture extents specified by the range, src_offset, and dst_offset (all in
19+
// texture coordinates (x, y, z)) from the input image to the output image.
20+
//
21+
// It is possible for the input and output to point to the same image
22+
// object. But when the source range and destination range overlap, the behavior
23+
// is undefined.
1724
void add_copy_offset_node(
1825
ComputeGraph& graph,
1926
const ValueRef in,
@@ -22,4 +29,25 @@ void add_copy_offset_node(
2229
const api::utils::ivec3& dst_offset,
2330
const ValueRef out);
2431

32+
// add_copy_channel_offset_node behaves similarly to add_copy_offset_node, except
33+
// that it works on the channel dimension of the tensor (up to 4 dimensions in
34+
// NCHW). The range and offset arguments are in tensor coordinates. It assumes
35+
// the underlying texture is channel-packed.
36+
//
37+
// This function is a specialized implementation for copying channel-packed
38+
// values. The complication arises when reading / writing the channel dimension
39+
// at indices that are not aligned to the packing: we need to be careful about
40+
// the boundaries.
41+
//
42+
// It achieves the following:
43+
// out[:, dst_channel_offset:dst_channel_offset + channel_range, :, :] =
44+
// in [:, src_channel_offset:src_channel_offset + channel_range, :, :]
45+
void add_copy_channel_offset_node(
46+
ComputeGraph& graph,
47+
const ValueRef in,
48+
int32_t channel_range,
49+
int32_t src_channel_offset,
50+
int32_t dst_channel_offset,
51+
const ValueRef out);
52+
2553
} // namespace vkcompute

0 commit comments

Comments
 (0)