Skip to content

Commit ddd8770

Browse files
pytorchbotkirklandsign
authored and committed
[ET-VK] Adding repeat support to add_copy_packed_dim_offset_node function. (#9721)
This diff adds support for the repeat operation in the `add_copy_packed_dim_offset_node` function in the Vulkan backend for ExecuTorch. The function now takes an additional boolean parameter, `repeat`, which indicates whether the copy should wrap around the tensor dimensions. The `copy_packed_dim_offset` shader now has two functions, `repeat_copy` and `no_repeat_copy`, which are chosen based on a specialization-constant parameter. `no_repeat_copy` contains the legacy copy code. `repeat_copy` computes the input tensor index from the output position and wraps it according to the WHCB repetitions. When the repeat function is used, the push constants `src_offset` and `dst_offset` contain the source and destination WHCB dimensions respectively (not copy offsets). Differential Revision: [D71477552](https://our.internmc.facebook.com/intern/diff/D71477552/)
1 parent 8969339 commit ddd8770

File tree

3 files changed

+165
-51
lines changed

3 files changed

+165
-51
lines changed

backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl

Lines changed: 110 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,17 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
2020

2121
// Push-constant parameters for the copy shader. The interpretation of
// src_offset / dst_offset depends on the "repeat" specialization constant.
layout(push_constant) uniform restrict Block {
  // xyz bounds the dispatched texel positions (checked in main()).
  ivec4 range;
  // repeat == false: xyz is the source copy offset, w is the channel size.
  // repeat == true:  xyzw holds the source tensor sizes in WHCB order.
  ivec4 src_offset;
  // repeat == false: xyz is the destination copy offset, w is the channel size.
  // repeat == true:  xyzw holds the destination tensor sizes in WHCB order.
  ivec4 dst_offset;
};
2836

@@ -37,13 +45,9 @@ const lowp int packed_dim = unhash_packed_dim(out_layout);
3745
${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
3846
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
3947

40-
void main() {
41-
const ivec3 pos = ivec3(gl_GlobalInvocationID);
42-
43-
if (any(greaterThanEqual(pos, range.xyz))) {
44-
return;
45-
}
48+
${layout_declare_spec_const(C, "bool", "repeat", "false")}
4649

50+
void no_repeat_copy(ivec3 pos) {
4751
// Position in input tensor
4852
ivec3 in_pos = pos + src_offset.xyz;
4953
in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2);
@@ -138,3 +142,103 @@ void main() {
138142
out_value,
139143
out_axis_map);
140144
}
145+
146+
/*
 * Repeat-aware copy: for every output texel position, derive the source
 * texel by wrapping the output's WHCB coordinates around the source tensor
 * sizes. In this mode src_offset holds the source sizes and dst_offset the
 * destination sizes (i.e. the per-dim repetition counts times source sizes),
 * both in WHCB order.
 */
void repeat_copy(ivec3 pos) {
  // Work in un-packed (element) coordinates along the packed dim.
  pos[packed_dim] <<= 2;

  // Channel count, aligned up to a multiple of 4 only when channel packed.
  const int channel_size =
      (packed_dim == C_DIM ? alignup4(src_offset.z) : src_offset.z);

  // Wrap width / height back into the source extents.
  const int src_w = pos.x % src_offset.x;
  const int src_h = pos.y % src_offset.y;
  int src_c;
  int src_b;

  if (packed_dim == C_DIM) {
    // Channels per output batch: channel size * channel repetitions,
    // aligned to 4 because the tensors are channel packed.
    const int out_channel_size = alignup4(src_offset.z * dst_offset.z);

    // Which output batch this position falls into.
    const int out_batch = pos.z / out_channel_size;

    // Source batch index wraps around the source batch count.
    src_b = out_batch % src_offset.w;

    // Number of full batch repetitions preceding this position. src_b
    // resets to zero after each wrap, so this tracks the repetition count
    // needed to recover the absolute batch offset below.
    const int batch_rep = out_batch / src_offset.w;

    // Source channel index, restarted from zero whenever the batch index
    // increments.
    src_c = (pos.z - (src_b + batch_rep * src_offset.w) * out_channel_size) %
        src_offset.z;
  } else {
    // Channels per output batch: channel size * channel repetitions
    // (no alignment needed when not channel packed).
    const int out_channel_size = src_offset.z * dst_offset.z;

    // Source batch index wraps around the source batch count.
    src_b = (pos.z / out_channel_size) % src_offset.w;

    // Source channel index wraps around the source channel count.
    src_c = pos.z % src_offset.z;
  }

  // Source position in (W, H, C); the batch offset is folded in at load time.
  const ivec3 in_pos = ivec3(src_w, src_h, src_c);

  // Back to texel coordinates along the packed dim.
  pos[packed_dim] >>= 2;

  // Packed-dim index of the texel last fetched (-1 means nothing cached yet).
  int cached_texel_idx = -1;

  // Most recently fetched input texel.
  VEC4_T cached_texel;

  // Output texel assembled lane by lane.
  VEC4_T out_value = VEC4_T(0);

  int src_lane_offset = in_pos[packed_dim];

  for (int i = 0; i < 4; i++) {
    // Only re-fetch when this lane crosses into a different input texel.
    if ((src_lane_offset >> 2) != cached_texel_idx) {
      cached_texel_idx = src_lane_offset >> 2;

      ivec3 curr_in_pos = in_pos;
      curr_in_pos[packed_dim] = src_lane_offset;
      curr_in_pos.z = curr_in_pos.z + src_b * channel_size;
      curr_in_pos[packed_dim] >>= 2;

      cached_texel = load_texel_lpos(t_in, curr_in_pos, in_axis_map);
    }

    out_value[i] = cached_texel[src_lane_offset & 0x3];

    src_lane_offset++;
    // Wrap back to lane zero once past the source extent in the packed dim.
    src_lane_offset =
        mix(src_lane_offset, 0, src_lane_offset >= src_offset[packed_dim]);
  }

  write_texel_lpos(
      t_out,
      pos,
      out_value,
      out_axis_map);
}
231+
232+
void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  // Skip invocations outside the copy range.
  if (any(greaterThanEqual(pos, range.xyz))) {
    return;
  }

  // Dispatch on the "repeat" specialization constant.
  if (!repeat) {
    no_repeat_copy(pos);
  } else {
    repeat_copy(pos);
  }
}

backends/vulkan/runtime/graph/ops/impl/Copy.cpp

Lines changed: 51 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -71,61 +71,68 @@ void add_copy_packed_dim_offset_node(
7171
const ivec3& range,
7272
const ivec4& src_offset,
7373
const ivec4& dst_offset,
74-
const ValueRef out) {
74+
const ValueRef out,
75+
bool repeat) {
7576
vTensorPtr t_in = graph.get_tensor(in);
7677
vTensorPtr t_out = graph.get_tensor(out);
7778

78-
// Check the packed dimension is same for both tensors, and if the packed
79-
// dimension is Width or Height. Since the function does not support channel
80-
// packing.
81-
VK_CHECK_COND(
82-
check_same_packed_dim(*t_in, *t_out) &&
83-
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
84-
check_packed_dim_is(*t_in, WHCN::kHeightDim)));
79+
// Check the packed dimension is same for both tensors
80+
VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
81+
if (!repeat) {
82+
// For non repeat copy also check if the packed dimension is Width or
83+
// Height. Since the function does not support channel packing.
84+
VK_CHECK_COND(
85+
check_same_packed_dim(*t_in, *t_out) &&
86+
(check_packed_dim_is(*t_in, WHCN::kWidthDim) ||
87+
check_packed_dim_is(*t_in, WHCN::kHeightDim)));
88+
}
8589

8690
std::string kernel_name = "copy_packed_dim_offset";
8791
kernel_name.reserve(kShaderNameReserve);
8892
add_dtype_suffix(kernel_name, *t_out);
8993

90-
const auto packed_dim = t_in->packed_dim();
9194
// A copy of range with the last element set to batch size of the input tensor
9295
ivec4 final_range = {
9396
range[0], range[1], range[2], dim_at(t_in->sizes(), kBatch4D)};
9497
ivec3 global_wg_size = t_out->logical_limits();
95-
// The starting offset in a texel where this tensor will start copying from
96-
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
97-
// The starting offset in a texel where this tensor will start copying to
98-
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
99-
100-
// The total packed texels this tensor will be copied from
101-
// The first texel of tensor data in packed dimension will be copied from
102-
// remaining lanes from current source Hence (4 - src_lane_offset) is added
103-
// to tensor size in packed dimension
104-
const auto src_packed_size = utils::div_up_4(
105-
(4 - src_lane_offset) +
106-
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
107-
108-
// The total packed texels this tensor will be copied to
109-
// The first texel of tensor data in packed dimension will be copied to
110-
// remaining lanes from previous write Hence (4 - dst_lane_offset) is added to
111-
// tensor size in packed dimension
112-
const auto dst_packed_size = utils::div_up_4(
113-
(4 - dst_lane_offset) +
114-
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
115-
116-
// If the starting src offset is not 0, and the total packed texels is greater
117-
// than the source texel range
118-
const bool has_additional_src_work =
119-
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
120-
// If the starting dst offset is not 0, and the total packed texels is greater
121-
// than the source texel range
122-
const bool has_additional_dst_work =
123-
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
124-
125-
if (has_additional_src_work || has_additional_dst_work) {
126-
global_wg_size[packed_dim]++; // Increase the global work group size in
127-
// packed dimension
128-
final_range[packed_dim]++; // Increase the range in packed dimension
98+
99+
if (!repeat) {
100+
const auto packed_dim = t_in->packed_dim();
101+
// The starting offset in a texel where this tensor will start copying from
102+
const auto src_lane_offset = src_offset[packed_dim] & 0x3;
103+
// The starting offset in a texel where this tensor will start copying to
104+
const auto dst_lane_offset = dst_offset[packed_dim] & 0x3;
105+
106+
// The total packed texels this tensor will be copied from
107+
// The first texel of tensor data in packed dimension will be copied from
108+
// remaining lanes from current source Hence (4 - src_lane_offset) is added
109+
// to tensor size in packed dimension
110+
const auto src_packed_size = utils::div_up_4(
111+
(4 - src_lane_offset) +
112+
dim_at(t_out->sizes(), normalize_to_dim_index(*t_out, packed_dim)));
113+
114+
// The total packed texels this tensor will be copied to
115+
// The first texel of tensor data in packed dimension will be copied to
116+
// remaining lanes from previous write Hence (4 - dst_lane_offset) is added
117+
// to tensor size in packed dimension
118+
const auto dst_packed_size = utils::div_up_4(
119+
(4 - dst_lane_offset) +
120+
dim_at(t_in->sizes(), normalize_to_dim_index(*t_in, packed_dim)));
121+
122+
// If the starting src offset is not 0, and the total packed texels is
123+
// greater than the source texel range
124+
const bool has_additional_src_work =
125+
src_lane_offset != 0 && src_packed_size > final_range[packed_dim];
126+
// If the starting dst offset is not 0, and the total packed texels is
127+
// greater than the source texel range
128+
const bool has_additional_dst_work =
129+
dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim];
130+
131+
if (has_additional_src_work || has_additional_dst_work) {
132+
global_wg_size[packed_dim]++; // Increase the global work group size in
133+
// packed dimension
134+
final_range[packed_dim]++; // Increase the range in packed dimension
135+
}
129136
}
130137

131138
auto shader = VK_KERNEL_FROM_STR(kernel_name);
@@ -144,7 +151,7 @@ void add_copy_packed_dim_offset_node(
144151
// Parameter buffers
145152
{},
146153
// Specialization Constants
147-
{graph.hashed_layout_of(out), graph.hashed_layout_of(in)},
154+
{graph.hashed_layout_of(out), graph.hashed_layout_of(in), repeat},
148155
nullptr,
149156
{},
150157
{

backends/vulkan/runtime/graph/ops/impl/Copy.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,16 @@ void add_copy_offset_node(
5353
// dst_offset (all are in texture coordinate (x, y, z) from the input image to
5454
// the output image.
5555
//
56+
// repeat flag is used to indicate if copy should wrap around tensor dim.
57+
// only true for repeat op.
5658
void add_copy_packed_dim_offset_node(
5759
ComputeGraph& graph,
5860
const ValueRef in,
5961
const utils::ivec3& range,
6062
const utils::ivec4& src_offset,
6163
const utils::ivec4& dst_offset,
62-
const ValueRef out);
64+
const ValueRef out,
65+
bool repeat = false);
6366

6467
// add_copy_channel_offset_node behaves similar to add_copy_node, except that it
6568
// works on the channel dimensions of the tensor (up to 4 dimensions in NCHW).

0 commit comments

Comments
 (0)