Skip to content

Commit 2d9c489

Browse files
SS-JIA authored and facebook-github-bot committed
Add CPU/GPU transfer shaders for width and height packing (#2515)
Summary:
Pull Request resolved: #2515

## Context

Begin to lay the foundation for `TENSOR_HEIGHT_PACKED` and `TENSOR_WIDTH_PACKED` GPU memory layouts by generating `nchw_to_image` and `image_to_nchw` shader variants for those packing schemes.

ghstack-source-id: 219281500
exported-using-ghexport

Reviewed By: jorgep31415

Differential Revision: D55018341

fbshipit-source-id: 98e18754be35466adcb3fe2a7fefc549f444c075
1 parent f0864e0 commit 2d9c489

12 files changed

+314
-166
lines changed

backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,18 +44,21 @@ void main() {
4444

4545
const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data);
4646
const ivec4 buf_indices =
47-
base_index + ivec4(0, 1, 2, 3) * (gpu_sizes.data.x * gpu_sizes.data.y);
47+
base_index + ivec4(0, 1, 2, 3) * STRIDE_${PACKING}(cpu_sizes.data);
4848

49-
if (coord.z < cpu_sizes.data.z) {
49+
const int packed_dim_size = PACKED_DIM_${PACKING}(cpu_sizes.data);
50+
int packed_coord = PACKED_DIM_${PACKING}(coord);
51+
52+
if (packed_coord < packed_dim_size) {
5053
buffer_out.data[buf_indices.x] = intex.x;
5154
}
52-
if (coord.z + 1 < cpu_sizes.data.z) {
55+
if (packed_coord + 1 < packed_dim_size) {
5356
buffer_out.data[buf_indices.y] = intex.y;
5457
}
55-
if (coord.z + 2 < cpu_sizes.data.z) {
58+
if (packed_coord + 2 < packed_dim_size) {
5659
buffer_out.data[buf_indices.z] = intex.z;
5760
}
58-
if (coord.z + 3 < cpu_sizes.data.z) {
61+
if (packed_coord + 3 < packed_dim_size) {
5962
buffer_out.data[buf_indices.w] = intex.w;
6063
}
6164
}

backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,21 @@ image_to_nchw:
1010
DTYPE: float
1111
PACKING: CHANNELS_PACKED
1212
generate_variant_forall:
13+
PACKING:
14+
- VALUE: CHANNELS_PACKED
15+
SUFFIX: C_packed
16+
- VALUE: WIDTH_PACKED
17+
SUFFIX: W_packed
18+
- VALUE: HEIGHT_PACKED
19+
SUFFIX: H_packed
1320
DTYPE:
14-
- VALUE: "half"
15-
SUFFIX: "half"
16-
- VALUE: "float"
17-
SUFFIX: "float"
18-
- VALUE: "int"
19-
SUFFIX: "int"
21+
- VALUE: half
22+
SUFFIX: half
23+
- VALUE: float
24+
SUFFIX: float
25+
- VALUE: int
26+
SUFFIX: int
2027
shader_variants:
21-
- NAME: image3d_to_nchw_C_packed
22-
- NAME: image2d_to_nchw_C_packed
28+
- NAME: image3d_to_nchw
29+
- NAME: image2d_to_nchw
2330
NDIM: 2

backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,30 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#define PACKED_DIM_CHANNELS_PACKED(vec) vec.z
10+
11+
#define PACKED_DIM_WIDTH_PACKED(vec) vec.x
12+
13+
#define PACKED_DIM_HEIGHT_PACKED(vec) vec.y
14+
915
#define POS_TO_COORD_CHANNELS_PACKED(pos, sizes) \
1016
ivec4(pos.x, pos.y, (pos.z * 4) % sizes.z, (pos.z * 4) / sizes.z)
1117

18+
#define POS_TO_COORD_WIDTH_PACKED(pos, sizes) \
19+
ivec4((pos.x * 4), pos.y, pos.z % sizes.z, pos.z / sizes.z)
20+
21+
#define POS_TO_COORD_HEIGHT_PACKED(pos, sizes) \
22+
ivec4(pos.x, (pos.y * 4), pos.z % sizes.z, pos.z / sizes.z)
23+
1224
#define COORD_TO_POS_CHANNELS_PACKED(coord, sizes) \
1325
ivec3(coord.x, coord.y, (coord.z + coord.w * sizes.z) / 4)
1426

1527
#define COORD_TO_BUFFER_IDX(coord, sizes) \
1628
coord.x + coord.y* sizes.x + coord.z* sizes.y* sizes.x + \
1729
coord.w* sizes.z* sizes.y* sizes.x;
30+
31+
#define STRIDE_CHANNELS_PACKED(vec) (vec.x * vec.y)
32+
33+
#define STRIDE_WIDTH_PACKED(vec) (1)
34+
35+
#define STRIDE_HEIGHT_PACKED(vec) (vec.x)

backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ void main() {
4242

4343
const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data);
4444
const ivec4 buf_indices =
45-
base_index + ivec4(0, 1, 2, 3) * (gpu_sizes.data.x * gpu_sizes.data.y);
45+
base_index + ivec4(0, 1, 2, 3) * STRIDE_${PACKING}(cpu_sizes.data);
4646

4747
${T[DTYPE]} val_x = buffer_in.data[buf_indices.x];
4848
${T[DTYPE]} val_y = buffer_in.data[buf_indices.y];
@@ -51,10 +51,13 @@ void main() {
5151

5252
${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w);
5353

54-
if (coord.z + 3 >= cpu_sizes.data.z) {
55-
ivec4 c_ind = ivec4(coord.z) + ivec4(0, 1, 2, 3);
56-
${VEC4_T[DTYPE]} valid_c = ${VEC4_T[DTYPE]}(lessThan(c_ind, ivec4(cpu_sizes.data.z)));
57-
texel = texel * valid_c;
54+
const int packed_dim_size = PACKED_DIM_${PACKING}(cpu_sizes.data);
55+
int packed_coord = PACKED_DIM_${PACKING}(coord);
56+
57+
if (packed_coord + 3 >= packed_dim_size) {
58+
ivec4 packed_ind = ivec4(packed_coord) + ivec4(0, 1, 2, 3);
59+
${VEC4_T[DTYPE]} valid_idx = ${VEC4_T[DTYPE]}(lessThan(packed_ind, ivec4(packed_dim_size)));
60+
texel = texel * valid_idx;
5861
}
5962

6063
imageStore(image_out, ${GET_POS[NDIM]("pos")}, texel);

backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,21 @@ nchw_to_image:
1010
DTYPE: float
1111
PACKING: CHANNELS_PACKED
1212
generate_variant_forall:
13+
PACKING:
14+
- VALUE: CHANNELS_PACKED
15+
SUFFIX: C_packed
16+
- VALUE: WIDTH_PACKED
17+
SUFFIX: W_packed
18+
- VALUE: HEIGHT_PACKED
19+
SUFFIX: H_packed
1320
DTYPE:
14-
- VALUE: "half"
15-
SUFFIX: "half"
16-
- VALUE: "float"
17-
SUFFIX: "float"
18-
- VALUE: "int"
19-
SUFFIX: "int"
21+
- VALUE: half
22+
SUFFIX: half
23+
- VALUE: float
24+
SUFFIX: float
25+
- VALUE: int
26+
SUFFIX: int
2027
shader_variants:
21-
- NAME: nchw_to_image3d_C_packed
22-
- NAME: nchw_to_image2d_C_packed
28+
- NAME: nchw_to_image3d
29+
- NAME: nchw_to_image2d
2330
NDIM: 2

backends/vulkan/test/glsl/all_shaders.yaml

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -33,30 +33,23 @@ fill_texture__test:
3333
shader_variants:
3434
- NAME: fill_texture__test
3535

36-
image_to_nchw__test:
36+
idx_fill_texture:
3737
parameter_names_with_default_values:
38-
NDIM: 3
3938
DTYPE: float
40-
PACKING: CHANNELS_PACKED
41-
generate_variant_forall:
42-
DTYPE:
43-
- VALUE: "half"
44-
SUFFIX: "half"
45-
- VALUE: "float"
46-
SUFFIX: "float"
47-
shader_variants:
48-
- NAME: image3d_to_nchw__test_C_packed
49-
50-
nchw_to_image__test:
51-
parameter_names_with_default_values:
5239
NDIM: 3
53-
DTYPE: float
5440
PACKING: CHANNELS_PACKED
5541
generate_variant_forall:
42+
PACKING:
43+
- VALUE: "CHANNELS_PACKED"
44+
SUFFIX: "C_packed"
45+
- VALUE: "WIDTH_PACKED"
46+
SUFFIX: "W_packed"
47+
- VALUE: "HEIGHT_PACKED"
48+
SUFFIX: "H_packed"
5649
DTYPE:
5750
- VALUE: "half"
5851
SUFFIX: "half"
5952
- VALUE: "float"
6053
SUFFIX: "float"
6154
shader_variants:
62-
- NAME: nchw_to_image3d__test_C_packed
55+
- NAME: idx_fill_texture

backends/vulkan/test/glsl/nchw_to_image__test.glsl renamed to backends/vulkan/test/glsl/idx_fill_texture.glsl

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,28 +7,21 @@
77
*/
88

99
#version 450 core
10-
// clang-format off
10+
1111
#define PRECISION ${PRECISION}
12-
// clang-format on
1312

1413
#include "indexing_utils.h"
1514

1615
layout(std430) buffer;
1716

18-
// clang-format off
1917
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
20-
// clang-format on
21-
layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
22-
${T[DTYPE]} data[];
23-
}
24-
buffer_in;
2518

26-
layout(set = 0, binding = 2) uniform PRECISION restrict GpuSizes {
19+
layout(set = 0, binding = 1) uniform PRECISION restrict GpuSizes {
2720
ivec4 data;
2821
}
2922
gpu_sizes;
3023

31-
layout(set = 0, binding = 3) uniform PRECISION restrict CpuSizes {
24+
layout(set = 0, binding = 2) uniform PRECISION restrict CpuSizes {
3225
ivec4 data;
3326
}
3427
cpu_sizes;
@@ -45,20 +38,9 @@ void main() {
4538

4639
const int base_index = COORD_TO_BUFFER_IDX(coord, cpu_sizes.data);
4740
const ivec4 buf_indices =
48-
base_index + ivec4(0, 1, 2, 3) * (gpu_sizes.data.x * gpu_sizes.data.y);
49-
50-
${T[DTYPE]} val_x = buffer_in.data[buf_indices.x];
51-
${T[DTYPE]} val_y = buffer_in.data[buf_indices.y];
52-
${T[DTYPE]} val_z = buffer_in.data[buf_indices.z];
53-
${T[DTYPE]} val_w = buffer_in.data[buf_indices.w];
41+
base_index + ivec4(0, 1, 2, 3) * PLANE_SIZE_${PACKING}(gpu_sizes.data);
5442

55-
${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w);
56-
57-
if (coord.z + 3 >= cpu_sizes.data.z) {
58-
ivec4 c_ind = ivec4(coord.z) + ivec4(0, 1, 2, 3);
59-
vec4 valid_c = vec4(lessThan(c_ind, ivec4(cpu_sizes.data.z)));
60-
texel = texel * valid_c;
61-
}
43+
${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(buf_indices);
6244

6345
imageStore(image_out, ${GET_POS[NDIM]("pos")}, texel);
6446
}

backends/vulkan/test/glsl/image_to_nchw__test.glsl

Lines changed: 0 additions & 62 deletions
This file was deleted.

backends/vulkan/test/glsl/indexing_utils.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,27 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#define PACKED_DIM_CHANNELS_PACKED(vec) vec.z
10+
11+
#define PACKED_DIM_WIDTH_PACKED(vec) vec.x
12+
13+
#define PACKED_DIM_HEIGHT_PACKED(vec) vec.y
14+
915
#define POS_TO_COORD_CHANNELS_PACKED(pos, sizes) \
1016
ivec4(pos.x, pos.y, (pos.z * 4) % sizes.z, (pos.z * 4) / sizes.z)
1117

18+
#define POS_TO_COORD_WIDTH_PACKED(pos, sizes) \
19+
ivec4((pos.x * 4), pos.y, pos.z % sizes.z, pos.z / sizes.z)
20+
21+
#define POS_TO_COORD_HEIGHT_PACKED(pos, sizes) \
22+
ivec4(pos.x, (pos.y * 4), pos.z % sizes.z, pos.z / sizes.z)
23+
1224
#define COORD_TO_BUFFER_IDX(coord, sizes) \
1325
coord.x + coord.y* sizes.x + coord.z* sizes.y* sizes.x + \
1426
coord.w* sizes.z* sizes.y* sizes.x;
27+
28+
#define PLANE_SIZE_CHANNELS_PACKED(vec) (vec.x * vec.y)
29+
30+
#define PLANE_SIZE_WIDTH_PACKED(vec) (1)
31+
32+
#define PLANE_SIZE_HEIGHT_PACKED(vec) (vec.x)

backends/vulkan/test/utils/test_utils.cpp

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,9 @@ void record_nchw_to_image_op(
7373
api::VulkanBuffer& src_buffer,
7474
vTensor& v_dst) {
7575
api::PipelineBarrier pipeline_barrier{};
76-
api::ShaderInfo compute_shader =
77-
VK_KERNEL(nchw_to_image3d__test_C_packed_half);
78-
if (v_dst.image().format() == VK_FORMAT_R32G32B32A32_SFLOAT) {
79-
compute_shader = VK_KERNEL(nchw_to_image3d__test_C_packed_float);
80-
}
76+
8177
context->submit_compute_job(
82-
compute_shader,
78+
get_nchw_to_image_shader(v_dst),
8379
pipeline_barrier,
8480
v_dst.virtual_extents(),
8581
adaptive_work_group_size(v_dst.virtual_extents()),
@@ -97,14 +93,9 @@ void record_image_to_nchw_op(
9793
api::Context* const context,
9894
vTensor& v_src,
9995
api::VulkanBuffer& dst_buffer) {
100-
api::ShaderInfo compute_shader =
101-
VK_KERNEL(image3d_to_nchw__test_C_packed_half);
102-
if (v_src.image().format() == VK_FORMAT_R32G32B32A32_SFLOAT) {
103-
compute_shader = VK_KERNEL(image3d_to_nchw__test_C_packed_float);
104-
}
10596
api::PipelineBarrier pipeline_barrier{};
10697
context->submit_compute_job(
107-
compute_shader,
98+
get_image_to_nchw_shader(v_src),
10899
pipeline_barrier,
109100
v_src.virtual_extents(),
110101
adaptive_work_group_size(v_src.virtual_extents()),
@@ -147,12 +138,6 @@ void execute_and_check_add(
147138
vTensor& c,
148139
float a_val,
149140
float b_val) {
150-
// Add shader kernel
151-
api::ShaderInfo kernel = VK_KERNEL(binary_add_nobroadcast__test_half);
152-
if (c.image().format() == VK_FORMAT_R32G32B32A32_SFLOAT) {
153-
kernel = VK_KERNEL(nchw_to_image3d__test_C_packed_float);
154-
}
155-
156141
// Fill input tensors
157142
fill_vtensor(a, a_val);
158143
fill_vtensor(b, b_val);
@@ -164,8 +149,8 @@ void execute_and_check_add(
164149
std::vector<float> data_out = extract_vtensor(c);
165150

166151
// Check output
167-
for (const auto& d : data_out) {
168-
EXPECT_TRUE(d == (a_val + b_val));
152+
for (size_t i = 0; i < data_out.size(); ++i) {
153+
CHECK_VALUE(data_out, i, (a_val + b_val));
169154
}
170155
}
171156

0 commit comments

Comments
 (0)