Skip to content

Commit 34fd767

Browse files
jorgep31415 authored and facebook-github-bot committed
Migrate conv2d shaders to new layout API (#4053)
Summary:
Pull Request resolved: #4053

TSIA

ghstack-source-id: 231418296

Reviewed By: SS-JIA

Differential Revision: D58837234

fbshipit-source-id: b0874f6d9f52baf2fed04318f6a95e058cf849d1
1 parent ae175c5 commit 34fd767

File tree

5 files changed

+81
-186
lines changed

5 files changed

+81
-186
lines changed

backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl

Lines changed: 19 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -18,36 +18,15 @@
1818

1919
layout(std430) buffer;
2020

21-
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
22-
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
23-
layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
24-
layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
25-
26-
layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
27-
ivec3 out_limits;
28-
};
29-
30-
layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
31-
ivec4 in_sizes;
32-
};
33-
34-
layout(set = 0, binding = 6) uniform PRECISION restrict Params {
35-
ivec2 kernel_size;
36-
ivec2 stride;
37-
ivec2 padding;
38-
ivec2 dilation;
39-
};
40-
41-
// If fields are separated, SwiftShader cannot identify in_group_size.
42-
layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
43-
ivec2 overlay_region;
44-
int in_group_size;
45-
};
46-
47-
layout(set = 0, binding = 8) uniform PRECISION restrict OutputParams {
48-
float out_min;
49-
float out_max;
50-
};
21+
${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
22+
${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
23+
${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
24+
${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
25+
${layout_declare_ubo(4, "ivec3", "out_limits")}
26+
${layout_declare_ubo(5, "ivec4", "in_sizes")}
27+
${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
28+
${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
29+
${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
5130

5231
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
5332

@@ -83,18 +62,18 @@ void main() {
8362
kstart.y += pos.z * kernel_size.y;
8463

8564
// Perform the convolution by iterating over the overlay region.
86-
VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0);
65+
VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
8766
const int ic4 = in_group_size / 4;
8867
for (int z4 = 0; z4 < ic4; ++z4, kstart.x += kernel_size.x * 4) {
8968
for (int y = start.y, ky = kstart.y; y < end.y; y += dilation.y, ++ky) {
9069
for (int x = start.x, kx = kstart.x; x < end.x; x += dilation.x, kx += 4) {
91-
const VEC4_T in_texel = texelFetch(image_in, ivec3(x, y, z4), 0);
70+
const VEC4_T in_texel = texelFetch(t_in, ivec3(x, y, z4), 0);
9271
const ivec4 kxs = kx + ivec4(0, 1, 2, 3);
9372

9473
// To explain the calculation below, the contents of in_texel and the
95-
// group of 4 texels loaded from kernel_in are shown:
74+
// group of 4 texels loaded from t_kernel are shown:
9675
//
97-
// in_texel kernel_in
76+
// in_texel t_kernel
9877
// -x-> ---x--->
9978
// +---+ +----+----+----+----+
10079
// ^ | w | ^ | D0 | D1 | D2 | D3 |
@@ -106,7 +85,7 @@ void main() {
10685
// | x | | A0 | A1 | A2 | A3 |
10786
// +---+ +----+----+----+----+
10887
//
109-
// In the kernel_in graphic, cells sharing the same letter are from
88+
// In the t_kernel graphic, cells sharing the same letter are from
11089
// the same batch/output channel index, and the number denotes a unique
11190
// channel index. To calculate the output texel, the following
11291
// calculation is performed:
@@ -123,13 +102,13 @@ void main() {
123102
//
124103
// which is expressed in the following statements.
125104

126-
sum = fma(in_texel.xxxx, texelFetch(kernel_in, ivec2(kxs.x, ky), 0), sum);
127-
sum = fma(in_texel.yyyy, texelFetch(kernel_in, ivec2(kxs.y, ky), 0), sum);
128-
sum = fma(in_texel.zzzz, texelFetch(kernel_in, ivec2(kxs.z, ky), 0), sum);
129-
sum = fma(in_texel.wwww, texelFetch(kernel_in, ivec2(kxs.w, ky), 0), sum);
105+
sum = fma(in_texel.xxxx, texelFetch(t_kernel, ivec2(kxs.x, ky), 0), sum);
106+
sum = fma(in_texel.yyyy, texelFetch(t_kernel, ivec2(kxs.y, ky), 0), sum);
107+
sum = fma(in_texel.zzzz, texelFetch(t_kernel, ivec2(kxs.z, ky), 0), sum);
108+
sum = fma(in_texel.wwww, texelFetch(t_kernel, ivec2(kxs.w, ky), 0), sum);
130109
}
131110
}
132111
}
133112

134-
imageStore(image_out, pos, op(sum, out_min, out_max));
113+
imageStore(t_out, pos, op(sum, out_min, out_max));
135114
}

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl

Lines changed: 13 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -18,36 +18,15 @@
1818

1919
layout(std430) buffer;
2020

21-
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
22-
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
23-
layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
24-
layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
25-
26-
layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
27-
ivec3 out_limits;
28-
};
29-
30-
layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
31-
ivec4 in_sizes;
32-
};
33-
34-
layout(set = 0, binding = 6) uniform PRECISION restrict Params {
35-
ivec2 kernel_size;
36-
ivec2 stride;
37-
ivec2 padding;
38-
ivec2 dilation;
39-
};
40-
41-
// If fields are separated, SwiftShader cannot identify in_group_size.
42-
layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
43-
ivec2 overlay_region;
44-
int in_group_size;
45-
};
46-
47-
layout(set = 0, binding = 8) uniform PRECISION restrict OutputParams {
48-
float out_min;
49-
float out_max;
50-
};
21+
${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
22+
${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
23+
${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
24+
${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
25+
${layout_declare_ubo(4, "ivec3", "out_limits")}
26+
${layout_declare_ubo(5, "ivec4", "in_sizes")}
27+
${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
28+
${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
29+
${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
5130

5231
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
5332

@@ -71,18 +50,18 @@ void main() {
7150
const ivec2 start = ipos;
7251
const ivec2 end = ipos + overlay_region.xy;
7352

74-
VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0);
53+
VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
7554
int kx = 0;
7655
for (int y = start.y; y < end.y; y += dilation.y) {
7756
for (int x = start.x; x < end.x; x += dilation.x) {
7857
// The weight kernel was rearranged such that every NxN filter is
7958
// flattened to fit in one row. Each filter was then stacked on top of
8059
// each other vertically.
81-
const VEC4_T in_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0);
82-
sum = fma(in_texel, texelFetch(kernel_in, ivec2(kx, pos.z), 0), sum);
60+
const VEC4_T in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0);
61+
sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum);
8362
++kx;
8463
}
8564
}
8665

87-
imageStore(image_out, pos, op(sum, out_min, out_max));
66+
imageStore(t_out, pos, op(sum, out_min, out_max));
8867
}

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 13 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -20,36 +20,15 @@
2020

2121
layout(std430) buffer;
2222

23-
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
24-
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
25-
layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
26-
layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
27-
28-
layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
29-
ivec3 out_limits;
30-
};
31-
32-
layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
33-
ivec4 in_sizes;
34-
};
35-
36-
layout(set = 0, binding = 6) uniform PRECISION restrict Params {
37-
ivec2 kernel_size;
38-
ivec2 stride;
39-
ivec2 padding;
40-
ivec2 dilation;
41-
};
42-
43-
// If fields are separated, SwiftShader cannot identify in_group_size.
44-
layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
45-
ivec2 overlay_region;
46-
int in_group_size;
47-
};
48-
49-
layout(set = 0, binding = 8) uniform PRECISION restrict OutputParams {
50-
float out_min;
51-
float out_max;
52-
};
23+
${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
24+
${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
25+
${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
26+
${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
27+
${layout_declare_ubo(4, "ivec3", "out_limits")}
28+
${layout_declare_ubo(5, "ivec4", "in_sizes")}
29+
${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
30+
${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
31+
${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
5332

5433
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
5534

@@ -73,18 +52,18 @@ void main() {
7352
const ivec2 start = ipos;
7453
const ivec2 end = ipos + overlay_region.xy;
7554

76-
VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0);
55+
VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
7756
int kx = 0;
7857
for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) {
7958
for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) {
8059
// The weight kernel was rearranged such that every NxN filter is
8160
// flattened to fit in one row. Each filter was then stacked on top of
8261
// each other vertically.
83-
const vec4 in_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0);
84-
sum = fma(in_texel, texelFetch(kernel_in, ivec2(kx, pos.z), 0), sum);
62+
const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0);
63+
sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum);
8564
kx++;
8665
}
8766
}
8867

89-
imageStore(image_out, pos, op(sum, out_min, out_max));
68+
imageStore(t_out, pos, op(sum, out_min, out_max));
9069
}

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 20 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -20,43 +20,22 @@
2020

2121
layout(std430) buffer;
2222

23-
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
24-
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
25-
layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
26-
layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
27-
28-
layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
29-
ivec3 out_limits;
30-
};
31-
32-
layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
33-
ivec4 data;
34-
};
35-
36-
layout(set = 0, binding = 6) uniform PRECISION restrict Params {
37-
ivec2 kernel_size;
38-
ivec2 stride;
39-
ivec2 padding;
40-
ivec2 dilation;
41-
};
42-
43-
// If fields are separated, SwiftShader cannot identify in_group_size.
44-
layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
45-
ivec2 overlay_region;
46-
int in_group_size;
47-
};
48-
49-
layout(set = 0, binding = 8) uniform PRECISION restrict OutputParams {
50-
float out_min;
51-
float out_max;
52-
};
23+
${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
24+
${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
25+
${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
26+
${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
27+
${layout_declare_ubo(4, "ivec3", "out_limits")}
28+
${layout_declare_ubo(5, "ivec4", "in_sizes")}
29+
${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
30+
${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
31+
${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
5332

5433
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
5534

5635
/*
5736
* Computes a 2D pointwise convolution of an NxN output tile. Calculating an
5837
* output tile for pointwise convolution is more efficient because the kernel
59-
* size is only 1x1, making it easier to re-use loaded texels from kernel_in.
38+
* size is only 1x1, making it easier to re-use loaded texels from t_kernel.
6039
*/
6140
void main() {
6241
const ivec3 gpos = ivec3(gl_GlobalInvocationID);
@@ -91,7 +70,7 @@ void main() {
9170
}
9271

9372
vec4 sum[TILE_SIZE * TILE_SIZE];
94-
sum[0] = texelFetch(bias_in, ivec2(gpos.z, 0), 0);
73+
sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
9574
for (int i = 1; i < TILE_SIZE * TILE_SIZE; ++i) {
9675
sum[i] = sum[0];
9776
}
@@ -102,21 +81,21 @@ void main() {
10281
// channel (IC) dim is along the x-axis, and the batch (OC) dim is along
10382
// the z-axis.
10483
vec4 in_tex[TILE_SIZE * TILE_SIZE];
105-
const vec4 ktex_0 = texelFetch(kernel_in, ivec2(z + 0, gpos.z), 0);
106-
const vec4 ktex_1 = texelFetch(kernel_in, ivec2(z + 1, gpos.z), 0);
107-
const vec4 ktex_2 = texelFetch(kernel_in, ivec2(z + 2, gpos.z), 0);
108-
const vec4 ktex_3 = texelFetch(kernel_in, ivec2(z + 3, gpos.z), 0);
84+
const vec4 ktex_0 = texelFetch(t_kernel, ivec2(z + 0, gpos.z), 0);
85+
const vec4 ktex_1 = texelFetch(t_kernel, ivec2(z + 1, gpos.z), 0);
86+
const vec4 ktex_2 = texelFetch(t_kernel, ivec2(z + 2, gpos.z), 0);
87+
const vec4 ktex_3 = texelFetch(t_kernel, ivec2(z + 3, gpos.z), 0);
10988

11089
for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
111-
in_tex[i] = texelFetch(image_in, ivec3(ipos[i], z4), 0);
90+
in_tex[i] = texelFetch(t_in, ivec3(ipos[i], z4), 0);
11291
}
11392

11493
for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
11594
// For a 2x2 tile size, the algorithm works as follows.
11695
// To explain the calculations below, the contents of one in_tex and the
117-
// group of 4 texels loaded from kernel_in are shown:
96+
// group of 4 texels loaded from t_kernel are shown:
11897
//
119-
// in_tex kernel_in
98+
// in_tex t_kernel
12099
// -x-> ---x--->
121100
// +---+ +----+----+----+----+
122101
// ^ | w | ^ | D0 | D1 | D2 | D3 |
@@ -128,7 +107,7 @@ void main() {
128107
// | x | | A0 | A1 | A2 | A3 |
129108
// +---+ +----+----+----+----+
130109
//
131-
// In the kernel_in graphic, cells sharing the same letter are from
110+
// In the t_kernel graphic, cells sharing the same letter are from
132111
// the same batch/output channel index, and the number denotes a unique
133112
// channel index. To calculate the output texel, the following
134113
// calculation is performed:
@@ -154,7 +133,7 @@ void main() {
154133

155134
for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
156135
if (all(lessThan(pos[i], out_limits))) {
157-
imageStore(image_out, pos[i], op(sum[i], out_min, out_max));
136+
imageStore(t_out, pos[i], op(sum[i], out_min, out_max));
158137
}
159138
}
160139
}

0 commit comments

Comments
 (0)