Migrate conv2d shaders to new layout API (#4053)

jorgep31415 · facebook-github-bot · commit 34fd76700698 · 2024-06-24T14:19:55.000-07:00
Summary: Pull Request resolved: #4053. TSIA (title says it all). ghstack-source-id: 231418296. Reviewed By: SS-JIA. Differential Revision: D58837234. fbshipit-source-id: b0874f6d9f52baf2fed04318f6a95e058cf849d1
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
@@ -18,36 +18,15 @@
 
 layout(std430) buffer;
 
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
-layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
-
-layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
-  ivec4 in_sizes;
-};
-
-layout(set = 0, binding = 6) uniform PRECISION restrict Params {
-  ivec2 kernel_size;
-  ivec2 stride;
-  ivec2 padding;
-  ivec2 dilation;
-};
-
-// If fields are separated, SwiftShader cannot identify in_group_size.
-layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
-  ivec2 overlay_region;
-  int in_group_size;
-};
-
-layout(set = 0, binding = 8) uniform PRECISION restrict OutputParams {
-  float out_min;
-  float out_max;
-};
+${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
+${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
+${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
+${layout_declare_ubo(4, "ivec3", "out_limits")}
+${layout_declare_ubo(5, "ivec4", "in_sizes")}
+${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
+${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
+${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -83,18 +62,18 @@ void main() {
   kstart.y += pos.z * kernel_size.y;
 
   // Perform the convolution by iterating over the overlay region.
-  VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0);
+  VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
   const int ic4 = in_group_size / 4;
   for (int z4 = 0; z4 < ic4; ++z4, kstart.x += kernel_size.x * 4) {
     for (int y = start.y, ky = kstart.y; y < end.y; y += dilation.y, ++ky) {
       for (int x = start.x, kx = kstart.x; x < end.x; x += dilation.x, kx += 4) {
-        const VEC4_T in_texel = texelFetch(image_in, ivec3(x, y, z4), 0);
+        const VEC4_T in_texel = texelFetch(t_in, ivec3(x, y, z4), 0);
         const ivec4 kxs = kx + ivec4(0, 1, 2, 3);
 
         // To explain the calculation below, the contents of in_texel and the
-        // group of 4 texels loaded from kernel_in are shown:
+        // group of 4 texels loaded from t_kernel are shown:
         //
-        //   in_texel               kernel_in
+        //   in_texel               t_kernel
         //    -x->                   ---x--->
         //   +---+              +----+----+----+----+
         // ^ | w |           ^  | D0 | D1 | D2 | D3 |
@@ -106,7 +85,7 @@ void main() {
         //   | x |              | A0 | A1 | A2 | A3 |
         //   +---+              +----+----+----+----+
         //
-        // In the kernel_in graphic, cells sharing the same letter are from
+        // In the t_kernel graphic, cells sharing the same letter are from
         // the same batch/output channel index, and the number denotes a unique
         // channel index. To calculate the output texel, the following
         // calculation is performed:
@@ -123,13 +102,13 @@ void main() {
         //
         // which is expressed in the following statements.
 
-        sum = fma(in_texel.xxxx, texelFetch(kernel_in, ivec2(kxs.x, ky), 0), sum);
-        sum = fma(in_texel.yyyy, texelFetch(kernel_in, ivec2(kxs.y, ky), 0), sum);
-        sum = fma(in_texel.zzzz, texelFetch(kernel_in, ivec2(kxs.z, ky), 0), sum);
-        sum = fma(in_texel.wwww, texelFetch(kernel_in, ivec2(kxs.w, ky), 0), sum);
+        sum = fma(in_texel.xxxx, texelFetch(t_kernel, ivec2(kxs.x, ky), 0), sum);
+        sum = fma(in_texel.yyyy, texelFetch(t_kernel, ivec2(kxs.y, ky), 0), sum);
+        sum = fma(in_texel.zzzz, texelFetch(t_kernel, ivec2(kxs.z, ky), 0), sum);
+        sum = fma(in_texel.wwww, texelFetch(t_kernel, ivec2(kxs.w, ky), 0), sum);
       }
     }
   }
 
-  imageStore(image_out, pos, op(sum, out_min, out_max));
+  imageStore(t_out, pos, op(sum, out_min, out_max));
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
@@ -18,36 +18,15 @@
 
 layout(std430) buffer;
 
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
-layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
-
-layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
-  ivec4 in_sizes;
-};
-
-layout(set = 0, binding = 6) uniform PRECISION restrict Params {
-  ivec2 kernel_size;
-  ivec2 stride;
-  ivec2 padding;
-  ivec2 dilation;
-};
-
-// If fields are separated, SwiftShader cannot identify in_group_size.
-layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
-  ivec2 overlay_region;
-  int in_group_size;
-};
-
-layout(set = 0, binding = 8) uniform PRECISION restrict OutputParams {
-  float out_min;
-  float out_max;
-};
+${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
+${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
+${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
+${layout_declare_ubo(4, "ivec3", "out_limits")}
+${layout_declare_ubo(5, "ivec4", "in_sizes")}
+${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
+${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
+${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -71,18 +50,18 @@ void main() {
   const ivec2 start = ipos;
   const ivec2 end = ipos + overlay_region.xy;
 
-  VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0);
+  VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
   int kx = 0;
   for (int y = start.y; y < end.y; y += dilation.y) {
     for (int x = start.x; x < end.x; x += dilation.x) {
       // The weight kernel was rearranged such that every NxN filter is
       // flattened to fit in one row. Each filter was then stacked on top of
       // each other vertically.
-      const VEC4_T in_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0);
-      sum = fma(in_texel, texelFetch(kernel_in, ivec2(kx, pos.z), 0), sum);
+      const VEC4_T in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0);
+      sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum);
       ++kx;
     }
   }
 
-  imageStore(image_out, pos, op(sum, out_min, out_max));
+  imageStore(t_out, pos, op(sum, out_min, out_max));
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -20,36 +20,15 @@
 
 layout(std430) buffer;
 
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
-layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
-
-layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
-  ivec4 in_sizes;
-};
-
-layout(set = 0, binding = 6) uniform PRECISION restrict Params {
-  ivec2 kernel_size;
-  ivec2 stride;
-  ivec2 padding;
-  ivec2 dilation;
-};
-
-// If fields are separated, SwiftShader cannot identify in_group_size.
-layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
-  ivec2 overlay_region;
-  int in_group_size;
-};
-
-layout(set = 0, binding = 8) uniform PRECISION restrict OutputParams {
-  float out_min;
-  float out_max;
-};
+${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
+${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
+${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
+${layout_declare_ubo(4, "ivec3", "out_limits")}
+${layout_declare_ubo(5, "ivec4", "in_sizes")}
+${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
+${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
+${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -73,18 +52,18 @@ void main() {
   const ivec2 start = ipos;
   const ivec2 end = ipos + overlay_region.xy;
 
-  VEC4_T sum = texelFetch(bias_in, ivec2(pos.z, 0), 0);
+  VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
   int kx = 0;
   for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) {
     for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) {
       // The weight kernel was rearranged such that every NxN filter is
       // flattened to fit in one row. Each filter was then stacked on top of
       // each other vertically.
-      const vec4 in_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0);
-      sum = fma(in_texel, texelFetch(kernel_in, ivec2(kx, pos.z), 0), sum);
+      const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0);
+      sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum);
       kx++;
     }
   }
 
-  imageStore(image_out, pos, op(sum, out_min, out_max));
+  imageStore(t_out, pos, op(sum, out_min, out_max));
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -20,43 +20,22 @@
 
 layout(std430) buffer;
 
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
-layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
-
-layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
-  ivec4 data;
-};
-
-layout(set = 0, binding = 6) uniform PRECISION restrict Params {
-  ivec2 kernel_size;
-  ivec2 stride;
-  ivec2 padding;
-  ivec2 dilation;
-};
-
-// If fields are separated, SwiftShader cannot identify in_group_size.
-layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
-  ivec2 overlay_region;
-  int in_group_size;
-};
-
-layout(set = 0, binding = 8) uniform PRECISION restrict OutputParams {
-  float out_min;
-  float out_max;
-};
+${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
+${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
+${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
+${layout_declare_ubo(4, "ivec3", "out_limits")}
+${layout_declare_ubo(5, "ivec4", "in_sizes")}
+${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
+${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
+${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 /*
  * Computes a 2D pointwise convolution of an NxN output tile. Calculating an
  * output tile for pointwise convolution is more efficient because the kernel
- * size is only 1x1, making it easier to re-use loaded texels from kernel_in.
+ * size is only 1x1, making it easier to re-use loaded texels from t_kernel.
  */
 void main() {
   const ivec3 gpos = ivec3(gl_GlobalInvocationID);
@@ -91,7 +70,7 @@ void main() {
   }
 
   vec4 sum[TILE_SIZE * TILE_SIZE];
-  sum[0] = texelFetch(bias_in, ivec2(gpos.z, 0), 0);
+  sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
   for (int i = 1; i < TILE_SIZE * TILE_SIZE; ++i) {
     sum[i] = sum[0];
   }
@@ -102,21 +81,21 @@ void main() {
     // channel (IC) dim is along the x-axis, and the batch (OC) dim is along
     // the z-axis.
     vec4 in_tex[TILE_SIZE * TILE_SIZE];
-    const vec4 ktex_0 = texelFetch(kernel_in, ivec2(z + 0, gpos.z), 0);
-    const vec4 ktex_1 = texelFetch(kernel_in, ivec2(z + 1, gpos.z), 0);
-    const vec4 ktex_2 = texelFetch(kernel_in, ivec2(z + 2, gpos.z), 0);
-    const vec4 ktex_3 = texelFetch(kernel_in, ivec2(z + 3, gpos.z), 0);
+    const vec4 ktex_0 = texelFetch(t_kernel, ivec2(z + 0, gpos.z), 0);
+    const vec4 ktex_1 = texelFetch(t_kernel, ivec2(z + 1, gpos.z), 0);
+    const vec4 ktex_2 = texelFetch(t_kernel, ivec2(z + 2, gpos.z), 0);
+    const vec4 ktex_3 = texelFetch(t_kernel, ivec2(z + 3, gpos.z), 0);
 
     for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-      in_tex[i] = texelFetch(image_in, ivec3(ipos[i], z4), 0);
+      in_tex[i] = texelFetch(t_in, ivec3(ipos[i], z4), 0);
     }
 
     for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
       // For a 2x2 tile size, the algorithm works as follows.
       // To explain the calculations below, the contents of one in_tex and the
-      // group of 4 texels loaded from kernel_in are shown:
+      // group of 4 texels loaded from t_kernel are shown:
       //
-      //   in_tex                 kernel_in
+      //   in_tex                 t_kernel
       //    -x->                   ---x--->
       //   +---+              +----+----+----+----+
       // ^ | w |           ^  | D0 | D1 | D2 | D3 |
@@ -128,7 +107,7 @@ void main() {
       //   | x |              | A0 | A1 | A2 | A3 |
       //   +---+              +----+----+----+----+
       //
-      // In the kernel_in graphic, cells sharing the same letter are from
+      // In the t_kernel graphic, cells sharing the same letter are from
       // the same batch/output channel index, and the number denotes a unique
       // channel index. To calculate the output texel, the following
       // calculation is performed:
@@ -154,7 +133,7 @@ void main() {
 
   for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
     if (all(lessThan(pos[i], out_limits))) {
-      imageStore(image_out, pos[i], op(sum[i], out_min, out_max));
+      imageStore(t_out, pos[i], op(sum[i], out_min, out_max));
     }
   }
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl