[ET-VK] Making stride equals dilation the default mode for conv2d dw.

trivedivivek · trivedivivek · commit 1e975b95ce5a · 2025-01-10T12:55:27.000-08:00
Pull Request resolved: #7596 This diff makes stride-equals-dilation the default mode for the conv2d dw output op, and adds a separate source file to handle the case where stride does not equal dilation. Differential Revision: [D67979760](https://our.internmc.facebook.com/intern/diff/D67979760/) ghstack-source-id: 260951738
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -14,8 +14,6 @@
 
 #define TILE_SIZE ${TILE_SIZE}
 
-#define STRIDE_EQ_DILATION ${STRIDE_EQ_DILATION}
-
 #define BATCH_SIZE_X ${BATCH_SIZE_X}
 
 #define BATCH_SIZE_Y ${BATCH_SIZE_Y}
@@ -45,7 +43,6 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
  * output at a single output location.
  */
 
-#if STRIDE_EQ_DILATION
 void main() {
   // x and y are divided by batch size to determine 3d position
   // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z
@@ -127,42 +124,3 @@ void main() {
     }
   }
 }
-
-#else
-void main() {
-  const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x;
-  const ivec3 pos = ivec3(
-    gl_GlobalInvocationID.x % out_limits.x,
-    div_by_x % out_limits.y,
-    div_by_x / out_limits.y);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  // Compute the index of the top-left element of the overlay region. Negative
-  // indices indicate that the top-left element is in a region added by padding.
-  const ivec2 ipos = pos.xy * stride - padding;
-
-  // Compute the start and end of the input indices to load. Padding is assumed
-  // to be constant 0 padding, so any reads from the padding region is skipped.
-  const ivec2 start = ipos;
-  const ivec2 end = ipos + overlay_region.xy;
-
-  VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
-  int kx = 0;
-  for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) {
-    for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) {
-      // The weight kernel was rearranged such that every NxN filter is
-      // flattened to fit in one row. Each filter was then stacked on top of
-      // each other vertically.
-      const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0);
-      sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum);
-      kx++;
-    }
-  }
-
-  imageStore(t_out, pos, op(sum, out_min, out_max));
-}
-
-#endif
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
@@ -12,7 +12,6 @@ conv2d_dw_output_tile:
     TILE_SIZE: 3
     BATCH_SIZE_X: 4
     BATCH_SIZE_Y: 2
-    STRIDE_EQ_DILATION: 0
   generate_variant_forall:
     DTYPE:
       - VALUE: half
@@ -26,15 +25,3 @@ conv2d_dw_output_tile:
     - NAME: conv2d_dw_output_tile_5x5_clamp
       OPERATOR: clamp(X, A, B)
       TILE_SIZE: 5
-    - NAME: conv2d_dw_sed_output_tile_3x3
-      STRIDE_EQ_DILATION: 1
-    - NAME: conv2d_dw_sed_output_tile_3x3_clamp
-      OPERATOR: clamp(X, A, B)
-      STRIDE_EQ_DILATION: 1
-    - NAME: conv2d_dw_sed_output_tile_5x5
-      TILE_SIZE: 5
-      STRIDE_EQ_DILATION: 1
-    - NAME: conv2d_dw_sed_output_tile_5x5_clamp
-      OPERATOR: clamp(X, A, B)
-      TILE_SIZE: 5
-      STRIDE_EQ_DILATION: 1
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+
+#define TILE_SIZE ${TILE_SIZE}
+
+#define op(X, A, B) ${OPERATOR}
+
+#include "indexing_utils.h"
+
+layout(std430) buffer;
+
+${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
+${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
+${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
+${layout_declare_ubo(4, "ivec3", "out_limits")}
+${layout_declare_ubo(5, "ivec4", "in_sizes")}
+${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
+${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
+${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Computes a depthwise convolution. Each shader invocation calculates the
+ * output at a single output location.
+ */
+
+void main() {
+  const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x;
+  const ivec3 pos = ivec3(
+    gl_GlobalInvocationID.x % out_limits.x,
+    div_by_x % out_limits.y,
+    div_by_x / out_limits.y);
+
+  if (any(greaterThanEqual(pos, out_limits))) { // outside the output extents
+    return;
+  }
+
+  // Compute the index of the top-left element of the overlay region. Negative
+  // indices mean the top-left element lies in a region added by padding.
+  const ivec2 ipos = pos.xy * stride - padding;
+
+  // First input index to load; `end` is computed here but not read below.
+  // NOTE(review): padding reads are not skipped explicitly -- confirm intent.
+  const ivec2 start = ipos;
+  const ivec2 end = ipos + overlay_region.xy;
+
+  VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
+  int kx = 0; // running column index into the kernel row selected by pos.z
+  for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) {
+    for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) {
+      // The weight kernel was rearranged so that every NxN filter is
+      // flattened into one row, with the filters stacked vertically: kx
+      // indexes within the row and pos.z selects the row.
+      const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0);
+      sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum);
+      kx++;
+    }
+  }
+
+  imageStore(t_out, pos, op(sum, out_min, out_max));
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml
@@ -0,0 +1,25 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv2d_dw_sned_output_tile:
+  parameter_names_with_default_values:
+    OPERATOR: X
+    NDIM: 3
+    DTYPE: float
+    TILE_SIZE: 3
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: conv2d_dw_sned_output_tile_3x3
+    - NAME: conv2d_dw_sned_output_tile_3x3_clamp
+      OPERATOR: clamp(X, A, B)
+    - NAME: conv2d_dw_sned_output_tile_5x5
+      TILE_SIZE: 5
+    - NAME: conv2d_dw_sned_output_tile_5x5_clamp
+      OPERATOR: clamp(X, A, B)
+      TILE_SIZE: 5
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -134,8 +134,8 @@ vkapi::ShaderInfo get_conv2d_shader(
     case Conv2dMethod::Depthwise:
       kernel_name = "conv2d_dw";
       if (!prepack_weights) {
-        if (stride_equals_dilation) {
-          kernel_name += "_sed";
+        if (!stride_equals_dilation) {
+          kernel_name += "_sned";
         }
         const auto& weight_sizes = graph.get_tref(weight)->sizes;
         if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) {