@@ -21,78 +21,112 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
 layout(set = 0, binding = 2) uniform PRECISION sampler3D kernel_in;
 layout(set = 0, binding = 3) uniform PRECISION sampler3D bias_in;
 
-layout(set = 0, binding = 4) uniform PRECISION restrict Out_channels {
-  int data;
-}
-out_channels;
+layout(set = 0, binding = 4) uniform PRECISION restrict In_length {
+  int in_length;
+};
 
-layout(set = 0, binding = 5) uniform PRECISION restrict In_length {
-  int data;
-}
-in_length;
+layout(set = 0, binding = 5) uniform PRECISION restrict Kernel_size {
+  int kernel_size;
+};
 
-layout(set = 0, binding = 6) uniform PRECISION restrict Kernel_size {
-  int data;
-}
-kernel_size;
+layout(set = 0, binding = 6) uniform PRECISION restrict Stride {
+  int stride;
+};
 
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+layout(set = 0, binding = 7) uniform PRECISION restrict Padding {
+  int padding;
+};
 
-/*
- * This implementation optimize for simplicity (and partially performance) for a
- * (1, C, L) where C == groups. Hence we only focus on calculating the rolling
- * kernel of the L dimension.
- */
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+layout(set = 0, binding = 8) uniform PRECISION restrict Dilation {
+  int dilation;
+};
 
-  // The global workgroup should have taken care of it. We only perform one
-  // work item for each 1d tensor on lengths
-  if (pos.x >= 1) {
-    return;
-  }
+layout(set = 0, binding = 9) uniform PRECISION restrict In_group_size {
+  int in_group_size;
+};
 
-  int c = pos.y;
-  if (c >= out_channels.data) {
-    return;
-  }
+layout(set = 0, binding = 10) uniform PRECISION restrict Out_group_size {
+  int out_group_size;
+};
 
-  // Assume n = 1, do not handle n > 1 case for now.
-  int n = pos.z;
-  if (n >= 1) {
-    return;
-  }
+layout(set = 0, binding = 11) uniform PRECISION restrict Batch_size {
+  int batch_size;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-  vec4 bias = texelFetch(bias_in, ivec3(c, 0, 0), 0);
-
-  for (int i = 0; i < in_length.data - kernel_size.data + 1; ++i) {
-    vec4 v = vec4(0);
-    for (int k = 0; k < kernel_size.data; ++k) {
-      const ivec3 in_pos = ivec3(i+k, c, 0);
-      const vec4 input_value = texelFetch(image_in, in_pos, 0);
-
-      // Note that we are reading weight in the inner loop, this could be
-      // improved by moving it before the outer loop. Since the weight vector is
-      // contant for the entire call.
-
-      // weight in input-space: (c, 0, k);
-      // notice that c is 4-packed. We need to mod 4 to get the actual weight.
-      const ivec3 w_pos = ivec3(k, 0, c / 4);
-      const vec4 weight = texelFetch(kernel_in, w_pos, 0);
-
-      float w = weight.x;
-      if (c % 4 == 1) {
-        w = weight.y;
-      } else if (c % 4 == 2) {
-        w = weight.z;
-      } else if (c % 4 == 3) {
-        w = weight.w;
+// Let us define
+//
+// input = (N, in_C, in_L),
+// output = (N, out_C, out_L),
+// groups = G,
+// kernel = K,
+//
+// which results in shapes
+//
+// weight = (out_C, in_C / G, K),
+// bias = (out_C,).
+//
+// This implementation performs out_C shader invocations, where each invocation
+// calculates the rolling kernel of the length dimension for each batch, i.e.,
+// computes out_L * N results.
+//
+// Note that we can rewrite this implementation as out_L * out_C * ceil(N / 4)
+// shader invocations, where each invocation computes 1 result. But that
+// performs worse.
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // "out_c" is the output's channel index where we write our result.
+  // Across shader invocations, this is the only value that varies.
+  int out_c = pos.y;
+  vec4 bias = texelFetch(bias_in, ivec3(out_c, 0, 0), 0);
+
+  // "in_c" tracks the input's channel start index.
+  // We iterate over the input group that corresponds to the output group.
+  int c_start = (out_c / out_group_size) * in_group_size;
+  int c_end = c_start + in_group_size;
+
+  // "in_l" tracks the input's length start index for our input-kernel overlay
+  // region.
+  int l_start = -padding;
+  int l_end = in_length + padding - dilation * (kernel_size - 1);
+
+  // Since the input/output tensors are channel-packed, which is along the
+  // batch dimension, we can batch-read/write four elements at a time.
+  for (int n = 0; n < batch_size; n += 4) {
+    // "out_l" tracks the output's length index where we write our result.
+    int out_l = 0;
+
+    for (int in_l = l_start; in_l < l_end; in_l += stride, ++out_l) {
+      vec4 sum = vec4(0);
+
+      for (int in_c = c_start; in_c < c_end; ++in_c) {
+        // "k" tracks the kernel's index for our input-kernel computation.
+        // It reads out-of-bound zeros, but trying to avoid them complicates
+        // for-loop conditions, which results in worse performance.
+        for (int k = 0; k < kernel_size; k += 4) {
+          // Since the weight tensor is width-packed, which is along the length
+          // dimension, we can batch-read four elements at a time.
+          const ivec3 w_pos = ivec3(k / 4, in_c % in_group_size, out_c);
+          const vec4 weight = texelFetch(kernel_in, w_pos, 0);
+
+          const ivec3 in_pos_0 = ivec3(in_l + k * dilation, in_c, n / 4);
+          sum = fma(weight.xxxx, texelFetch(image_in, in_pos_0, 0), sum);
+
+          const ivec3 in_pos_1 = ivec3(in_l + (k+1) * dilation, in_c, n / 4);
+          sum = fma(weight.yyyy, texelFetch(image_in, in_pos_1, 0), sum);
+
+          const ivec3 in_pos_2 = ivec3(in_l + (k+2) * dilation, in_c, n / 4);
+          sum = fma(weight.zzzz, texelFetch(image_in, in_pos_2, 0), sum);
+
+          const ivec3 in_pos_3 = ivec3(in_l + (k+3) * dilation, in_c, n / 4);
+          sum = fma(weight.wwww, texelFetch(image_in, in_pos_3, 0), sum);
+        }
       }
 
-      v += w * input_value.x;
+      ivec3 out_pos = ivec3(out_l, out_c, n / 4);
+      imageStore(image_out, out_pos, sum + bias.x);
     }
-
-    ivec3 out_pos = ivec3(i, c, 0);
-    imageStore(image_out, out_pos, vec4(v.x + bias.x, 0, 0, 0));
   }
 }
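
For reference, the following is a minimal CPU sketch in C of the grouped, dilated conv1d that the new shader computes. It is not part of the change: conv1d_ref, in_at, and the concrete sizes are illustrative assumptions, and it produces one scalar per output element instead of the shader's 4-wide texel reads. The zero-padded in_at read mirrors the shader's reliance on out-of-bound fetches returning zero, and OUT_L equals the iteration count of the shader's length loop (in_l from -padding while in_l < in_length + padding - dilation * (kernel_size - 1), stepping by stride).

#include <stdio.h>

/* Illustrative sizes (assumptions, not values used by the shader). */
enum { N = 1, IN_C = 4, OUT_C = 4, G = 2, IN_L = 8, K = 3,
       STRIDE = 1, PADDING = 1, DILATION = 1 };

/* Standard conv1d output length; equals the number of iterations of the
 * shader's length loop. */
enum { OUT_L = (IN_L + 2 * PADDING - DILATION * (K - 1) - 1) / STRIDE + 1 };

/* Zero-padded read of input[n][c][l]; the shader instead relies on
 * out-of-bound texel fetches returning zero. */
static float in_at(float in[N][IN_C][IN_L], int n, int c, int l) {
  return (l < 0 || l >= IN_L) ? 0.0f : in[n][c][l];
}

static void conv1d_ref(float in[N][IN_C][IN_L],
                       float w[OUT_C][IN_C / G][K],
                       float b[OUT_C],
                       float out[N][OUT_C][OUT_L]) {
  const int in_group = IN_C / G, out_group = OUT_C / G;
  for (int n = 0; n < N; ++n) {
    for (int out_c = 0; out_c < OUT_C; ++out_c) {
      /* Group mapping, as in the shader:
       * c_start = (out_c / out_group_size) * in_group_size. */
      const int c_start = (out_c / out_group) * in_group;
      for (int out_l = 0; out_l < OUT_L; ++out_l) {
        /* in_l = l_start + out_l * stride, with l_start = -padding. */
        const int in_l = out_l * STRIDE - PADDING;
        float sum = b[out_c];
        for (int in_c = 0; in_c < in_group; ++in_c) {
          for (int k = 0; k < K; ++k) {
            sum += w[out_c][in_c][k] *
                   in_at(in, n, c_start + in_c, in_l + k * DILATION);
          }
        }
        out[n][out_c][out_l] = sum;
      }
    }
  }
}

int main(void) {
  static float in[N][IN_C][IN_L], w[OUT_C][IN_C / G][K], b[OUT_C];
  static float out[N][OUT_C][OUT_L];
  in[0][0][1] = 1.0f;
  w[0][0][0] = 2.0f;
  b[0] = 0.5f;
  conv1d_ref(in, w, b, out);
  /* With PADDING = 1, output position 2 overlays kernel tap 0 on input
   * position 1: expect 2 * 1 + 0.5 = 2.5. */
  printf("out_L = %d, out[0][0][2] = %.1f\n", OUT_L, out[0][0][2]);
  return 0;
}

A scalar reference like this checks the group mapping and the padding/stride/dilation indexing without involving the texture packing, which is where most of the shader's remaining complexity lives.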