Commit 1d467d0

copyrightly authored and facebook-github-bot committed
conv1d, special case
Summary: We follow D50914117 to implement a specific case of conv1d for our needs. Specifically, we require:

- the input tensor to have a single batch
- groups == in_channels == out_channels
- weight_sizes.at(1) == 1
- stride == 1
- padding == 0
- dilation == 1

We assume `bias == True`; the `bias == False` case is handled in the next diff. General cases and optimizations will be enabled later.

Reviewed By: jorgep31415

Differential Revision: D56220143

fbshipit-source-id: a18de3a463875b9617cb7930febf7622fe866536
1 parent 70baafe

File tree: 5 files changed, +288 −19 lines
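To make the configuration in the summary concrete: in eager mode, this special case corresponds to a depthwise `torch.nn.Conv1d`. A minimal sketch (sizes borrowed from the new test below; this snippet is illustrative, not part of the diff):

```python
import torch

# Depthwise conv1d: groups == in_channels == out_channels, single batch,
# stride 1, padding 0, dilation 1, bias enabled.
C, L, K = 6, 7, 3
conv = torch.nn.Conv1d(
    in_channels=C, out_channels=C, kernel_size=K, groups=C, bias=True
)
assert conv.weight.shape == (C, 1, K)  # weight_sizes.at(1) == 1

x = torch.randn(1, C, L)             # single batch
y = conv(x)
assert y.shape == (1, C, L - K + 1)  # "valid" convolution output length
```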
Lines changed: 98 additions & 0 deletions

```glsl
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}

#include "indexing_utils.h"

layout(std430) buffer;

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
layout(set = 0, binding = 2) uniform PRECISION sampler3D kernel_in;
layout(set = 0, binding = 3) uniform PRECISION sampler3D bias_in;

layout(set = 0, binding = 4) uniform PRECISION restrict Out_channels {
  int data;
}
out_channels;

layout(set = 0, binding = 5) uniform PRECISION restrict In_length {
  int data;
}
in_length;

layout(set = 0, binding = 6) uniform PRECISION restrict Kernel_size {
  int data;
}
kernel_size;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

/*
 * This implementation optimizes for simplicity (and partially for
 * performance) on a (1, C, L) input where C == groups. Hence we only focus
 * on calculating the rolling kernel along the L dimension.
 */
void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  // The global workgroup should have taken care of this already. We only
  // perform one work item for each 1d tensor along the length dimension.
  if (pos.x >= 1) {
    return;
  }

  int c = pos.y;
  if (c >= out_channels.data) {
    return;
  }

  // Assume n == 1; the n > 1 case is not handled for now.
  int n = pos.z;
  if (n >= 1) {
    return;
  }

  vec4 bias = texelFetch(bias_in, ivec3(c, 0, 0), 0);

  for (int i = 0; i < in_length.data - kernel_size.data + 1; ++i) {
    vec4 v = vec4(0);
    for (int k = 0; k < kernel_size.data; ++k) {
      const ivec3 in_pos = ivec3(i + k, c, 0);
      const vec4 input_value = texelFetch(image_in, in_pos, 0);

      // Note that we read the weight in the inner loop. This could be
      // improved by moving the read before the outer loop, since the weight
      // vector is constant for the entire call.

      // Weight in input space: (c, 0, k). Notice that c is 4-packed: we
      // divide by 4 to locate the texel and mod by 4 to select the actual
      // weight component.
      const ivec3 w_pos = ivec3(k, 0, c / 4);
      const vec4 weight = texelFetch(kernel_in, w_pos, 0);

      float w = weight.x;
      if (c % 4 == 1) {
        w = weight.y;
      } else if (c % 4 == 2) {
        w = weight.z;
      } else if (c % 4 == 3) {
        w = weight.w;
      }

      v += w * input_value.x;
    }

    ivec3 out_pos = ivec3(i, c, 0);
    imageStore(image_out, out_pos, vec4(v.x + bias.x, 0, 0, 0));
  }
}
```
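As a cross-check of the shader's logic, here is a plain-Python restatement of the per-channel rolling dot product (a sketch, not code from this diff; the packing comment mirrors the `c / 4`, `c % 4` indexing above):

```python
import torch

def conv1d_depthwise_reference(x, w, b):
    # x: (1, C, L), w: (C, 1, K), b: (C,) -- the shapes this shader assumes.
    _, C, L = x.shape
    K = w.shape[2]
    out = torch.empty(1, C, L - K + 1)
    for c in range(C):                  # one shader invocation per channel (pos.y)
        for i in range(L - K + 1):      # rolling window along the length dim
            acc = 0.0
            for k in range(K):
                # The shader fetches the weight texel at (k, 0, c / 4) and
                # selects the component with c % 4, since channels are 4-packed.
                acc += w[c, 0, k] * x[0, c, i + k]
            out[0, c, i] = acc + b[c]
    return out

conv = torch.nn.Conv1d(6, 6, 3, groups=6, bias=True)
x = torch.randn(1, 6, 7)
assert torch.allclose(
    conv1d_depthwise_reference(x, conv.weight.detach(), conv.bias.detach()),
    conv(x).detach(),
    atol=1e-6,
)
```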
Lines changed: 17 additions & 0 deletions

```yaml
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

conv1d:
  parameter_names_with_default_values:
    NDIM: 3
    DTYPE: float
    PACKING: C_packed
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
  shader_variants:
    - NAME: conv1d
```
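Assuming the usual codegen convention (one shader variant per `DTYPE` entry, with a dtype suffix appended to the variant name, which is what the `add_dtype_suffix` call in Convolution.cpp below relies on), the YAML above should expand to names like these; a small sketch under that assumption:

```python
# Assumed naming convention: "<shader_variant NAME>_<dtype>".
dtypes = ["half", "float"]
variants = [f"conv1d_{dtype}" for dtype in dtypes]
print(variants)  # ['conv1d_half', 'conv1d_float']
```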

backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp renamed to backends/vulkan/runtime/graph/ops/impl/Convolution.cpp

Lines changed: 136 additions & 17 deletions

```diff
@@ -17,8 +17,6 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
-#include <iostream>
-
 namespace vkcompute {
 
 void resize_conv2d_node(
@@ -56,6 +54,29 @@ void resize_conv2d_node(
   out->virtual_resize(new_out_sizes);
 }
 
+void resize_conv1d_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
+  vTensorPtr self = graph->get_tensor(args[1].refs[0]);
+  TensorRefPtr weight_ref = graph->get_tref(extra_args[0]);
+  const std::vector<int64_t>& weight_sizes = weight_ref->sizes;
+
+  const std::vector<int64_t>& in_sizes = self->sizes();
+  size_t ndim = in_sizes.size();
+  std::vector<int64_t> new_out_sizes(ndim);
+
+  int64_t kernel_size = weight_sizes.at(2);
+  int64_t in_length = in_sizes.at(2);
+
+  new_out_sizes.at(0) = in_sizes.at(0);
+  new_out_sizes.at(1) = in_sizes.at(1);
+  new_out_sizes.at(2) = in_length - kernel_size + 1;
+
+  out->virtual_resize(new_out_sizes);
+}
+
 ValueRef prepack_biases(
     ComputeGraph& graph,
     const ValueRef vref,
@@ -219,7 +240,7 @@ ValueRef prepack_weights(
   return v;
 }
 
-void check_conv2d_args(const vTensor& in, const vTensor& out) {
+void check_conv_args(const vTensor& in, const vTensor& out) {
   if (in.sizes().at(0) > 1) {
     VK_THROW(
         "aten.convolution.default: input batch size > 1 is not supported yet!");
@@ -312,7 +333,7 @@ void add_conv2d_node(
 
   vTensorPtr t_in = graph.get_tensor(arg_in);
   vTensorPtr t_out = graph.get_tensor(out);
-  check_conv2d_args(*t_in, *t_out);
+  check_conv_args(*t_in, *t_out);
 
   api::utils::uvec3 global_size = t_out->extents();
   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
@@ -352,23 +373,121 @@
       {weight, stride, padding, dilation, transposed, output_padding}));
 }
 
-void conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  return add_conv2d_node(
+void add_conv1d_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef groups,
+    const ValueRef out) {
+  if (graph.val_is_none(bias)) {
+    VK_THROW("conv1d: Null bias is not supported yet!");
+  }
+
+  ValueRef arg_in = prepack_if_tensor_ref(graph, in);
+  ValueRef arg_weight =
+      prepack_if_tensor_ref(graph, weight, graph.memory_layout_of(arg_in));
+  ValueRef arg_bias =
+      prepack_if_tensor_ref(graph, bias, graph.memory_layout_of(arg_in));
+
+  vTensorPtr t_in = graph.get_tensor(arg_in);
+  vTensorPtr t_weight = graph.get_tensor(arg_weight);
+  vTensorPtr t_bias = graph.get_tensor(arg_bias);
+  vTensorPtr t_out = graph.get_tensor(out);
+  const int64_t groups_val = graph.get_int(groups);
+
+  std::vector<int64_t> in_sizes = t_in->sizes();
+  std::vector<int64_t> weight_sizes = t_weight->sizes();
+  std::vector<int64_t> out_sizes = t_out->sizes();
+  IntListPtr stride_sizes = graph.get_int_list(stride);
+  IntListPtr padding_sizes = graph.get_int_list(padding);
+  IntListPtr dilation_sizes = graph.get_int_list(dilation);
+  int64_t weight_out_channels = weight_sizes.at(0);
+  int64_t kernel_size = weight_sizes.at(2);
+  int64_t in_length = in_sizes.at(2);
+
+  VK_CHECK_COND(in_sizes.size() == 3, "input must be a 3-dim tensor");
+  VK_CHECK_COND(weight_sizes.size() == 3, "weight must be a 3-dim tensor");
+  VK_CHECK_COND(
+      stride_sizes->size() == 1 && stride_sizes->at(0) == 1,
+      "stride must be 1");
+  VK_CHECK_COND(
+      padding_sizes->size() == 1 && padding_sizes->at(0) == 0,
+      "padding must be 0");
+  VK_CHECK_COND(
+      dilation_sizes->size() == 1 && dilation_sizes->at(0) == 1,
+      "dilation must be 1");
+  VK_CHECK_COND(
+      groups_val == in_sizes.at(1), "groups must be equal to in_channels");
+  VK_CHECK_COND(
+      groups_val == weight_sizes.at(0),
+      "groups must be equal to weight_sizes.at(0)");
+  VK_CHECK_COND(weight_sizes.at(1) == 1, "weight_sizes.at(1) must be 1");
+
+  check_conv_args(*t_in, *t_out);
+
+  api::utils::uvec3 global_size = {
+      1, static_cast<uint32_t>(weight_out_channels), 1};
+  api::utils::uvec3 local_size = {1, 1, 1};
+
+  std::string kernel_name("conv1d");
+  kernel_name.reserve(kShaderNameReserve);
+
+  add_dtype_suffix(kernel_name, *t_out);
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
-      args[0],
-      args[1],
-      args[2],
-      args[3],
-      args[4],
-      args[5],
-      args[6],
-      args[7],
-      args[8],
-      args[9]);
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_size,
+      local_size,
+      // Inputs and Outputs
+      {{out, api::MemoryAccessType::WRITE},
+       {{arg_in, arg_weight, arg_bias}, api::MemoryAccessType::READ}},
+      // Shader params buffers
+      {
+          graph.create_params_buffer(weight_out_channels),
+          graph.create_params_buffer(in_length),
+          graph.create_params_buffer(kernel_size),
+      },
+      // Resizing
+      resize_conv1d_node,
+      {weight}));
+}
+
+void conv(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  int64_t in_ndim = graph.get_tensor(args[0])->sizes().size();
+  if (in_ndim == 4) {
+    return add_conv2d_node(
+        graph,
+        args[0],
+        args[1],
+        args[2],
+        args[3],
+        args[4],
+        args[5],
+        args[6],
+        args[7],
+        args[8],
+        args[9]);
+  } else {
+    return add_conv1d_node(
+        graph,
+        args[0],
+        args[1],
+        args[2],
+        args[3],
+        args[4],
+        args[5],
+        args[8],
+        args[9]);
+  }
 }
 
 REGISTER_OPERATORS {
-  VK_REGISTER_OP(aten.convolution.default, conv2d);
+  VK_REGISTER_OP(aten.convolution.default, conv);
 }
 
 } // namespace vkcompute
```
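The shape math in `resize_conv1d_node` is the stride-1, zero-padding, dilation-1 special case of the general convolution output-length formula; a quick sketch of the reduction:

```python
def conv1d_out_length(L, K, stride=1, padding=0, dilation=1):
    # General output length for aten.convolution semantics:
    #   floor((L + 2*padding - dilation*(K - 1) - 1) / stride) + 1
    return (L + 2 * padding - dilation * (K - 1) - 1) // stride + 1

# With the restrictions enforced by add_conv1d_node (stride == 1,
# padding == 0, dilation == 1), this collapses to L - K + 1,
# which is exactly what resize_conv1d_node computes.
assert conv1d_out_length(7, 3) == 7 - 3 + 1 == 5
```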

backends/vulkan/test/op_tests/cases.py

Lines changed: 13 additions & 2 deletions

```diff
@@ -66,7 +66,7 @@ def get_pool2d_inputs():
     return test_suite
 
 
-def get_conv2d_inputs():
+def get_conv_inputs():
     test_suite = VkTestSuite(
         [
             (
@@ -124,6 +124,17 @@ def get_conv2d_inputs():
                 [0, 0],
                 1,
             ),
+            (
+                (1, 6, 7),
+                (6, 1, 3),
+                (6,),
+                [1],
+                [0],
+                [1],
+                False,
+                [0],
+                6,
+            ),
         ]
     )
     return test_suite
@@ -297,7 +308,7 @@ def get_slice_inputs():
     "aten.mul.Tensor": get_binary_elementwise_inputs(),
     "aten.mm.default": get_mm_inputs(),
     "aten.max_pool2d_with_indices.default": get_pool2d_inputs(),
-    "aten.convolution.default": get_conv2d_inputs(),
+    "aten.convolution.default": get_conv_inputs(),
     "aten.native_layer_norm.default": get_native_layer_norm_inputs(),
     "aten.full.default": get_full_inputs(),
     "aten.select.int": get_select_int_inputs(),
```

backends/vulkan/test/test_vulkan_delegate.py

Lines changed: 24 additions & 0 deletions

```diff
@@ -648,6 +648,30 @@ def forward(self, x):
             memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
         )
 
+    def test_vulkan_backend_conv1d(self):
+        class Conv1dModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv1d(
+                    in_channels=6,
+                    out_channels=6,
+                    kernel_size=3,
+                    groups=6,
+                    bias=True,
+                )
+
+            def forward(self, x):
+                return self.conv(x)
+
+        conv1d_module = Conv1dModule()
+        sample_inputs = (torch.randn(size=(1, 6, 7), dtype=torch.float32),)
+
+        self.lower_module_and_test_output(
+            conv1d_module,
+            sample_inputs,
+            memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
+        )
+
     def test_vulkan_backend_native_layer_norm(self):
         class NativeLayerNormModule(torch.nn.Module):
             def __init__(self):
```
