Update on "[ET-VK][Ops] aten.convolution (SlidingWindow)"

jorgep31415 · jorgep31415 · commit 9d3b71441c1a · 2024-04-05T15:16:12.000-07:00
## The Operator `nn.Module` invocations of [`nn.Conv2d`](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html#torch.nn.Conv2d) and [`nn.ConvTranspose2d`](https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html#torch.nn.ConvTranspose2d) get compiled to `aten.convolution.default` in the Edge Dialect, which carries the signature ``` - func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor ``` ## Summary (cases handled) We introduce support for the convolution cases covered by [ATen-VK's default SlidingWindow implementation](https://github.com/pytorch/pytorch/blob/09c72eaa3f69f90402c86a30abf4fc621298578c/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L73). This is achieved by - reusing the [existing `conv2d.glsl`](https://github.com/pytorch/pytorch/blob/09c72eaa3f69f90402c86a30abf4fc621298578c/aten/src/ATen/native/vulkan/glsl/conv2d.glsl), and - [moving special weights prepacking from CPU](https://github.com/pytorch/pytorch/blob/09c72eaa3f69f90402c86a30abf4fc621298578c/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L134-L235) to the GPU in `conv2d_prepack_weights.glsl`. We also include resizing support for dynamic shapes. Note that only height and width of the input can vary. ## Cases not handled The implementation is on par with ATen-VK's SlidingWindow. This means the following cases are missing: 1. **Groups G > 1.** Largely not covered by ATen-VK. `G = in_channels` is covered by ATen-VK's Depthwise impl and will be added soon. 2. **Batch (input) N > 1.** Not covered by ATen-VK. 3. **Padding > 0 while Dilation, Kernel > 1.** Not covered by ATen-VK. ## Coming soon 1. Transposed convolution 2. Depthwise convolution (for completeness) 3. Pointwise convolution (for optimization) 4. Null bias Differential Revision: [D55346778](https://our.internmc.facebook.com/intern/diff/D55346778/) [ghstack-poisoned]
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl
@@ -84,6 +84,7 @@ void main() {
     for (int y = start.y, ky = kstart.y; y < end.y; y += params.dilation.y, ++ky) {
       for (int x = start.x, kx = kstart.x; x < end.x; x += params.dilation.x, kx += 4) {
         const ${VEC4_T[DTYPE]} in_texel = texelFetch(image_in, ivec3(x, y, z4), 0);
+        const ivec4 kxs = kx + ivec4(0, 1, 2, 3);
 
         // To explain the calculation below, the contents of in_texel and the
         // group of 4 texels loaded from kernel_in are shown:
@@ -117,17 +118,10 @@ void main() {
         //
         // which is expressed in the following statements.
 
-        const ${VEC4_T[DTYPE]} ktex_0 = texelFetch(kernel_in, ivec2(kx + 0, ky), 0);
-        sum = fma(in_texel.xxxx, ktex_0, sum);
-
-        const ${VEC4_T[DTYPE]} ktex_1 = texelFetch(kernel_in, ivec2(kx + 1, ky), 0);
-        sum = fma(in_texel.yyyy, ktex_1, sum);
-
-        const ${VEC4_T[DTYPE]} ktex_2 = texelFetch(kernel_in, ivec2(kx + 2, ky), 0);
-        sum = fma(in_texel.zzzz, ktex_2, sum);
-
-        const ${VEC4_T[DTYPE]} ktex_3 = texelFetch(kernel_in, ivec2(kx + 3, ky), 0);
-        sum = fma(in_texel.wwww, ktex_3, sum);
+        sum = fma(in_texel.xxxx, texelFetch(kernel_in, ivec2(kxs.x, ky), 0), sum);
+        sum = fma(in_texel.yyyy, texelFetch(kernel_in, ivec2(kxs.y, ky), 0), sum);
+        sum = fma(in_texel.zzzz, texelFetch(kernel_in, ivec2(kxs.z, ky), 0), sum);
+        sum = fma(in_texel.wwww, texelFetch(kernel_in, ivec2(kxs.w, ky), 0), sum);
       }
     }
   }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl
@@ -32,7 +32,7 @@ layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
 }
 original_sizes;
 
-// Corresponds to {3,3,8,12} in the example below.
+// Corresponds to {8,12} in the example below.
 layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
   ivec2 data;
 }
@@ -53,7 +53,7 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
  * 1. Pad the N and C dims so that both are a multiple of 4. In this case, 2
  * batches and 1 channel of padding are added, producing a tensor of size
  * {12,8,3,3}.
- *      at::pad(x, {0,0,0,0,0,2,0,1}, "constant", 0);
+ *      at::pad(x, {0,0,0,0,0,1,0,2}, "constant", 0);
  *
  * 2. Split the tensor along the C dim so that each split has 4 channels.
  *      x.reshape({12,2,4,3,3});
@@ -94,8 +94,8 @@ void main() {
       base_index + ivec4(0, 1, 2, 3) * STRIDE_CHANNELS_PACKED(gpu_sizes.data);
 
   // Re-map the normal CPU buffer indices to special indices, through a series
-  // of mappings: reshape is a no-op to the underlying indices, pad is hard, and
-  // permute is one of the hardest math problems I've ever solved.
+  // of mappings: reshape is a no-op to the underlying indices, so we only map
+  // for pad and permute.
   const int Np = padded_sizes.data.y;
   const int Cp = padded_sizes.data.x;
   const int N = original_sizes.data.w;
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp
@@ -41,7 +41,7 @@ void resize_conv2d_node(
       *graph,
       self.sizes(),
       extra_args[0],
-      /*kernel_only = */ false,
+      /*kernel_size_only = */ false,
       extra_args[1],
       extra_args[2],
       extra_args[3]);
@@ -56,13 +56,11 @@ ValueRef prepack_biases(ComputeGraph& graph, const ValueRef vref) {
     VK_THROW("aten.convolution.default: Null bias is not supported yet!");
   }
 
-  TensorRef& tref = graph.get_val(vref).toTensorRef();
-  ValueRef v = graph.add_tensor(
-      tref.sizes,
-      tref.dtype,
+  ValueRef v = graph.add_tensor_like(
+      vref,
       api::StorageType::TEXTURE_2D,
       api::GPUMemoryLayout::TENSOR_WIDTH_PACKED);
-  vTensor t = graph.get_val(v).toTensor();
+  vTensor& t = graph.get_val(v).toTensor();
 
   api::ShaderInfo shader = get_nchw_to_image_shader(t);
 
@@ -110,7 +108,7 @@ ValueRef prepack_weights(ComputeGraph& graph, const ValueRef vref) {
       graph.get_val(vref).toTensorRef().dtype,
       api::StorageType::TEXTURE_2D,
       api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
-  vTensor t = graph.get_val(v).toTensor();
+  vTensor& t = graph.get_val(v).toTensor();
 
   api::utils::uvec3 global_size = t.extents();
   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
@@ -163,7 +161,7 @@ Conv2dParams create_conv2d_params(
   });
   const auto weight_sizes = graph.get_val(weight).toTensorRef().sizes;
   const int32_t in_group_size = api::utils::safe_downcast<int32_t>(
-      api::utils::align_up(weight_sizes.at(0), INT64_C(4)));
+      api::utils::align_up(weight_sizes.at(1), INT64_C(4)));
   return {overlay_region, in_group_size};
 }
 
@@ -187,21 +185,21 @@ void add_conv2d_node(
     const ValueRef dilation,
     const ValueRef out) {
   ValueRef arg_in = prepack_if_tensor_ref(graph, in);
+  ValueRef arg_weight = prepack_weights(graph, weight);
+  ValueRef arg_bias = prepack_biases(graph, bias);
+
   vTensor& t_in = graph.get_val(arg_in).toTensor();
   vTensor& t_out = graph.get_val(out).toTensor();
 
   check_conv2d_args(t_in, t_out);
 
-  ValueRef arg_weight = prepack_weights(graph, weight);
-  ValueRef arg_bias = prepack_biases(graph, bias);
-
   api::utils::uvec3 global_size = t_out.virtual_extents();
   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
   KernelParams kernel_params = create_kernel_params(
       graph,
       weight,
-      /*kernel_only = */ false,
+      /*kernel_size_only = */ false,
       stride,
       padding,
       dilation);
diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
@@ -39,7 +39,7 @@ void resize_max_pool2d_node(
       *graph,
       self.sizes(),
       extra_args[0],
-      /*kernel_only = */ true,
+      /*kernel_size_only = */ true,
       extra_args[1],
       extra_args[2],
       extra_args[3],
@@ -83,7 +83,12 @@ void add_max_pool2d_node(
   apply_dtype_suffix(kernel_name, t_out);
 
   KernelParams kernel_params = create_kernel_params(
-      graph, kernel_size, /*kernel_only = */ true, stride, padding, dilation);
+      graph,
+      kernel_size,
+      /*kernel_size_only = */ true,
+      stride,
+      padding,
+      dilation);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp
@@ -18,8 +18,8 @@ api::utils::ivec2 make_ivec2_from_list(ComputeGraph& graph, ValueRef vref) {
 api::utils::ivec2 make_ivec2_kernel_size(
     ComputeGraph& graph,
     const ValueRef weight,
-    const bool kernel_only) {
-  if (kernel_only) {
+    const bool kernel_size_only) {
+  if (kernel_size_only) {
     return make_ivec2_from_list(graph, weight);
   } else {
     const auto weight_sizes = graph.get_val(weight).toTensorRef().sizes;
@@ -30,12 +30,12 @@ api::utils::ivec2 make_ivec2_kernel_size(
 KernelParams create_kernel_params(
     ComputeGraph& graph,
     const ValueRef weight,
-    const bool kernel_only,
+    const bool kernel_size_only,
     const ValueRef stride,
     const ValueRef padding,
     const ValueRef dilation) {
   return {
-      make_ivec2_kernel_size(graph, weight, kernel_only),
+      make_ivec2_kernel_size(graph, weight, kernel_size_only),
       make_ivec2_from_list(graph, stride),
       make_ivec2_from_list(graph, padding),
       make_ivec2_from_list(graph, dilation),
@@ -63,15 +63,16 @@ std::vector<int64_t> calc_out_sizes_hw(
     ComputeGraph& graph,
     const std::vector<int64_t>& in_sizes,
     const ValueRef weight,
-    const bool kernel_only,
+    const bool kernel_size_only,
     const ValueRef stride,
     const ValueRef padding,
     const ValueRef dilation,
     const ValueRef ceil_mode) {
   const int64_t ndim = in_sizes.size();
   std::vector<int64_t> out_sizes(2);
 
-  const auto kernel_vec = make_ivec2_kernel_size(graph, weight, kernel_only);
+  const auto kernel_vec =
+      make_ivec2_kernel_size(graph, weight, kernel_size_only);
   const auto stride_vec = make_ivec2_from_list(graph, stride);
   const auto padding_vec = make_ivec2_from_list(graph, padding);
   const auto dilation_vec = make_ivec2_from_list(graph, dilation);
diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h
@@ -26,7 +26,7 @@ struct KernelParams final {
 KernelParams create_kernel_params(
     ComputeGraph& graph,
     const ValueRef weight,
-    const bool kernel_only,
+    const bool kernel_size_only,
     const ValueRef stride,
     const ValueRef padding,
     const ValueRef dilation);
@@ -35,7 +35,7 @@ std::vector<int64_t> calc_out_sizes_hw(
     ComputeGraph& graph,
     const std::vector<int64_t>& in_sizes,
     const ValueRef weight,
-    const bool kernel_only,
+    const bool kernel_size_only,
     const ValueRef stride,
     const ValueRef padding,
     const ValueRef dilation,