
Commit 6c50546

Update base for Update on "[ET-VK][Ops] aten.convolution (SlidingWindow)"
## The Operator

`nn.Module` invocations of [`nn.Conv2d`](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html#torch.nn.Conv2d) and [`nn.ConvTranspose2d`](https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html#torch.nn.ConvTranspose2d) are compiled to `aten.convolution.default` in the Edge Dialect, which carries the signature

```
- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor
```

## Summary (cases handled)

We introduce support for the convolution cases covered by [ATen-VK's default SlidingWindow implementation](https://github.com/pytorch/pytorch/blob/09c72eaa3f69f90402c86a30abf4fc621298578c/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L73). This is achieved by

- reusing the [existing `conv2d.glsl`](https://github.com/pytorch/pytorch/blob/09c72eaa3f69f90402c86a30abf4fc621298578c/aten/src/ATen/native/vulkan/glsl/conv2d.glsl), and
- [moving the special weights prepacking from CPU](https://github.com/pytorch/pytorch/blob/09c72eaa3f69f90402c86a30abf4fc621298578c/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L134-L235) to the GPU in `conv2d_prepack_weights.glsl`.

We also include resizing support for dynamic shapes. Note that only the height and width of the input can vary.

## Cases not handled

The implementation is on par with ATen-VK's SlidingWindow. This means the following cases are missing (a hypothetical guard capturing these constraints is sketched below):

1. **Groups G > 1.** Largely not covered by ATen-VK. `G = in_channels` is covered by ATen-VK's Depthwise impl and will be added soon.
2. **Batch (input) N > 1.** Not covered by ATen-VK.
3. **Padding > 0 while Dilation, Kernel > 1.** Not covered by ATen-VK.

## Coming soon

1. Transpose convolution
2. Depthwise convolution (for completeness)
3. Pointwise convolution (for optimization)
4. Null bias

Differential Revision: [D55346778](https://our.internmc.facebook.com/intern/diff/D55346778/)

[ghstack-poisoned]
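To make the handled/unhandled cases concrete, the constraint set reduces to a small predicate. A minimal sketch follows; `sliding_window_supported` is a hypothetical helper for illustration, not part of this diff:

```cpp
#include <cstdint>

// Hypothetical guard mirroring the "cases not handled" list above: the
// SlidingWindow path assumes groups == 1, batch == 1, and no padding
// combined with dilation and kernel size both > 1.
bool sliding_window_supported(
    int64_t groups,
    int64_t batch,
    int64_t padding,
    int64_t dilation,
    int64_t kernel) {
  if (groups > 1) {
    return false; // case 1: grouped convolution
  }
  if (batch > 1) {
    return false; // case 2: batched input
  }
  if (padding > 0 && dilation > 1 && kernel > 1) {
    return false; // case 3: padding with a dilated kernel
  }
  return true;
}
```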
1 parent 1f1a2c2 commit 6c50546

File tree

10 files changed: +168 −72 lines changed

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 16 additions & 5 deletions

```diff
@@ -132,16 +132,27 @@ ValueRef ComputeGraph::add_tensor(
       sizes, dtype, suggested_storage_type(), memory_layout, shared_object_idx);
 }
 
+ValueRef ComputeGraph::add_tensor_like(
+    const ValueRef vref,
+    const api::StorageType storage_type,
+    const api::GPUMemoryLayout memory_layout) {
+  TensorRef& tref = get_val(vref).toTensorRef();
+  return add_tensor(tref.sizes, tref.dtype, storage_type, memory_layout);
+}
+
+ValueRef ComputeGraph::add_tensor_like(
+    const ValueRef vref,
+    const api::GPUMemoryLayout memory_layout) {
+  TensorRef& tref = get_val(vref).toTensorRef();
+  return add_tensor(tref.sizes, tref.dtype, memory_layout);
+}
+
 ValueRef ComputeGraph::add_tensor(
     const std::vector<int64_t>& sizes,
     const api::ScalarType dtype,
     const int64_t shared_object_idx) {
   return add_tensor(
-      sizes,
-      dtype,
-      suggested_storage_type(),
-      suggested_memory_layout(sizes),
-      shared_object_idx);
+      sizes, dtype, suggested_memory_layout(sizes), shared_object_idx);
 }
 
 ValueRef ComputeGraph::add_tensorref(
```
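The new `add_tensor_like` overloads let op implementations clone a `TensorRef`'s sizes and dtype without unpacking them by hand (see the simplified `prepack` in Staging.cpp below). A minimal call-site sketch, assuming a `ComputeGraph graph` and a `ValueRef weight_ref` holding a `TensorRef`; the specific enum values are illustrative:

```cpp
// Explicit storage type and memory layout:
ValueRef packed = graph.add_tensor_like(
    weight_ref,
    api::StorageType::TEXTURE_2D,
    api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);

// Or let the graph pick its suggested storage type:
ValueRef staged = graph.add_tensor_like(
    weight_ref, api::GPUMemoryLayout::TENSOR_WIDTH_PACKED);
```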

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 18 additions & 2 deletions

```diff
@@ -172,7 +172,7 @@ class ComputeGraph final {
       const api::ScalarType dtype,
       const api::StorageType storage_type,
       const api::GPUMemoryLayout memory_layout,
-      const int64_t shared_object_idx);
+      const int64_t shared_object_idx = -1);
 
   /*
    * Add a `vTensor` value to the graph with the specified properties. The
@@ -191,9 +191,25 @@ class ComputeGraph final {
    */
   ValueRef add_tensor(
       const std::vector<int64_t>& sizes,
-      const api::ScalarType dtype = api::ScalarType::Float,
+      const api::ScalarType dtype,
       const int64_t shared_object_idx = -1);
 
+  /*
+   * Add a `vTensor` value to the graph with the properties of `vref`.
+   */
+  ValueRef add_tensor_like(
+      const ValueRef vref,
+      const api::StorageType storage_type,
+      const api::GPUMemoryLayout memory_layout);
+
+  /*
+   * Add a `vTensor` value to the graph with the properties of `vref`. The
+   * suggested storage type will be used to construct the `vTensor`.
+   */
+  ValueRef add_tensor_like(
+      const ValueRef vref,
+      const api::GPUMemoryLayout memory_layout);
+
   /*
    * Add a `TensorRef` value to the graph with the specific properties. A
    * `TensorRef` is a reference to a `vTensor` whose data is stored in an
```

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 2 additions & 2 deletions

```diff
@@ -36,8 +36,8 @@ void PrepackNode::encode(ComputeGraph* graph) {
   api::Context* const context = graph->context();
   api::PipelineBarrier pipeline_barrier{};
 
-  TensorRef tref = graph->get_val(tref_).toTensorRef();
-  vTensor packed = graph->get_val(packed_).toTensor();
+  TensorRef& tref = graph->get_val(tref_).toTensorRef();
+  vTensor& packed = graph->get_val(packed_).toTensor();
 
   size_t numel = api::utils::multiply_integers(tref.sizes);
   api::StorageBuffer staging(graph->context(), tref.dtype, numel);
```
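Binding `TensorRef&` and `vTensor&` here avoids copying graph-owned values on every encode; the same `const auto&` pattern appears in Sum.cpp and the tests below. A sketch of the rule of thumb, with a hypothetical `new_sizes`:

```cpp
// Bind references to values that live in the graph. A by-value copy is
// wasteful, and mutations applied to the copy may not be reflected in the
// graph's own Value:
vTensor& packed = graph->get_val(packed_).toTensor();
packed.virtual_resize(new_sizes);  // hypothetical resize, visible to later nodes
```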

backends/vulkan/runtime/graph/ops/impl/Pool.cpp

Lines changed: 14 additions & 33 deletions

```diff
@@ -28,38 +28,23 @@ void resize_max_pool2d_node(
   size_t ndim = self.sizes().size();
   std::vector<int64_t> new_out_sizes(ndim);
 
-  // Batch
+  // Batch, Channel
   if (ndim == 4) {
     new_out_sizes.at(ndim - 4) = self.sizes().at(ndim - 4);
   }
-  // Channel
   new_out_sizes.at(ndim - 3) = self.sizes().at(ndim - 3);
 
-  const auto kernel_size = reverse(*graph, extra_args[0]);
-  const auto stride = reverse(*graph, extra_args[1]);
-  const auto padding = reverse(*graph, extra_args[2]);
-  const auto dilation = reverse(*graph, extra_args[3]);
-  const bool ceil_mode = graph->get_val(extra_args[4]).toBool();
-
-  // Height
-  new_out_sizes.at(ndim - 2) = calc_out_size(
-      self.sizes().at(ndim - 2),
-      kernel_size.data[1],
-      stride.data[1],
-      padding.data[1],
-      dilation.data[1],
-      ceil_mode);
-  // Width
-  new_out_sizes.at(ndim - 1) = calc_out_size(
-      self.sizes().at(ndim - 1),
-      kernel_size.data[0],
-      stride.data[0],
-      padding.data[0],
-      dilation.data[0],
-      ceil_mode);
-
-  VK_CHECK_COND(new_out_sizes.at(ndim - 2) >= 1);
-  VK_CHECK_COND(new_out_sizes.at(ndim - 1) >= 1);
+  // Height, Width
+  const auto new_out_sizes_hw = calc_out_sizes_hw(
+      *graph,
+      self.sizes(),
+      extra_args[0],
+      extra_args[1],
+      extra_args[2],
+      extra_args[3],
+      extra_args[4]);
+  new_out_sizes.at(ndim - 2) = new_out_sizes_hw.at(0);
+  new_out_sizes.at(ndim - 1) = new_out_sizes_hw.at(1);
 
   out.virtual_resize(new_out_sizes);
   indices.virtual_resize(new_out_sizes);
@@ -96,12 +81,8 @@ void add_max_pool2d_node(
   kernel_name << "max_pool2d";
   apply_dtype_suffix(kernel_name, t_out);
 
-  KernelParams kernel_params{
-      reverse(graph, kernel_size),
-      reverse(graph, stride),
-      reverse(graph, padding),
-      reverse(graph, dilation),
-  };
+  KernelParams kernel_params =
+      create_kernel_params(graph, kernel_size, stride, padding, dilation);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
```
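This refactor routes pooling through the same `calc_out_sizes_hw` helper that the new convolution op uses, so both compute their H/W extents identically. For orientation, a comment-only reference of the index conventions visible in this diff:

```cpp
// Size vectors are NCHW, indexed from the back:
//   sizes.at(ndim - 1) -> W, sizes.at(ndim - 2) -> H,
//   sizes.at(ndim - 3) -> C, sizes.at(ndim - 4) -> N (4-D inputs only).
// Parameter lists arrive as {H, W}; after make_ivec2(..., /*reverse = */ true)
// the resulting ivec2 holds width in data[0] and height in data[1].
```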

backends/vulkan/runtime/graph/ops/impl/Staging.cpp

Lines changed: 2 additions & 3 deletions

```diff
@@ -63,9 +63,8 @@ ValueRef prepack(
     ComputeGraph& graph,
     const ValueRef vref,
     const api::GPUMemoryLayout layout) {
-  TensorRef& tref = graph.get_val(vref).toTensorRef();
-  ValueRef v = graph.add_tensor(tref.sizes, tref.dtype, layout);
-  vTensor t = graph.get_val(v).toTensor();
+  ValueRef v = graph.add_tensor_like(vref, layout);
+  vTensor& t = graph.get_val(v).toTensor();
 
   api::ShaderInfo shader = get_nchw_to_image_shader(t);
 
```

backends/vulkan/runtime/graph/ops/impl/Sum.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -120,7 +120,7 @@ void add_sum_dim_IntList(
   vTensor& in_tensor = graph.get_val(in).toTensor();
 
   std::set<int64_t> dims_set;
-  auto dims_to_sum = graph.get_val(opt_dim).toIntList();
+  const auto& dims_to_sum = graph.get_val(opt_dim).toIntList();
   int64_t in_dim = in_tensor.sizes().size();
 
   for (const auto& dim : dims_to_sum) {
```

backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp

Lines changed: 2 additions & 2 deletions

```diff
@@ -82,8 +82,8 @@ float get_val_or_inf(ComputeGraph& graph, const ValueRef& val, bool max) {
   return add_unary_op_node( \
       graph, \
       args[0], \
-      get_val_or_inf(graph, args[1], /*max =*/false), \
-      get_val_or_inf(graph, args[2], /*max =*/true), \
+      get_val_or_inf(graph, args[1], /*max = */ false), \
+      get_val_or_inf(graph, args[2], /*max = */ true), \
       args[3], \
       kClampShaderName); \
 }
```

backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp

Lines changed: 60 additions & 5 deletions

```diff
@@ -10,25 +10,80 @@
 
 namespace vkcompute {
 
+api::utils::ivec2 make_ivec2_from_list(ComputeGraph& graph, ValueRef vref) {
+  return api::utils::make_ivec2(
+      graph.get_val(vref).toIntList(), /*reverse = */ true);
+}
+
+KernelParams create_kernel_params(
+    ComputeGraph& graph,
+    const ValueRef kernel_size,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation) {
+  return {
+      make_ivec2_from_list(graph, kernel_size),
+      make_ivec2_from_list(graph, stride),
+      make_ivec2_from_list(graph, padding),
+      make_ivec2_from_list(graph, dilation),
+  };
+}
+
 int64_t calc_out_size(
     const int64_t in_size,
-    const int64_t kernel,
+    const int64_t kernel_size,
     const int64_t stride,
     const int64_t padding,
     const int64_t dilation,
     const bool ceil_mode) {
   int64_t c = ceil_mode ? stride - 1 : 0;
   int64_t out_size =
-      (in_size + 2 * padding - dilation * (kernel - 1) - 1 + c) / stride + 1;
+      (in_size + 2 * padding - dilation * (kernel_size - 1) - 1 + c) / stride +
+      1;
   if (ceil_mode && (out_size - 1) * stride >= in_size + padding) {
     --out_size;
   }
   return out_size;
 }
 
-api::utils::ivec2 reverse(ComputeGraph& graph, ValueRef vref) {
-  return api::utils::make_ivec2(
-      graph.get_val(vref).toIntList(), /*reverse=*/true);
+std::vector<int64_t> calc_out_sizes_hw(
+    ComputeGraph& graph,
+    const std::vector<int64_t>& in_sizes,
+    const ValueRef kernel_size,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef ceil_mode) {
+  const int64_t ndim = in_sizes.size();
+  std::vector<int64_t> out_sizes(2);
+
+  const auto kernel_vec = make_ivec2_from_list(graph, kernel_size);
+  const auto stride_vec = make_ivec2_from_list(graph, stride);
+  const auto padding_vec = make_ivec2_from_list(graph, padding);
+  const auto dilation_vec = make_ivec2_from_list(graph, dilation);
+  const bool ceil_mode_val = graph.get_val(ceil_mode).toBool();
+
+  // Height
+  out_sizes.at(0) = calc_out_size(
+      in_sizes.at(ndim - 2),
+      kernel_vec.data[1],
+      stride_vec.data[1],
+      padding_vec.data[1],
+      dilation_vec.data[1],
+      ceil_mode_val);
+  // Width
+  out_sizes.at(1) = calc_out_size(
+      in_sizes.at(ndim - 1),
+      kernel_vec.data[0],
+      stride_vec.data[0],
+      padding_vec.data[0],
+      dilation_vec.data[0],
+      ceil_mode_val);
+
+  VK_CHECK_COND(out_sizes.at(0) >= 1);
+  VK_CHECK_COND(out_sizes.at(1) >= 1);
+
+  return out_sizes;
 }
 
 } // namespace vkcompute
```
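`calc_out_size` implements the standard sliding-window output formula, out = floor((in + 2·padding − dilation·(kernel − 1) − 1) / stride) + 1, with an extra `stride − 1` slack term (and a correction check) when `ceil_mode` is set. A quick standalone sanity check of the floor-mode arithmetic, with illustrative values:

```cpp
#include <cassert>
#include <cstdint>

// Standalone copy of the floor-mode arithmetic from calc_out_size,
// for checking values outside the graph runtime (illustrative only).
int64_t out_size(int64_t in, int64_t k, int64_t s, int64_t p, int64_t d) {
  return (in + 2 * p - d * (k - 1) - 1) / s + 1;
}

int main() {
  assert(out_size(6, 3, 1, 0, 1) == 4);  // valid convolution: 6 -> 4
  assert(out_size(6, 3, 1, 1, 1) == 6);  // "same"-style padding: 6 -> 6
  assert(out_size(6, 3, 2, 0, 1) == 2);  // stride 2: floor(3 / 2) + 1 = 2
  assert(out_size(6, 3, 1, 0, 2) == 2);  // dilation 2 widens the window
  return 0;
}
```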

backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h

Lines changed: 15 additions & 9 deletions

```diff
@@ -23,14 +23,20 @@ struct KernelParams final {
   api::utils::ivec2 dilation;
 };
 
-int64_t calc_out_size(
-    const int64_t in_size,
-    const int64_t kernel_size,
-    const int64_t stride,
-    const int64_t padding,
-    const int64_t dilation,
-    const bool ceil_mode);
-
-api::utils::ivec2 reverse(ComputeGraph& graph, ValueRef vref);
+KernelParams create_kernel_params(
+    ComputeGraph& graph,
+    const ValueRef kernel_size,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation);
+
+std::vector<int64_t> calc_out_sizes_hw(
+    ComputeGraph& graph,
+    const std::vector<int64_t>& in_sizes,
+    const ValueRef kernel_size,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef ceil_mode);
 
 } // namespace vkcompute
```

backends/vulkan/test/vulkan_compute_api_test.cpp

Lines changed: 38 additions & 10 deletions

```diff
@@ -401,7 +401,7 @@ TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) {
   ComputeGraph graph(config);
 
   ValueRef idx = graph.add_scalar_list<int64_t>({1, 2, 3, 4});
-  std::vector<int64_t>& arr = graph.get_val(idx).toIntList();
+  const auto& arr = graph.get_val(idx).toIntList();
   EXPECT_TRUE(arr.size() == 4);
   for (int i = 0; i < 4; i++) {
     EXPECT_TRUE(arr[i] == i + 1);
@@ -417,7 +417,7 @@ TEST(VulkanComputeGraphTest, test_values_scalar_list_outside_constructed) {
     std::vector<double> data = {5.0, 4.0, 3.0, 2.0, 1.0};
     idx = graph.add_scalar_list(std::move(data));
   }
-  std::vector<double>& arr = graph.get_val(idx).toDoubleList();
+  const auto& arr = graph.get_val(idx).toDoubleList();
   EXPECT_TRUE(arr.size() == 5);
   for (int i = 0; i < 5; i++) {
     EXPECT_TRUE(arr[i] == (5 - i));
@@ -1044,11 +1044,39 @@ void test_mm(
 }
 
 TEST(VulkanComputeGraphOpsTest, mm_smoke_test) {
-#define RUN_TESTS(dtype, layout, prepack) \
-  test_mm(/*B=*/1, /*M=*/31, /*K=*/127, /*N=*/23, dtype, layout, prepack); \
-  test_mm(/*B=*/5, /*M=*/31, /*K=*/127, /*N=*/23, dtype, layout, prepack); \
-  test_mm(/*B=*/7, /*M=*/13, /*K=*/89, /*N=*/17, dtype, layout, prepack); \
-  test_mm(/*B=*/1, /*M=*/13, /*K=*/89, /*N=*/17, dtype, layout, prepack);
+#define RUN_TESTS(dtype, layout, prepack) \
+  test_mm(                                \
+      /*B = */ 1,                         \
+      /*M = */ 31,                        \
+      /*K = */ 127,                       \
+      /*N = */ 23,                        \
+      dtype,                              \
+      layout,                             \
+      prepack);                           \
+  test_mm(                                \
+      /*B = */ 5,                         \
+      /*M = */ 31,                        \
+      /*K = */ 127,                       \
+      /*N = */ 23,                        \
+      dtype,                              \
+      layout,                             \
+      prepack);                           \
+  test_mm(                                \
+      /*B = */ 7,                         \
+      /*M = */ 13,                        \
+      /*K = */ 89,                        \
+      /*N = */ 17,                        \
+      dtype,                              \
+      layout,                             \
+      prepack);                           \
+  test_mm(                                \
+      /*B = */ 1,                         \
+      /*M = */ 13,                        \
+      /*K = */ 89,                        \
+      /*N = */ 17,                        \
+      dtype,                              \
+      layout,                             \
+      prepack);
 
   CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS);
   CALL_TEST_FN_FOR_C_PACKED(RUN_TESTS);
@@ -1102,7 +1130,7 @@ void test_max_pool2d(
 
   // Run graph
 
-  fill_vtensor(graph, graph.inputs().at(0), base_val, /*iota=*/true);
+  fill_vtensor(graph, graph.inputs().at(0), base_val, /*iota = */ true);
 
   vTensor& t_in = graph.get_val(in_ioval.value).toTensor();
   std::vector<float> input_data(t_in.gpu_numel());
@@ -1140,7 +1168,7 @@
 TEST(VulkanComputeGraphOpsTest, max_pool2d_smoke_test) {
   std::vector<int64_t> kernel = {2, 3};
   test_max_pool2d(
-      /*in_size=*/{1, 4, 6},
-      /*base_val=*/10.0f,
+      /*in_size = */ {1, 4, 6},
+      /*base_val = */ 10.0f,
       kernel);
 }
```
