
Commit f5be29a

[ET-VK][Ops] aten.max_pool2d_with_indices
## The Operator

An `nn.Module` invocation of `torch.nn.MaxPool2d()` is represented as `aten.max_pool2d_with_indices.default` in the Edge Dialect, independent of `return_indices = True/False`.

```
# Return: (Tensor output, Tensor indices)
- func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
```

This differs from PT-VK, where `torch.nn.MaxPool2d()` was represented as `aten.max_pool2d.default`:

```
- func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
```

The difference is that we now return an additional tensor holding the max indices. Still, much of the core logic is taken from [`max_pool2d.glsl`](https://github.com/pytorch/pytorch/blob/cceabe873f11c6611f627a3bb0055994952ec6b8/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl) and [`Pool.cpp`](https://github.com/pytorch/pytorch/blob/cceabe873f11c6611f627a3bb0055994952ec6b8/aten/src/ATen/native/vulkan/ops/Pool.cpp). We provide only a `CHANNELS_PACKED` implementation.

## The Smoke Test

Given any input and kernel sizes, we fill the input tensor with increasing values, e.g.,

```
tensor([[[10., 11., 12., 13., 14., 15.],
         [16., 17., 18., 19., 20., 21.],
         [22., 23., 24., 25., 26., 27.],
         [28., 29., 30., 31., 32., 33.]]])
```

With this setup, the maximum of each pooling window is always its lower-right element. We use the kernel size to compute the size of the lower-right block and verify that

1. the output tensor values match the lower-right block values, and
2. the index tensor values match the lower-right block indices.

```
tensor([[[18., 19., 20., 21.],
         [24., 25., 26., 27.],
         [30., 31., 32., 33.]]])
tensor([[[ 8,  9, 10, 11],
         [14, 15, 16, 17],
         [20, 21, 22, 23]]])
```

An eager-mode PyTorch sketch reproducing this expectation is shown after the commit message.

Differential Revision: [D54961929](https://our.internmc.facebook.com/intern/diff/D54961929/)

ghstack-source-id: 219490378

Pull Request resolved: #2547
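A minimal eager-PyTorch sketch of the smoke-test expectation above, purely for reference. The `(2, 3)` kernel size and `stride=1` are assumptions chosen so that the 1x4x6 input reproduces the 3x4 output and index tensors shown; they are not read from the test code itself.

```
import torch

# Input matching the example above: values 10..33 over a 1x4x6 tensor.
x = torch.arange(10.0, 34.0).reshape(1, 4, 6)

# Assumed pooling config: a (2, 3) kernel with stride 1 yields the 3x4 result shown.
pool = torch.nn.MaxPool2d(kernel_size=(2, 3), stride=1, return_indices=True)
out, idx = pool(x)

print(out)  # 18..21 / 24..27 / 30..33 -- each window's lower-right value
print(idx)  # 8..11 / 14..17 / 20..23 -- flattened h * W + w positions within the channel
```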

8 files changed: +405, -3 lines

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 2 additions & 0 deletions
```
@@ -41,6 +41,8 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             exir_ops.edge.aten.relu.default,
             # Matrix multiplication operators
             exir_ops.edge.aten.mm.default,
+            # Pooling operators
+            exir_ops.edge.aten.max_pool2d_with_indices.default,
             # Other
             operator.getitem,
         ]
```
New file (85 additions & 0 deletions): the `max_pool2d` compute shader (GLSL template)

```
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}
#define FLT_MIN -3.402823466e+38

#include "indexing_utils.h"

layout(std430) buffer;

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
layout(set = 0, binding = 1, ${IMAGE_FORMAT["int"]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM]["int"]} image_idx;
layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in;

layout(set = 0, binding = 3) uniform PRECISION restrict OutExtents {
  uvec4 data;
}
out_extents;

layout(set = 0, binding = 4) uniform PRECISION restrict InExtents {
  uvec4 data;
}
in_extents;

layout(set = 0, binding = 5) uniform PRECISION restrict Params {
  ivec2 kernel;
  ivec2 stride;
  ivec2 padding;
  ivec2 dilation;
}
params;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  if (any(greaterThanEqual(pos, out_extents.data.xyz))) {
    return;
  }

  const ivec2 ipos = pos.xy * params.stride - params.padding;

  const ivec2 start = ipos;
  const ivec2 end = ipos + params.kernel * params.dilation;

  vec4 out_texel = vec4(FLT_MIN);
  ivec4 idx_texel = ivec4(0);

  for (int y = start.y; y < end.y; y += params.dilation.y) {
    for (int x = start.x; x < end.x; x += params.dilation.x) {
      if ((x >= 0 && x < in_extents.data.x) && (y >= 0 && y < in_extents.data.y)) {
        const vec4 cur_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0);

        const int cur_idx = x + int(in_extents.data.x) * y;
        if (cur_texel.x > out_texel.x) {
          idx_texel.x = cur_idx;
        }
        if (cur_texel.y > out_texel.y) {
          idx_texel.y = cur_idx;
        }
        if (cur_texel.z > out_texel.z) {
          idx_texel.z = cur_idx;
        }
        if (cur_texel.w > out_texel.w) {
          idx_texel.w = cur_idx;
        }
        out_texel = max(cur_texel, out_texel);
      }
      else {
        out_texel = max(vec4(FLT_MIN), out_texel);
      }
    }
  }

  imageStore(image_out, pos, out_texel);
  imageStore(image_idx, pos, idx_texel);
}
```
New file (18 additions & 0 deletions): `max_pool2d` shader codegen YAML

```
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

max_pool2d:
  parameter_names_with_default_values:
    NDIM: 3
    DTYPE: float
  generate_variant_forall:
    DTYPE:
      - VALUE: half
        SUFFIX: half
      - VALUE: float
        SUFFIX: float
  shader_variants:
    - NAME: max_pool2d
```
New file (139 additions & 0 deletions): C++ implementation registering `aten.max_pool2d_with_indices.default` with the Vulkan compute graph

```
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>

#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace at {
namespace native {
namespace vulkan {

void resize_max_pool2d_node(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& extra_args) {
  vTensor& out = graph->get_val(args[0].refs[0]).toTensor();
  vTensor& indices = graph->get_val(args[0].refs[1]).toTensor();
  vTensor& self = graph->get_val(args[1].refs[0]).toTensor();

  size_t ndim = self.sizes().size();
  std::vector<int64_t> new_out_sizes(ndim);

  // Batch
  if (ndim == 4) {
    new_out_sizes.at(ndim - 4) = self.sizes().at(ndim - 4);
  }
  // Channel
  new_out_sizes.at(ndim - 3) = self.sizes().at(ndim - 3);

  const auto kernel = normalize_wh(graph->get_val(extra_args[0]));
  const auto stride = normalize_wh(graph->get_val(extra_args[1]));
  const auto padding = normalize_wh(graph->get_val(extra_args[2]));
  const auto dilation = normalize_wh(graph->get_val(extra_args[3]));
  const bool ceil_mode = graph->get_val(extra_args[4]).toBool();

  // Height
  new_out_sizes.at(ndim - 2) = calc_out_size(
      self.sizes().at(ndim - 2),
      kernel.data[1],
      stride.data[1],
      padding.data[1],
      dilation.data[1],
      ceil_mode);
  // Width
  new_out_sizes.at(ndim - 1) = calc_out_size(
      self.sizes().at(ndim - 1),
      kernel.data[0],
      stride.data[0],
      padding.data[0],
      dilation.data[0],
      ceil_mode);

  VK_CHECK_COND(new_out_sizes.at(ndim - 2) >= 1);
  VK_CHECK_COND(new_out_sizes.at(ndim - 1) >= 1);

  out.virtual_resize(new_out_sizes);
  indices.virtual_resize(new_out_sizes);
}

void check_max_pool2d_args(const vTensor& in, const vTensor& out) {
  VK_CHECK_COND(
      check_memory_layout_is(in, api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED));
  VK_CHECK_COND(check_memory_layout_is(
      out, api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED));
}

void add_max_pool2d_node(
    ComputeGraph& graph,
    const ValueRef in,
    const ValueRef kernel,
    const ValueRef stride,
    const ValueRef padding,
    const ValueRef dilation,
    const ValueRef ceil_mode,
    const ValueRef out) {
  ValueRef arg = prepack_if_tensor_ref(graph, in);
  vTensor& t_in = graph.get_val(arg).toTensor();

  const auto& out_val = graph.get_val(out).toValueList();
  vTensor& t_out = graph.get_val(out_val[0]).toTensor();

  check_max_pool2d_args(t_in, t_out);

  api::utils::uvec3 global_size = t_out.virtual_extents();
  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

  std::stringstream kernel_name;
  kernel_name << "max_pool2d";
  apply_dtype_suffix(kernel_name, t_out);

  KernelParams kernel_params{
      normalize_wh(graph.get_val(kernel)),
      normalize_wh(graph.get_val(stride)),
      normalize_wh(graph.get_val(padding)),
      normalize_wh(graph.get_val(dilation)),
  };

  graph.execute_nodes().emplace_back(new ExecuteNode(
      graph,
      VK_KERNEL_FROM_STR(kernel_name.str()),
      global_size,
      local_size,
      // Inputs and Outputs
      {{{out_val[0], out_val[1]}, api::MemoryAccessType::WRITE},
       {arg, api::MemoryAccessType::READ}},
      // Shader params buffers
      {
          t_out.extents_ubo(),
          t_in.extents_ubo(),
          graph.create_params_buffer(kernel_params),
      },
      // Resizing
      resize_max_pool2d_node,
      {kernel, stride, padding, dilation, ceil_mode}));
}

void max_pool2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  return add_max_pool2d_node(
      graph, args[0], args[1], args[2], args[3], args[4], args[5], args[6]);
}

REGISTER_OPERATORS {
  VK_REGISTER_OP(aten.max_pool2d_with_indices.default, max_pool2d);
}

} // namespace vulkan
} // namespace native
} // namespace at
```
New file (57 additions & 0 deletions): shared pooling kernel utilities (`KernelParams`, `calc_out_size`, `normalize_wh`)

```
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#ifdef USE_VULKAN_API

#include <ATen/native/vulkan/api/api.h>

#include <executorch/backends/vulkan/runtime/graph/containers/Value.h>

namespace at {
namespace native {
namespace vulkan {

struct KernelParams final {
  api::utils::ivec2 kernel;
  api::utils::ivec2 stride;
  api::utils::ivec2 padding;
  api::utils::ivec2 dilation;
};

int64_t calc_out_size(
    const int64_t in_size,
    const int64_t kernel,
    const int64_t stride,
    const int64_t padding,
    const int64_t dilation,
    const bool ceil_mode) {
  int64_t c = ceil_mode ? stride - 1 : 0;
  int64_t out_size =
      (in_size + 2 * padding - dilation * (kernel - 1) - 1 + c) / stride + 1;
  if (ceil_mode && (out_size - 1) * stride >= in_size + padding) {
    --out_size;
  }
  return out_size;
}

api::utils::ivec2 normalize_wh(Value& v) {
  if (v.isInt()) {
    return api::utils::make_ivec2({v.toInt(), v.toInt()});
  } else {
    auto l = v.toIntList();
    return api::utils::make_ivec2({l.at(1), l.at(0)});
  }
}

} // namespace vulkan
} // namespace native
} // namespace at

#endif /* USE_VULKAN_API */
```
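As a quick sanity check of `calc_out_size` against the smoke-test shapes above (assuming the same hypothetical (2, 3) kernel, stride 1, zero padding, unit dilation, and `ceil_mode = false`): the height works out to (4 + 2*0 - 1*(2 - 1) - 1) / 1 + 1 = 3 and the width to (6 + 2*0 - 1*(3 - 1) - 1) / 1 + 1 = 4, matching the 3x4 output and index tensors in the commit message.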

backends/vulkan/test/utils/test_utils.cpp

Lines changed: 10 additions & 2 deletions
```
@@ -172,9 +172,17 @@ void fill_vtensor(vTensor& vten, std::vector<float>& data) {
   }
 }

-void fill_vtensor(ComputeGraph& graph, const IOValueRef idx, float val) {
+void fill_vtensor(
+    ComputeGraph& graph,
+    const IOValueRef idx,
+    float val,
+    bool iota) {
   std::vector<float> data(graph.get_val(idx.value).toTensor().gpu_numel());
-  std::fill(data.begin(), data.end(), val);
+  if (iota) {
+    std::iota(data.begin(), data.end(), val);
+  } else {
+    std::fill(data.begin(), data.end(), val);
+  }

   graph.copy_into_staging(idx.staging, data.data(), data.size());
 }
```

backends/vulkan/test/utils/test_utils.h

Lines changed: 5 additions & 1 deletion
```
@@ -118,7 +118,11 @@ inline void fill_vtensor(vTensor& vten, float val) {
   fill_vtensor(vten, vten_data);
 }

-void fill_vtensor(ComputeGraph& graph, const IOValueRef idx, float val);
+void fill_vtensor(
+    ComputeGraph& graph,
+    const IOValueRef idx,
+    float val,
+    bool iota = false);

 void extract_vtensor(vTensor& vten, std::vector<float>& data);
```
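A hypothetical call site for the new `iota` flag, matching the smoke test's increasing-value fill; the `graph` and `in_ioref` names are placeholders for a previously built `ComputeGraph` and its input `IOValueRef`, not identifiers taken from this diff:

```
// Hypothetical sketch: fill the pooling input with 10.0, 11.0, 12.0, ...
// so each window's maximum lands in its lower-right corner.
fill_vtensor(graph, in_ioref, /*val=*/10.0f, /*iota=*/true);
```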
