
Commit bf9a946

[ET-VK][7/n] Slice, with lots of codegen improvements
Pull Request resolved: #3171

1. Add the slice operation. Instead of using copy as in LI, we implement a simple shader with offsets.
2. Improvements in codegen:
   - Add support for optional variables.
   - Improve indentation of the generated code, for better readability.
   - Allow the user to specify tensor value generation; sequential values can be generated for easier debugging of index operations.
   - Improve the sample code's test-case specification, particularly for long and optional values.

ghstack-source-id: 223254861
Differential Revision: [D56295985](https://our.internmc.facebook.com/intern/diff/D56295985/)
1 parent fa433cb commit bf9a946
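For reference, the slice mapping that the new shaders implement is out[i] = in[start + i * step] along the sliced dimension. A minimal scalar sketch of that semantics (standalone C++; the names and values are illustrative, not part of this diff):

#include <cstdio>
#include <vector>

int main() {
  // slice(in, start=1, end=6, step=2) should select {11, 13, 15}
  const std::vector<int> in = {10, 11, 12, 13, 14, 15};
  const int start = 1, end = 6, step = 2;

  // Output length along the sliced dimension, using integer division.
  const int out_len = 1 + (end - start - 1) / step;

  for (int i = 0; i < out_len; ++i) {
    std::printf("%d ", in[start + i * step]); // prints: 11 13 15
  }
  return 0;
}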

11 files changed: +454 −18

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 11 additions & 0 deletions
@@ -10,6 +10,8 @@
 
 // @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName
 
+#include <optional>
+
 #include <executorch/backends/vulkan/runtime/api/api.h>
 
 #include <executorch/backends/vulkan/runtime/graph/GraphConfig.h>
@@ -184,6 +186,15 @@ class ComputeGraph final {
     VK_THROW("Cannot extract scalar from Value with type ", value.type());
   }
 
+  template <typename T>
+  std::optional<T> extract_optional_scalar(const ValueRef idx) {
+    if (val_is_none(idx)) {
+      return ::std::nullopt;
+    } else {
+      return extract_scalar<T>(idx);
+    }
+  }
+
   inline std::vector<std::unique_ptr<PrepackNode>>& prepack_nodes() {
     return prepack_nodes_;
   }
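The new helper returns std::nullopt for None values so that callers can fall back to defaults via value_or, as the slice op below does for its optional start/end arguments. A standalone sketch of that pattern, with the ValueRef plumbing elided (extract_optional here is a stand-in, not the graph API):

#include <cstdio>
#include <optional>

// Stand-in for extract_optional_scalar: None -> nullopt, else the scalar.
std::optional<long> extract_optional(bool is_none, long value) {
  if (is_none) {
    return std::nullopt;
  }
  return value;
}

int main() {
  const long dim_size = 8;
  const long start = extract_optional(true, 0).value_or(0);       // None -> 0
  const long end = extract_optional(false, 5).value_or(dim_size); // 5
  std::printf("start=%ld end=%ld\n", start, end); // start=0 end=5
  return 0;
}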

backends/vulkan/runtime/graph/Logging.h

Lines changed: 11 additions & 0 deletions
@@ -10,6 +10,7 @@
 
 #include <executorch/backends/vulkan/runtime/api/Utils.h>
 
+#include <optional>
 #include <ostream>
 #include <vector>
 
@@ -33,4 +34,14 @@ inline std::ostream& operator<<(std::ostream& os, const api::utils::uvec4& v) {
   return api::utils::operator<<(os, v);
 }
 
+template <typename T>
+inline std::ostream& operator<<(std::ostream& os, const std::optional<T>& opt) {
+  os << "[";
+  if (opt) {
+    os << opt.value();
+  }
+  os << "]";
+  return os;
+}
+
 } // namespace vkcompute
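A quick usage sketch of the printing convention added above ("[v]" when engaged, "[]" when empty); the operator is redefined here only to keep the example self-contained:

#include <iostream>
#include <optional>

// Same convention as the operator<< added to Logging.h above.
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::optional<T>& opt) {
  os << "[";
  if (opt) {
    os << opt.value();
  }
  os << "]";
  return os;
}

int main() {
  std::cout << std::optional<int>{3} << " " << std::optional<int>{} << "\n";
  // prints: [3] []
  return 0;
}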

backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h

Lines changed: 3 additions & 3 deletions
@@ -8,15 +8,15 @@
 
 #define divup4(x) ((x + 3) / 4)
 
-// Input: idx is a ivec4 user-level coordinate, sizes is the tensor shape
-// Output: buffer_idx in the continuous nchw-buffer.
+// Input: idx is an ivec4 user-level (w, h, c, n) coordinate, sizes is the
+// tensor shape. Output: buffer_idx in the continuous nchw-buffer.
 #define to_buffer_i(idx, sizes) \
   (idx.x + idx.y * sizes.x + idx.z * sizes.y * sizes.x + \
    idx.w * sizes.z * sizes.y * sizes.x)
 
 // Inverse of to_buffer_i
 // Input: buffer_idx in the continuous nchw-buffer, sizes is the tensor shape
-// Output: ivec4 user-level coorindate
+// Output: ivec4 user-level (w, h, c, n) coordinate
 #define from_buffer_i(buf_i, sizes) \
   ivec4( \
     buf_i % sizes.x, \
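The hunk ends before the remaining components of from_buffer_i, but the inverse follows the usual mod/div pattern. A CPU-side check of the macro arithmetic, assuming sizes = (W, H, C, N) and idx = (w, h, c, n) per the convention above:

#include <array>
#include <cstdio>

int main() {
  const std::array<int, 4> sizes = {4, 3, 2, 2}; // W, H, C, N
  const std::array<int, 4> idx = {1, 2, 1, 1};   // w, h, c, n

  // to_buffer_i: w + h*W + c*H*W + n*C*H*W
  const int buf_i = idx[0] + idx[1] * sizes[0] + idx[2] * sizes[1] * sizes[0] +
      idx[3] * sizes[2] * sizes[1] * sizes[0];

  // from_buffer_i: invert the flattening with mod/div
  const std::array<int, 4> back = {
      buf_i % sizes[0],
      (buf_i / sizes[0]) % sizes[1],
      (buf_i / (sizes[0] * sizes[1])) % sizes[2],
      buf_i / (sizes[0] * sizes[1] * sizes[2]),
  };

  std::printf("buf_i=%d back=(%d,%d,%d,%d)\n", buf_i, back[0], back[1],
              back[2], back[3]); // buf_i=45 back=(1,2,1,1)
  return 0;
}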
backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.glsl

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
+  uvec4 data;
+}
+out_sizes;
+
+layout(set = 0, binding = 3) uniform PRECISION restrict SliceArg {
+  int dim;
+  int offset;
+  int step;
+  // Used when dim=batch. Stride is the # of planes for each batch value.
+  int stride;
+}
+slice_arg;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
+
+  const ivec4 idx = to_tensor_idx_C_packed(out_pos, out_sizes.data);
+
+  if (any(greaterThanEqual(idx, out_sizes.data))) {
+    return;
+  }
+
+  ivec3 in_pos = out_pos;
+
+  int index = out_pos[slice_arg.dim] / slice_arg.stride;
+  int within_stride = out_pos[slice_arg.dim] % slice_arg.stride;
+
+  in_pos[slice_arg.dim] = slice_arg.offset * slice_arg.stride +
+      index * slice_arg.step * slice_arg.stride + within_stride;
+
+  imageStore(image_out, out_pos, texelFetch(image_in, in_pos, 0));
+}
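When slicing the batch dimension, each batch value occupies `stride` z-planes (ceil(C / 4) due to channel packing), so the shader splits the output z-coordinate into a batch index and a plane-within-batch before applying the offset/step. A CPU-side sketch of the same arithmetic, with made-up sizes:

#include <cstdio>

int main() {
  const int offset = 1;  // slice start along the batch dim
  const int step = 2;    // slice step
  const int stride = 3;  // ceil(C / 4): z-planes per batch value

  // Map each output z-plane to the input z-plane it is copied from,
  // mirroring: in_z = offset*stride + index*step*stride + within_stride.
  for (int out_z = 0; out_z < 6; ++out_z) {
    const int index = out_z / stride;          // output batch index
    const int within_stride = out_z % stride;  // plane within the batch
    const int in_z =
        offset * stride + index * step * stride + within_stride;
    std::printf("out_z=%d -> in_z=%d\n", out_z, in_z);
    // out_z 0..2 read input batch 1 (planes 3..5),
    // out_z 3..5 read input batch 3 (planes 9..11).
  }
  return 0;
}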
backends/vulkan/runtime/graph/ops/glsl/slice_batch_height_width.yaml

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
1+
slice_batch_height_width:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
generate_variant_forall:
6+
DTYPE:
7+
- VALUE: half
8+
- VALUE: float
9+
shader_variants:
10+
- NAME: slice_batch_height_width
backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+
+#define to_tensor_idx to_tensor_idx_${PACKING}
+#define to_texture_pos_elem to_texture_pos_elem_${PACKING}
+#define get_packed_stride get_packed_stride_${PACKING}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
+  uvec4 data;
+}
+out_sizes;
+
+layout(set = 0, binding = 3) uniform PRECISION restrict OutCpuSizes {
+  uvec4 out_cpu_sizes;
+};
+
+layout(set = 0, binding = 4) uniform PRECISION restrict InGpuSizes {
+  uvec4 in_gpu_sizes;
+};
+
+layout(set = 0, binding = 5) uniform PRECISION restrict SliceArg {
+  int offset;
+  int step;
+}
+slice_arg;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
+
+  const ivec4 idx = to_tensor_idx_C_packed(out_pos, out_sizes.data);
+
+  if (any(greaterThanEqual(idx, out_sizes.data))) {
+    return;
+  }
+
+  // We map the output pos using the buffer index. For each index in the
+  // texel, we calculate the source whcn-coordinate amended with the
+  // offset-adjusted channel value. Then we calculate the actual texture
+  // position from the whcn-coordinate.
+  const uint base_index = to_buffer_i(idx, out_cpu_sizes);
+  uvec4 buf_indices =
+      base_index + ivec4(0, 1, 2, 3) * get_packed_stride(out_cpu_sizes);
+
+  vec4 outex;
+  for (int i = 0; i < 4; i++) {
+    ivec4 user_coor = from_buffer_i(buf_indices[i], out_cpu_sizes);
+
+    int in_channel = user_coor.z;
+
+    ivec4 in_user_coor = user_coor;
+    in_user_coor.z = slice_arg.offset + in_channel * slice_arg.step;
+
+    ivec4 in_pow_elem = to_texture_pos_elem_C_packed(
+        in_user_coor,
+        in_gpu_sizes);
+
+    vec4 v = texelFetch(image_in, in_pow_elem.xyz, 0);
+
+    outex[i] = v[in_pow_elem.w];
+  }
+  imageStore(image_out, out_pos, outex);
+}
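Because channels are packed four per texel, neighboring output channels may come from different input texels, which is why the loop above resolves a per-element source coordinate instead of copying whole texels. A CPU-side sketch of the channel mapping with illustrative values:

#include <cstdio>

int main() {
  const int offset = 2;       // slice start along the channel dim
  const int step = 3;         // slice step
  const int out_channels = 4; // one output texel's worth of channels

  for (int c = 0; c < out_channels; ++c) {
    const int in_c = offset + c * step; // mirrors in_user_coor.z above
    // Input texel plane and lane within the texel (C_packed layout).
    std::printf("out channel %d -> in channel %d (texel z=%d, lane %d)\n",
                c, in_c, in_c / 4, in_c % 4);
  }
  return 0;
}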
backends/vulkan/runtime/graph/ops/glsl/slice_channel.yaml

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
1+
slice_channel:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
generate_variant_forall:
6+
DTYPE:
7+
- VALUE: float
8+
PACKING:
9+
- VALUE: C_packed
10+
shader_variants:
11+
- NAME: slice_channel
backends/vulkan/runtime/graph/ops/impl/Slice.cpp

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/Logging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+void add_slice_tensor_out_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    ValueRef dim_ref,
+    ValueRef opt_start_ref,
+    ValueRef opt_end_ref,
+    ValueRef step_ref,
+    ValueRef out) {
+  vTensorPtr t_in = graph.get_tensor(in);
+  vTensorPtr t_out = graph.get_tensor(out);
+
+  VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
+  VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));
+
+  // Need to normalize the dim
+  int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
+
+  VK_CHECK_COND(
+      -t_in->dim() <= dim && dim < t_in->dim(),
+      "dim must be in range of [-self.dim(), self.dim()), but current dim's value is ",
+      dim,
+      " and self.dim() = ",
+      t_in->dim());
+
+  dim = normalize(dim, t_in->dim());
+
+  // Create a dim value as if the underlying tensor were 4-dimensional.
+  int64_t nchw_dim = dim + (4 - t_in->dim());
+
+  std::optional<int64_t> opt_start =
+      graph.extract_optional_scalar<int64_t>(opt_start_ref);
+  std::optional<int64_t> opt_end =
+      graph.extract_optional_scalar<int64_t>(opt_end_ref);
+  int64_t step = graph.extract_scalar<int64_t>(step_ref);
+
+  const auto in_sizes = t_in->sizes();
+  const auto out_sizes = t_out->sizes();
+
+  int64_t start = opt_start.value_or(0);
+  int64_t end = opt_end.value_or(in_sizes[dim]);
+
+  VK_CHECK_COND((0 <= start) && (start < in_sizes[dim]));
+  VK_CHECK_COND((0 <= end) && (end <= in_sizes[dim]));
+
+  if (nchw_dim == 1) {
+    // slice by channel
+    std::string kernel_name = "slice_channel";
+    kernel_name.reserve(kShaderNameReserve);
+    add_dtype_suffix(kernel_name, *t_out);
+    add_memory_layout_suffix(kernel_name, *t_out);
+
+    api::utils::uvec3 global_size = t_out->extents();
+    api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+    const struct Block final {
+      int offset;
+      int step;
+    } params{
+        static_cast<int32_t>(start),
+        static_cast<int32_t>(step),
+    };
+
+    graph.execute_nodes().emplace_back(new ExecuteNode(
+        graph,
+        VK_KERNEL_FROM_STR(kernel_name),
+        global_size,
+        local_size,
+        {{out, api::MemoryAccessType::WRITE},
+         {in, api::MemoryAccessType::READ}},
+        {t_out->gpu_sizes_ubo(),
+         t_out->cpu_sizes_ubo(),
+         t_in->gpu_sizes_ubo(),
+         graph.create_params_buffer(params)}));
+
+  } else {
+    // GPU coordinates are in x, y, z
+    int64_t gpu_dim = -1;
+    int64_t stride = 1;
+    if (nchw_dim == 3) {
+      gpu_dim = 0; // width: x dimension in gpu
+      VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
+    } else if (nchw_dim == 2) {
+      gpu_dim = 1; // height: y dimension
+      VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
+    } else if (nchw_dim == 0) {
+      gpu_dim = 2; // batch: z dimension
+
+      // Due to channel packing, each batch value spans `stride` planes
+      int64_t n_channels = dim_at<Dim4D::Channel>(in_sizes);
+      stride = api::utils::div_up<int64_t>(n_channels, 4ll);
+    } else {
+      VK_THROW("Unexpected nchw_dim!");
+    }
+
+    std::string kernel_name = "slice_batch_height_width";
+    kernel_name.reserve(kShaderNameReserve);
+    add_dtype_suffix(kernel_name, *t_out);
+
+    api::utils::uvec3 global_size = t_out->extents();
+    api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+    const struct Block final {
+      int dim;
+      int offset;
+      int step;
+      int stride;
+    } params{
+        static_cast<int32_t>(gpu_dim),
+        static_cast<int32_t>(start),
+        static_cast<int32_t>(step),
+        static_cast<int32_t>(stride),
+    };
+
+    graph.execute_nodes().emplace_back(new ExecuteNode(
+        graph,
+        VK_KERNEL_FROM_STR(kernel_name),
+        global_size,
+        local_size,
+        {{out, api::MemoryAccessType::WRITE},
+         {in, api::MemoryAccessType::READ}},
+        {t_out->gpu_sizes_ubo(), graph.create_params_buffer(params)}));
+  }
+}
+
+void slice_tensor_out(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  return add_slice_tensor_out_node(
+      graph,
+      args[0],
+      args[1], // dim
+      args[2], // optional start
+      args[3], // optional end
+      args[4], // step
+      args[5]);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_out);
+}
+
+} // namespace vkcompute
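A standalone check of the dim bookkeeping above: negative dims are normalized into [0, ndim), then shifted into the 4-dimensional NCHW frame. This sketch assumes normalize() wraps negative dims in the usual PyTorch way; it is not the graph API:

#include <cassert>

int main() {
  const long ndim = 3; // e.g. a CHW tensor
  long dim = -2;       // user-specified dim

  // normalize(dim, ndim): wrap negatives into [0, ndim)
  dim = (dim % ndim + ndim) % ndim;
  assert(dim == 1); // H

  // Shift into the 4-d NCHW frame: CHW dim 1 (H) -> nchw_dim 2,
  // which selects the slice_batch_height_width (height) branch above.
  const long nchw_dim = dim + (4 - ndim);
  assert(nchw_dim == 2);
  return 0;
}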
