aten.select.int

yipjustin · facebook-github-bot · commit 48e05ed50f69 · 2024-04-12T20:11:38.000-07:00
Summary:
Port over the `select.int` shaders to ET.

1. Since tensor-shape reasoning happens ahead of time (AOT) in ET, we can simplify the C++ caller code considerably.
2. In this diff, we also try to use the same buffer object for passing arguments to all shaders. We are not worried about the perf cost, since the cost difference between passing an int and an ivec4 is very minor.

Differential Revision: D56082483
diff --git a/backends/vulkan/runtime/api/Tensor.h b/backends/vulkan/runtime/api/Tensor.h
@@ -255,6 +255,14 @@ class vTensor final {
     return sizes_;
   }
 
+  // Returns the entry of sizes_ at position `dim`.
+  // No bounds check is performed: `dim` must be a valid index into sizes_.
+  // The result is returned by value, so it carries no top-level const (a
+  // const-qualified by-value return is ignored by the compiler).
+  inline int64_t size(size_t dim) const {
+    return sizes_[dim];
+  }
+
+  // Returns the number of entries in sizes_ (the tensor's dimensionality).
+  // The container's unsigned size is converted explicitly to the signed
+  // return type instead of relying on an implicit narrowing conversion.
+  inline int64_t dim() const {
+    return static_cast<int64_t>(sizes_.size());
+  }
+
   inline const std::vector<int64_t>& strides() const {
     return strides_;
   }
diff --git a/backends/vulkan/runtime/graph/Logging.h b/backends/vulkan/runtime/graph/Logging.h
@@ -10,6 +10,7 @@
 
 #include <ostream>
 #include <vector>
+#include <executorch/backends/vulkan/runtime/api/Utils.h>
 
 namespace vkcompute {
 
@@ -23,4 +24,9 @@ inline std::ostream& operator<<(std::ostream& os, const std::vector<T>& vec) {
   return os; // Return the ostream to allow chaining
 }
 
+
+// Stream insertion for api::utils::uvec3 that is visible inside vkcompute.
+// It does no formatting of its own: the call is forwarded unchanged to the
+// overload declared in api::utils.
+inline std::ostream& operator<<(
+    std::ostream& os,
+    const api::utils::uvec3& v) {
+  return api::utils::operator<<(os, v);
+}
+
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl
@@ -0,0 +1,37 @@
+#version 450 core
+#define PRECISION ${PRECISION}
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
+  uvec4 data;
+}
+out_extents;
+
+// index to select
+layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
+  // data.x: index along batch dim to select
+  // data.y: number of batches
+  // data.z: number of texels per batch
+  // data.w: unused
+  ivec4 data;
+}
+select_info;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Copies the texels that belong to the selected batch into image_out.
+// Each invocation handles the output texel at its global invocation ID.
+void main() {
+  const int num_texel_per_batch = select_info.data.z;
+  const int index = select_info.data.x;
+
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // Skip invocations that fall outside the output image.
+  // NOTE(review): assumes out_extents holds the output image extents, as it
+  // is used in select_depth_3d - confirm against the C++ caller.
+  if (any(greaterThanEqual(pos, ivec3(out_extents.data.xyz)))) {
+    return;
+  }
+
+  // The selected batch starts (num_texel_per_batch * index) texels along z;
+  // pos.z is the offset inside that batch. Kept as int because texelFetch
+  // takes signed coordinates.
+  const int src_pos_z = (num_texel_per_batch * index) + pos.z;
+  imageStore(
+      image_out, pos, texelFetch(image_in, ivec3(pos.x, pos.y, src_pos_z), 0));
+}
+
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml
@@ -0,0 +1,12 @@
+# Codegen parameters for the select_batch_4d shader template.
+select_batch_4d:
+  parameter_names_with_default_values:
+    # DTYPE and NDIM are substituted into the image_out declaration of
+    # select_batch_4d.glsl (IMAGE_FORMAT[DTYPE] and IMAGE_T[NDIM][DTYPE]).
+    DTYPE: float
+    NDIM: 3
+  # Presumably one shader variant is generated per DTYPE value listed below,
+  # named with the given SUFFIX - confirm against the shader codegen script.
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
+  shader_variants:
+    - NAME: select_batch_4d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_depth_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_depth_3d.glsl
@@ -0,0 +1,33 @@
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
+  uvec4 data;
+}
+out_extents;
+
+// index to select
+layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
+  int data;
+}
+index;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Writes one component of the input texel at (x, y) into the x component of
+// the output texel at (x, y, 0). The component is chosen by index.data.
+void main() {
+  const ivec3 coord = ivec3(gl_GlobalInvocationID);
+
+  // Nothing to do for invocations outside the output extents in x/y.
+  if (!all(lessThan(coord.xy, out_extents.data.xy))) {
+    return;
+  }
+
+  // index.data addresses a single element along z: the quotient by 4 picks
+  // the texel, the remainder picks the component inside that texel.
+  const int texel_z = index.data / 4;
+  const int component = index.data % 4;
+  const float selected =
+      texelFetch(image_in, ivec3(coord.x, coord.y, texel_z), 0)[component];
+
+  imageStore(image_out, ivec3(coord.x, coord.y, 0), vec4(selected, 0, 0, 0));
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_depth_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_depth_3d.yaml
@@ -0,0 +1,12 @@
+# Codegen parameters for the select_depth_3d shader template.
+select_depth_3d:
+  parameter_names_with_default_values:
+    # DTYPE and NDIM are substituted into the image_out declaration of
+    # select_depth_3d.glsl (IMAGE_FORMAT[DTYPE] and IMAGE_T[NDIM][DTYPE]).
+    DTYPE: float
+    NDIM: 3
+  # Presumably one shader variant is generated per DTYPE value listed below,
+  # named with the given SUFFIX - confirm against the shader codegen script.
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
+  shader_variants:
+    - NAME: select_depth_3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_depth_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_depth_4d.glsl
@@ -0,0 +1,48 @@
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
+  uvec4 data;
+}
+out_extents;
+
+// index to select
+layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
+  // data.x: index along channel (depth) dim to select
+  // data.y: number of batches
+  // data.z: number of texels per batch
+  // data.w: unused
+  ivec4 data;
+}
+select_info;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Builds one output texel from up to four consecutive batches. Component k
+// of the output holds the selected element, taken from batch
+// (4 * pos.z + k) at the same (x, y) position.
+void main() {
+  const int num_batches = select_info.data.y;
+  const int num_texel_per_batch = select_info.data.z;
+  const int index = select_info.data.x;
+
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // Skip invocations that fall outside the output image.
+  // NOTE(review): assumes out_extents holds the output image extents, as it
+  // is used in select_depth_3d - confirm against the C++ caller.
+  if (any(greaterThanEqual(pos, ivec3(out_extents.data.xyz)))) {
+    return;
+  }
+
+  // Loop-invariant parts of the source address: the texel inside a batch
+  // that holds the selected element, and the component inside that texel.
+  const int texel_in_batch = index / 4;
+  const int src_component = index % 4;
+  const int first_batch = pos.z * 4;
+
+  // read in the same channel from 4 separate batches
+  vec4 out_texel = vec4(0, 0, 0, 0);
+  for (int k = 0; k < 4; k++) {
+    const int batch = first_batch + k;
+    // Fewer than 4 batches remain for this texel: leave the rest as zero.
+    if (batch >= num_batches) {
+      break;
+    }
+    const int src_pos_z = (batch * num_texel_per_batch) + texel_in_batch;
+    out_texel[k] =
+        texelFetch(image_in, ivec3(pos.x, pos.y, src_pos_z), 0)[src_component];
+  }
+
+  imageStore(image_out, pos, out_texel);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_depth_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_depth_4d.yaml
@@ -0,0 +1,12 @@
+# Codegen parameters for the select_depth_4d shader template.
+select_depth_4d:
+  parameter_names_with_default_values:
+    # DTYPE and NDIM are substituted into the image_out declaration of
+    # select_depth_4d.glsl (IMAGE_FORMAT[DTYPE] and IMAGE_T[NDIM][DTYPE]).
+    DTYPE: float
+    NDIM: 3
+  # Presumably one shader variant is generated per DTYPE value listed below,
+  # named with the given SUFFIX - confirm against the shader codegen script.
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
+  shader_variants:
+    - NAME: select_depth_4d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl
@@ -0,0 +1,45 @@
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
+  uvec4 data;
+}
+out_extents;
+
+// index to select
+layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
+  int data;
+}
+index;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Reads one input texel from the selected row (y = index.data) and spreads
+// its four components over four consecutive output rows, one value per
+// output texel (stored in the x component).
+void main() {
+  const ivec3 coord = ivec3(gl_GlobalInvocationID);
+
+  // Source coordinates: x follows the invocation (w), y is the selected
+  // row (h), and z comes from the invocation's y (c).
+  const vec4 src_texel =
+      texelFetch(image_in, ivec3(coord.x, index.data, coord.y), 0);
+
+  const int first_row = coord.y * 4;
+  for (int lane = 0; lane < 4; lane++) {
+    const int out_row = first_row + lane;
+
+    // Rows at or past out_extents.data.y do not exist in the output; every
+    // later lane would be past it too, so stop here.
+    if (out_row >= out_extents.data.y) {
+      break;
+    }
+
+    imageStore(
+        image_out, ivec3(coord.x, out_row, 0), vec4(src_texel[lane], 0, 0, 0));
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml
@@ -0,0 +1,12 @@
+# Codegen parameters for the select_height_3d shader template.
+select_height_3d:
+  parameter_names_with_default_values:
+    # DTYPE and NDIM are substituted into the image_out declaration of
+    # select_height_3d.glsl (IMAGE_FORMAT[DTYPE] and IMAGE_T[NDIM][DTYPE]).
+    DTYPE: float
+    NDIM: 3
+  # Presumably one shader variant is generated per DTYPE value listed below,
+  # named with the given SUFFIX - confirm against the shader codegen script.
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
+  shader_variants:
+    - NAME: select_height_3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl
@@ -0,0 +1,45 @@
+#version 450 core
+#define PRECISION ${PRECISION}
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
+  uvec4 data;
+}
+out_extents;
+
+// index to select
+layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
+  // data.x: index along height dim to select
+  // data.y: number of batches
+  // data.z: number of texels per batch
+  // data.w: unused
+  ivec4 data;
+}
+select_info;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Builds one output texel from up to four consecutive batches. Component k
+// of the output holds the element at row `index` of batch (4 * pos.z + k),
+// taken from the input texel/component addressed by pos.y.
+void main() {
+  const int num_batches = select_info.data.y;
+  const int num_texel_per_batch = select_info.data.z;
+  const int index = select_info.data.x;
+
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // Skip invocations that fall outside the output image.
+  // NOTE(review): assumes out_extents holds the output image extents, as it
+  // is used in select_depth_3d - confirm against the C++ caller.
+  if (any(greaterThanEqual(pos, ivec3(out_extents.data.xyz)))) {
+    return;
+  }
+
+  // Loop-invariant parts of the source address: pos.y addresses a single
+  // element, so its quotient by 4 is the texel inside a batch and its
+  // remainder is the component inside that texel.
+  const int texel_in_batch = pos.y / 4;
+  const int src_component = pos.y % 4;
+  const int first_batch = pos.z * 4;
+
+  vec4 out_texel = vec4(0, 0, 0, 0);
+  // read in the same channel from 4 separate batches
+  for (int k = 0; k < 4; k++) {
+    const int batch = first_batch + k;
+    // < 4 batches for this texel, exit early
+    if (batch >= num_batches) {
+      break;
+    }
+    const int src_pos_z = (batch * num_texel_per_batch) + texel_in_batch;
+    out_texel[k] =
+        texelFetch(image_in, ivec3(pos.x, index, src_pos_z), 0)[src_component];
+  }
+  imageStore(image_out, pos, out_texel);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml
@@ -0,0 +1,12 @@
+# Codegen parameters for the select_height_4d shader template.
+select_height_4d:
+  parameter_names_with_default_values:
+    # DTYPE and NDIM are substituted into the image_out declaration of
+    # select_height_4d.glsl (IMAGE_FORMAT[DTYPE] and IMAGE_T[NDIM][DTYPE]).
+    DTYPE: float
+    NDIM: 3
+  # Presumably one shader variant is generated per DTYPE value listed below,
+  # named with the given SUFFIX - confirm against the shader codegen script.
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
+  shader_variants:
+    - NAME: select_height_4d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl
@@ -0,0 +1,45 @@
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
+  uvec4 data;
+}
+out_extents;
+
+// index to select
+layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
+  int data;
+}
+index;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Reads one input texel from the selected column (x = index.data) and
+// spreads its four components over four consecutive output rows, one value
+// per output texel (stored in the x component).
+void main() {
+  const ivec3 coord = ivec3(gl_GlobalInvocationID);
+
+  // Source coordinates: x is the selected column (w), y comes from the
+  // invocation's x (h), and z comes from the invocation's y (c).
+  const vec4 src_texel =
+      texelFetch(image_in, ivec3(index.data, coord.x, coord.y), 0);
+
+  const int first_row = coord.y * 4;
+  for (int lane = 0; lane < 4; lane++) {
+    const int out_row = first_row + lane;
+
+    // Rows at or past out_extents.data.y do not exist in the output; every
+    // later lane would be past it too, so stop here.
+    if (out_row >= out_extents.data.y) {
+      break;
+    }
+
+    imageStore(
+        image_out, ivec3(coord.x, out_row, 0), vec4(src_texel[lane], 0, 0, 0));
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml
@@ -0,0 +1,12 @@
+# Codegen parameters for the select_width_3d shader template.
+select_width_3d:
+  parameter_names_with_default_values:
+    # DTYPE and NDIM are substituted into the image_out declaration of
+    # select_width_3d.glsl (IMAGE_FORMAT[DTYPE] and IMAGE_T[NDIM][DTYPE]).
+    DTYPE: float
+    NDIM: 3
+  # Presumably one shader variant is generated per DTYPE value listed below,
+  # named with the given SUFFIX - confirm against the shader codegen script.
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
+  shader_variants:
+    - NAME: select_width_3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml
diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py

Original file line number	Diff line number	Diff line change
`@@ -255,6 +255,14 @@ class vTensor final {`
`255`	`255`	`return sizes_;`
`256`	`256`	`}`
`257`	`257`
	`258`	`+ inline const int64_t size(size_t dim) const {`
	`259`	`+ return sizes_[dim];`
	`260`	`+ }`
	`261`	`+`
	`262`	`+ inline const int64_t dim() const {`
	`263`	`+ return sizes_.size();`
	`264`	`+ }`
	`265`	`+`
`258`	`266`	`inline const std::vector<int64_t>& strides() const {`
`259`	`267`	`return strides_;`
`260`	`268`	`}`