Support int8 texture tensors without requiring int8 buffers

SS-JIA · facebook-github-bot · commit ad47a9a6c687 · 2024-07-31T13:10:58.000-07:00
Summary: ## Context By default, storage buffers in Vulkan must contain 32 bit data types; using 8 bit and 16 bit data types in buffers can be enabled optionally by supporting the [VK_KHR_8bit_storage](https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VK_KHR_8bit_storage.html) extension or the [VK_KHR_16bit_storage](https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VK_KHR_16bit_storage.html) extension respectively. Previously, 8-bit and 16-bit tensors were enabled by using those extensions; however, this meant that 8-bit and 16-bit tensors could not be used if the Vulkan driver does not support the corresponding extension. This diff adds support for 8-bit texture-backed tensors without the need for the VK_KHR_8bit_storage extension. This is done by introducing shaders that manually pack four 8-bit integers into a single int32 value and unpack them again. Once the tensor data has been transferred to an image texture (which will use the `VK_FORMAT_R8G8B8A8_SINT` image format), the extension will no longer be required. Differential Revision: D60536832
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -319,24 +319,20 @@ utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) {
   return image_extents_of(idx);
 }
 
-utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
+utils::uvec3 ComputeGraph::create_local_wg_size(
+    const utils::uvec3 global_wg_size) {
   if (config_.enable_local_wg_size_override) {
     return config_.local_wg_size_override;
   }
 
-  if (is_buffer_storage(idx)) {
-    return {64u, 1u, 1u};
-  }
-
-  const utils::uvec3 image_extents = image_extents_of(idx);
   utils::uvec3 local_group_size = {4, 4, 4};
 
-  if (image_extents.data[2u] == 1) {
-    if (image_extents.data[1u] == 1) {
+  if (global_wg_size.data[2u] == 1) {
+    if (global_wg_size.data[1u] == 1) {
       local_group_size.data[0u] = 64;
       local_group_size.data[1u] = 1;
       local_group_size.data[2u] = 1;
-    } else if (image_extents.data[1u] < 8) {
+    } else if (global_wg_size.data[1u] < 8) {
       local_group_size.data[0u] = 16;
       local_group_size.data[1u] = 4;
       local_group_size.data[2u] = 1;
@@ -349,6 +345,10 @@ utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
   return local_group_size;
 }
 
+/*
+ * Convenience overload: suggest a local workgroup size for the tensor value
+ * `idx`.
+ *
+ * The suggestion is derived from the global workgroup size reported by
+ * `create_global_wg_size(idx)` rather than directly from the image extents, so
+ * the local size is always shaped for the same global size that would be
+ * dispatched for this value.
+ */
+utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
+  return create_local_wg_size(create_global_wg_size(idx));
+}
+
 void ComputeGraph::copy_into_staging(
     const ValueRef idx,
     const void* data,
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -180,7 +180,9 @@ class ComputeGraph final {
     return values_.at(idx).type();
   }
 
-  // Get Tensor Property
+  //
+  // Get Tensor Properties
+  //
 
   std::vector<int64_t> sizes_of(const ValueRef idx) const;
 
@@ -226,7 +228,9 @@ class ComputeGraph final {
     return values_.at(idx).toTensor().ntexels_ubo();
   }
 
+  //
   // Scalar Value Extraction
+  //
 
   template <typename T>
   T extract_scalar(const ValueRef idx) {
@@ -459,16 +463,21 @@ class ComputeGraph final {
   utils::uvec3 create_global_wg_size(const ValueRef idx);
 
   /*
-   * Suggest a local workgroup size for a given `api::vTensor` value, assuming
-   * that every shader invocation calculates one texel element of the output
-   * tensor.
+   * Suggest a local workgroup size for a given global workgroup size.
    *
    * The local workgroup size will be formed to try and minimize the number of
    * inactive invocations.
    *
    * Currently, the local workgroup size is hard-coded to contain a total of 64
    * shader invocations. In the future, this value can be configured.
    */
+  utils::uvec3 create_local_wg_size(const utils::uvec3 global_wg_size);
+
+  /*
+   * Convenience function to suggest a local workgroup size for a given
+   * `api::vTensor` value, assuming that every shader invocation calculates one
+   * texel element of the output tensor.
+   */
   utils::uvec3 create_local_wg_size(const ValueRef idx);
 
   //
@@ -500,6 +509,17 @@ class ComputeGraph final {
   void resize_input(const int64_t idx, const std::vector<int64_t>& new_sizes);
   void propagate_resize();
 
+  //
+  // Miscellaneous Utilities
+  //
+
+  /*
+   * Check whether the GPU supports 8 bit buffers.
+   *
+   * Returns the value reported by `has_full_int8_buffers_support()` on the
+   * adapter that belongs to this graph's context. The graph itself does not
+   * cache the answer; every call forwards to the adapter.
+   */
+  inline bool int8_buffers_enabled() const {
+    return context_->adapter_ptr()->has_full_int8_buffers_support();
+  }
+
   //
   // Debug support (implemented in Logging.cpp)
   //
diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
@@ -80,7 +80,7 @@ ivec4 from_nchw_buffer_i(int buf_i, ivec4 sizes) {
  * Returns: The (x, y, z, n) texel position corresponding to the first element
  *          of the texel at the specified buffer index
  */
-ivec4 to_texel_pos(int buf_i, ivec4 strides, int packed_dim) {
+ivec4 to_tensor_idx(int buf_i, ivec4 strides, int packed_dim) {
   ivec4 idx;
   for (int i = 3; i >= 0; i--) {
     if (i != packed_dim) {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8_tensor_to_nchw_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/int8_tensor_to_nchw_noint8.glsl
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#include "indexing_utils.h"
+
+layout(std430) buffer;
+
+#extension GL_EXT_control_flow_attributes : require
+
+${layout_declare_tensor(0, "r", "t_in", "int8", "texture3d")}
+${layout_declare_buffer(1, "w", "nchw_out", "int")}
+${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int packed_dim = C_DIM;
+
+/*
+ * Each invocation packs four consecutive int8 elements of the tensor, taken in
+ * the flat buffer order understood by `from_nchw_buffer_i`, into one 32-bit
+ * word of `nchw_out`.
+ */
+void main() {
+  const int out_buf_idx = int(gl_GlobalInvocationID.x);
+  int in_buf_idx = 4 * out_buf_idx;
+
+  // A dispatch always launches whole workgroups, so invocations beyond the
+  // last packed word can exist. Stop before touching `nchw_out` when not even
+  // the first element of this word belongs to the tensor.
+  // NOTE(review): assumes every component of tensor_sizes is >= 1 (unused
+  // dimensions padded with 1) so that the product is the element count.
+  const int numel =
+      tensor_sizes.x * tensor_sizes.y * tensor_sizes.z * tensor_sizes.w;
+  if (in_buf_idx >= numel) {
+    return;
+  }
+
+  // Lanes past the end of the tensor (only possible in the final word) keep
+  // the value 0 instead of being read from an out-of-range texel.
+  ivec4 values = ivec4(0);
+
+  [[unroll]] for (int i = 0; i < 4; ++i) {
+    if (in_buf_idx < numel) {
+      const ivec4 tensor_idx = from_nchw_buffer_i(in_buf_idx, tensor_sizes);
+      const ivec4 texture_pos = to_texture_elem_pos(
+          tensor_idx, tensor_sizes, packed_dim);
+      values[i] = load_texel(t_in, texture_pos.xyz)[texture_pos.w];
+    }
+    in_buf_idx++;
+  }
+
+  // Manually pack 4x 8-bit integers into a 32 bit integer. Note that little
+  // endian is assumed, since most processors use little endian. Thus the
+  // "later" values are placed in most significant bytes.
+  int packed = ((values[3] & 0xFF) << 24)
+             | ((values[2] & 0xFF) << 16)
+             | ((values[1] & 0xFF) << 8)
+             | ((values[0] & 0xFF));
+
+  nchw_out[out_buf_idx] = packed;
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_tensor_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_tensor_noint8.glsl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#include "indexing_utils.h"
+
+layout(std430) buffer;
+
+#extension GL_EXT_control_flow_attributes : require
+
+${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")}
+${layout_declare_buffer(1, "r", "nchw_in", "int")}
+${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int packed_dim = C_DIM;
+
+/*
+ * Sign-extends an 8-bit value stored in the low byte of `x`.
+ *
+ * Inputs in the range 128..255 (bit 7 set, nothing above it) have their upper
+ * 24 bits filled with ones, which turns them into the matching negative
+ * 32-bit value. Every other input is returned unchanged.
+ */
+int extend_sign(int x) {
+  return (x >> 7 == 1) ? (x | 0xFFFFFF00) : x;
+}
+
+/*
+ * Assembles the int8 texel that starts at `tensor_idx` from `nchw_in`, a
+ * buffer of 32-bit words that each carry four packed int8 values.
+ *
+ * Lanes that would fall past the end of the packed dimension are left at 0.
+ */
+ivec4 read_texel(ivec4 tensor_idx) {
+  const ivec4 buf_indices = get_texel_nchw_buffer_ixs(
+      tensor_idx, tensor_sizes, packed_dim);
+
+  ivec4 out_tex = ivec4(0);
+
+  [[unroll]] for (int i = 0; i < 4; ++i) {
+    if (tensor_idx[packed_dim] + i < tensor_sizes[packed_dim]) {
+      // Little endian is assumed, as most processors use little endian, so
+      // value k of a word occupies bits [8k, 8k + 8): the "latter" packed
+      // values correspond to the most significant bytes.
+      const int word = nchw_in[buf_indices[i] / 4];
+      const int bit_offset = 8 * (buf_indices[i] % 4);
+
+      // Move the wanted byte into the low 8 bits, clear everything above it,
+      // then restore the sign of the int8 value.
+      const int low_byte = (word >> bit_offset) & 0xFF;
+      out_tex[i] = extend_sign(low_byte);
+    }
+  }
+
+  return out_tex;
+}
+
+/*
+ * Each invocation fills one texel of `t_out`. Invocations whose tensor index
+ * lies outside `tensor_sizes` in any dimension write nothing.
+ */
+void main() {
+  const ivec3 texel_pos = ivec3(gl_GlobalInvocationID);
+  const ivec4 idx = to_tensor_idx(texel_pos, tensor_sizes, packed_dim);
+
+  if (all(lessThan(idx, tensor_sizes))) {
+    write_texel(t_out, texel_pos, read_texel(idx));
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_tensor.glsl
@@ -62,7 +62,7 @@ void main() {
     return;
   }
 
-  ivec4 tensor_idx = to_texel_pos(t_id, gpu_strides, packed_dim);
+  ivec4 tensor_idx = to_tensor_idx(t_id, gpu_strides, packed_dim);
   tensor_idx[packed_dim] *= 4;
   t_out[t_id] = read_texel(tensor_idx);
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl
@@ -53,7 +53,7 @@ void main() {
     return;
   }
 
-  const ivec4 out_pos = to_texel_pos(t_id, out_strides, 0);
+  const ivec4 out_pos = to_tensor_idx(t_id, out_strides, 0);
 
   VEC4_T outtex = q_8w_linear(out_pos, mat1_sizes.x);
   write_texel(t_out, t_id, outtex);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/tensor_to_nchw.glsl
@@ -61,7 +61,7 @@ void main() {
   }
 
   const VEC4_T intex = t_in[t_id];
-  ivec4 tensor_idx = to_texel_pos(t_id, gpu_strides, packed_dim);
+  ivec4 tensor_idx = to_tensor_idx(t_id, gpu_strides, packed_dim);
   tensor_idx[packed_dim] *= 4;
   write_out_texel(intex, tensor_idx);
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -24,6 +24,12 @@ void add_staging_to_tensor_node(
   vkapi::ShaderInfo shader =
       get_nchw_to_tensor_shader(*graph.get_tensor(out_tensor));
 
+  if (graph.dtype_of(out_tensor) == vkapi::kChar &&
+      graph.storage_type_of(out_tensor) == utils::kTexture3D &&
+      !graph.int8_buffers_enabled()) {
+    shader = VK_KERNEL(nchw_to_int8_tensor_noint8);
+  }
+
   vkapi::ParamsBindList ubos({graph.sizes_ubo(out_tensor)});
   if (graph.is_buffer_storage(out_tensor)) {
     ubos.append({
@@ -58,6 +64,15 @@ void add_tensor_to_staging_node(
   vkapi::ShaderInfo shader =
       get_tensor_to_nchw_shader(*graph.get_tensor(in_tensor));
 
+  utils::uvec3 global_wg_size = graph.create_global_wg_size(in_tensor);
+
+  if (graph.dtype_of(in_tensor) == vkapi::kChar &&
+      !graph.is_buffer_storage(in_tensor) && !graph.int8_buffers_enabled()) {
+    shader = VK_KERNEL(int8_tensor_to_nchw_noint8);
+    uint32_t buffer_len = graph.get_staging(out_staging)->numel() / 4;
+    global_wg_size = {buffer_len, 1, 1};
+  }
+
   vkapi::ParamsBindList ubos({graph.sizes_ubo(in_tensor)});
   if (graph.is_buffer_storage(in_tensor)) {
     ubos.append({
@@ -69,8 +84,8 @@ void add_tensor_to_staging_node(
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       shader,
-      graph.create_global_wg_size(in_tensor),
-      graph.create_local_wg_size(in_tensor),
+      global_wg_size,
+      graph.create_local_wg_size(global_wg_size),
       // Input and Outputs
       {{in_tensor, vkapi::MemoryAccessType::READ},
        {out_staging, vkapi::MemoryAccessType::WRITE}},
diff --git a/backends/vulkan/test/glsl/all_shaders.yaml b/backends/vulkan/test/glsl/all_shaders.yaml
@@ -47,21 +47,12 @@ idx_fill_buffer:
 idx_fill_texture:
   parameter_names_with_default_values:
     DTYPE: float
-    NDIM: 3
-    PACKING: CHANNELS_PACKED
   generate_variant_forall:
-    PACKING:
-      - VALUE: "CHANNELS_PACKED"
-        SUFFIX: "C_packed"
-      - VALUE: "WIDTH_PACKED"
-        SUFFIX: "W_packed"
-      - VALUE: "HEIGHT_PACKED"
-        SUFFIX: "H_packed"
     DTYPE:
-      - VALUE: "half"
-        SUFFIX: "half"
-      - VALUE: "float"
-        SUFFIX: "float"
+      - VALUE: half
+      - VALUE: float
+      - VALUE: int
+      - VALUE: int8
   shader_variants:
     - NAME: idx_fill_texture
 
diff --git a/backends/vulkan/test/glsl/idx_fill_texture.glsl b/backends/vulkan/test/glsl/idx_fill_texture.glsl
@@ -12,21 +12,17 @@
 
 #define VEC4_T ${texel_type(DTYPE)}
 
-#define POS ${get_pos[NDIM]("pos")}
-
 #include "indexing_utils.h"
 
 layout(std430) buffer;
 
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-
-layout(set = 0, binding = 1) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
+${layout_declare_tensor(0, "w", "image_out", DTYPE, "texture3d")}
+${layout_declare_ubo(1, "ivec4", "sizes")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 layout(constant_id = 3) const int packed_dim = C_DIM;
+layout(constant_id = 4) const int offset = 10;
 
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
@@ -37,6 +33,6 @@ void main() {
   }
 
   const ivec4 buf_indices = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim);
-  VEC4_T texel = VEC4_T(buf_indices);
-  imageStore(image_out, POS, texel);
+  VEC4_T texel = VEC4_T(buf_indices) + offset;
+  imageStore(image_out, pos, texel);
 }
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp

Original file line number	Diff line number	Diff line change
`@@ -62,7 +62,7 @@ void main() {`
`62`	`62`	`return;`
`63`	`63`	`}`
`64`	`64`
`65`		`- ivec4 tensor_idx = to_texel_pos(t_id, gpu_strides, packed_dim);`
	`65`	`+ ivec4 tensor_idx = to_tensor_idx(t_id, gpu_strides, packed_dim);`
`66`	`66`	`tensor_idx[packed_dim] *= 4;`
`67`	`67`	`t_out[t_id] = read_texel(tensor_idx);`
`68`	`68`	`}`
Original file line number	Diff line number	Diff line change
`@@ -53,7 +53,7 @@ void main() {`
`53`	`53`	`return;`
`54`	`54`	`}`
`55`	`55`
`56`		`- const ivec4 out_pos = to_texel_pos(t_id, out_strides, 0);`
	`56`	`+ const ivec4 out_pos = to_tensor_idx(t_id, out_strides, 0);`
`57`	`57`
`58`	`58`	`VEC4_T outtex = q_8w_linear(out_pos, mat1_sizes.x);`
`59`	`59`	`write_texel(t_out, t_id, outtex);`
Original file line number	Diff line number	Diff line change
`@@ -61,7 +61,7 @@ void main() {`
`61`	`61`	`}`
`62`	`62`
`63`	`63`	`const VEC4_T intex = t_in[t_id];`
`64`		`- ivec4 tensor_idx = to_texel_pos(t_id, gpu_strides, packed_dim);`
	`64`	`+ ivec4 tensor_idx = to_tensor_idx(t_id, gpu_strides, packed_dim);`
`65`	`65`	`tensor_idx[packed_dim] *= 4;`
`66`	`66`	`write_out_texel(intex, tensor_idx);`
`67`	`67`	`}`