Skip to content

Support int8 texture tensors without requiring int8 buffers #4485

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions backends/vulkan/runtime/graph/ComputeGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,24 +319,20 @@ utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) {
return image_extents_of(idx);
}

utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
utils::uvec3 ComputeGraph::create_local_wg_size(
const utils::uvec3 global_wg_size) {
if (config_.enable_local_wg_size_override) {
return config_.local_wg_size_override;
}

if (is_buffer_storage(idx)) {
return {64u, 1u, 1u};
}

const utils::uvec3 image_extents = image_extents_of(idx);
utils::uvec3 local_group_size = {4, 4, 4};

if (image_extents.data[2u] == 1) {
if (image_extents.data[1u] == 1) {
if (global_wg_size.data[2u] == 1) {
if (global_wg_size.data[1u] == 1) {
local_group_size.data[0u] = 64;
local_group_size.data[1u] = 1;
local_group_size.data[2u] = 1;
} else if (image_extents.data[1u] < 8) {
} else if (global_wg_size.data[1u] < 8) {
local_group_size.data[0u] = 16;
local_group_size.data[1u] = 4;
local_group_size.data[2u] = 1;
Expand All @@ -349,6 +345,10 @@ utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
return local_group_size;
}

utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
  // Convenience overload: treat the tensor's image extents as the global
  // workgroup size and delegate to the uvec3 overload above.
  const utils::uvec3 global_wg_size = image_extents_of(idx);
  return create_local_wg_size(global_wg_size);
}

void ComputeGraph::copy_into_staging(
const ValueRef idx,
const void* data,
Expand Down
28 changes: 24 additions & 4 deletions backends/vulkan/runtime/graph/ComputeGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,9 @@ class ComputeGraph final {
return values_.at(idx).type();
}

// Get Tensor Property
//
// Tensor Properties Accessors
//

std::vector<int64_t> sizes_of(const ValueRef idx) const;

Expand Down Expand Up @@ -226,7 +228,9 @@ class ComputeGraph final {
return values_.at(idx).toTensor().ntexels_ubo();
}

//
// Scalar Value Extraction
//

template <typename T>
T extract_scalar(const ValueRef idx) {
Expand Down Expand Up @@ -459,16 +463,21 @@ class ComputeGraph final {
utils::uvec3 create_global_wg_size(const ValueRef idx);

/*
* Suggest a local workgroup size for a given `api::vTensor` value, assuming
* that every shader invocation calculates one texel element of the output
* tensor.
* Suggest a local workgroup size for a given global workgroup size.
*
* The local workgroup size will be formed to try and minimize the number of
* inactive invocations.
*
* Currently, the local workgroup size is hard-coded to contain a total of 64
* shader invocations. In the future, this value can be configured.
*/
utils::uvec3 create_local_wg_size(const utils::uvec3 global_wg_size);

/*
* Convenience function to suggest a local workgroup size for a given
* `api::vTensor` value, assuming that every shader invocation calculates one
* texel element of the output tensor.
*/
utils::uvec3 create_local_wg_size(const ValueRef idx);

//
Expand Down Expand Up @@ -500,6 +509,17 @@ class ComputeGraph final {
void resize_input(const int64_t idx, const std::vector<int64_t>& new_sizes);
void propagate_resize();

//
// Miscellaneous Utilities
//

  /*
   * Check whether the GPU supports 8 bit buffers.
   *
   * Used when selecting staging shaders: if the device lacks full int8
   * buffer support, int8 texture tensors fall back to shaders that pack /
   * unpack 8-bit values inside regular 32-bit integer buffers.
   */
  inline bool int8_buffers_enabled() const {
    return context_->adapter_ptr()->has_full_int8_buffers_support();
  }

//
// Debug support (implemented in Logging.cpp)
//
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ ivec4 from_nchw_buffer_i(int buf_i, ivec4 sizes) {
* Returns: The (x, y, z, n) texel position corresponding to the first element
* of the texel at the specified buffer index
*/
ivec4 to_texel_pos(int buf_i, ivec4 strides, int packed_dim) {
ivec4 to_tensor_idx(int buf_i, ivec4 strides, int packed_dim) {
ivec4 idx;
for (int i = 3; i >= 0; i--) {
if (i != packed_dim) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#include "indexing_utils.h"

layout(std430) buffer;

#extension GL_EXT_control_flow_attributes : require

${layout_declare_tensor(0, "r", "t_in", "int8", "texture3d")}
${layout_declare_buffer(1, "w", "nchw_out", "int")}
${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
${layout_declare_ubo(3, "int", "out_ntexels")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

/*
 * Each invocation packs 4 consecutive int8 tensor elements into one 32-bit
 * word of the output buffer, so int8 texture data can be written to a
 * staging buffer without requiring 8-bit buffer support.
 */
void main() {
  const int out_buf_idx = int(gl_GlobalInvocationID.x);
  // One invocation per 32-bit output word; guard against extra invocations
  // dispatched due to workgroup-size rounding.
  if (out_buf_idx >= out_ntexels) {
    return;
  }

  ivec4 values;
  // Each output word consumes 4 consecutive NCHW buffer elements.
  // NOTE(review): assumes the staging buffer length is padded to a multiple
  // of 4 elements so these reads stay in range — confirm against host code.
  int in_buf_idx = 4 * out_buf_idx;

  [[unroll]] for (int i = 0; i < 4; ++i) {
    const ivec4 tensor_idx = from_nchw_buffer_i(in_buf_idx, tensor_sizes);
    const ivec4 texture_pos = to_texture_elem_pos(
        tensor_idx, tensor_sizes, packed_dim);
    // texture_pos.w selects the component within the loaded texel.
    values[i] = load_texel(t_in, texture_pos.xyz)[texture_pos.w];
    in_buf_idx++;
  }

  // Manually pack 4x 8-bit integers into a 32 bit integer. Note that little
  // endian is assumed, since most processors use little endian. Thus the
  // "later" values are placed in most significant bytes.
  int packed = ((values[3] & 0xFF) << 24)
      | ((values[2] & 0xFF) << 16)
      | ((values[1] & 0xFF) << 8)
      | ((values[0] & 0xFF));

  nchw_out[out_buf_idx] = packed;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#include "indexing_utils.h"

layout(std430) buffer;

#extension GL_EXT_control_flow_attributes : require

${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")}
${layout_declare_buffer(1, "r", "nchw_in", "int")}
${layout_declare_ubo(2, "ivec4", "tensor_sizes")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

/*
 * Sign-extends an int8 value held in the low byte of a 32-bit int.
 */
int extend_sign(int x) {
  // Bit 7 is the int8 sign bit; when set, fill the upper 24 bits with ones.
  return (x >> 7 == 1) ? (x | 0xFFFFFF00) : x;
}

/*
 * Reads the texel at tensor_idx by unpacking 4x 8-bit values from the
 * 32-bit integer staging buffer nchw_in.
 */
ivec4 read_texel(ivec4 tensor_idx) {
  // NCHW buffer index (in elements) of each of the texel's 4 components.
  const ivec4 buf_indices = get_texel_nchw_buffer_ixs(
      tensor_idx, tensor_sizes, packed_dim);

  // 0xFF: selects a single byte within a packed 32-bit word.
  int shift = (1 << 8) - 1;
  ivec4 masks;
  // Masks used to unpack 4x 8-bit values from a 32 bit integer. Note that
  // little endian is assumed, as most processors use little endian. Thus the
  // most significant bytes correspond to the "latter" packed values.
  masks.x = shift << (8 * (buf_indices.x % 4));
  masks.y = shift << (8 * (buf_indices.y % 4));
  masks.z = shift << (8 * (buf_indices.z % 4));
  masks.w = shift << (8 * (buf_indices.w % 4));

  ivec4 out_tex = ivec4(0);

  [[unroll]] for (int i = 0; i < 4; ++i) {
    // Components past the tensor's extent along the packed dim are left as
    // zero padding.
    if (tensor_idx[packed_dim] + i < tensor_sizes[packed_dim]) {
      // Each int32 in nchw_in holds 4 consecutive int8 elements.
      int in_texel = nchw_in[buf_indices[i] / 4];
      // NOTE(review): GLSL >> on signed int is arithmetic, so the byte in
      // the most significant position arrives already sign-extended here;
      // extend_sign then leaves such values unchanged.
      int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4));
      extracted_val = extend_sign(extracted_val);
      out_tex[i] = extracted_val;
    }
  }

  return out_tex;
}

/*
 * Each invocation writes one texel of the output int8 texture, sourcing its
 * 4 components from the packed int32 staging buffer.
 */
void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);
  // Tensor index of the first element of the texel this invocation writes.
  const ivec4 tensor_idx = to_tensor_idx(pos, tensor_sizes, packed_dim);

  // Skip invocations that fall outside the tensor (workgroup rounding).
  if (any(greaterThanEqual(tensor_idx, tensor_sizes))) {
    return;
  }

  write_texel(t_out, pos, read_texel(tensor_idx));
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void main() {
return;
}

ivec4 tensor_idx = to_texel_pos(t_id, gpu_strides, packed_dim);
ivec4 tensor_idx = to_tensor_idx(t_id, gpu_strides, packed_dim);
tensor_idx[packed_dim] *= 4;
t_out[t_id] = read_texel(tensor_idx);
}
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ void main() {
return;
}

const ivec4 out_pos = to_texel_pos(t_id, out_strides, 0);
const ivec4 out_pos = to_tensor_idx(t_id, out_strides, 0);

VEC4_T outtex = q_8w_linear(out_pos, mat1_sizes.x);
write_texel(t_out, t_id, outtex);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ void main() {
}

const VEC4_T intex = t_in[t_id];
ivec4 tensor_idx = to_texel_pos(t_id, gpu_strides, packed_dim);
ivec4 tensor_idx = to_tensor_idx(t_id, gpu_strides, packed_dim);
tensor_idx[packed_dim] *= 4;
write_out_texel(intex, tensor_idx);
}
Expand Down
31 changes: 24 additions & 7 deletions backends/vulkan/runtime/graph/ops/impl/Staging.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ void add_staging_to_tensor_node(
const ValueRef out_tensor) {
VK_CHECK_COND(graph.val_is_staging(in_staging));

vkapi::ShaderInfo shader =
get_nchw_to_tensor_shader(*graph.get_tensor(out_tensor));
vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
*graph.get_tensor(out_tensor), graph.int8_buffers_enabled());

vkapi::ParamsBindList ubos({graph.sizes_ubo(out_tensor)});
if (graph.is_buffer_storage(out_tensor)) {
Expand Down Expand Up @@ -55,10 +55,26 @@ void add_tensor_to_staging_node(
const ValueRef out_staging) {
VK_CHECK_COND(graph.val_is_staging(out_staging));

vkapi::ShaderInfo shader =
get_tensor_to_nchw_shader(*graph.get_tensor(in_tensor));
vkapi::ShaderInfo shader = get_tensor_to_nchw_shader(
*graph.get_tensor(in_tensor), graph.int8_buffers_enabled());

utils::uvec3 global_wg_size = graph.create_global_wg_size(in_tensor);
vkapi::ParamsBindList ubos({graph.sizes_ubo(in_tensor)});

// Normally, the tensor_to_nchw shader is structured so that each thread reads
// one texel from the input texture and writes each component of the texel
// into the corresponding location in the output buffer. However, this shader
// is structured slightly differently in that each thread writes out a
// complete 32 bit integer (containing 4 packed 8-bit integers) into the
// output buffer. Therefore, the global work group size for this shader will
// be the number of elements in the output buffer divided by 4, as opposed to
// the extents of the input texture.
if (shader.kernel_name == "int8_tensor_to_nchw_noint8") {
uint32_t buffer_len = graph.get_staging(out_staging)->numel() / 4;
global_wg_size = {buffer_len, 1, 1};
ubos.append({graph.ntexels_ubo(in_tensor)});
}

if (graph.is_buffer_storage(in_tensor)) {
ubos.append({
graph.texel_strides_ubo(in_tensor),
Expand All @@ -69,8 +85,8 @@ void add_tensor_to_staging_node(
graph.execute_nodes().emplace_back(new ExecuteNode(
graph,
shader,
graph.create_global_wg_size(in_tensor),
graph.create_local_wg_size(in_tensor),
global_wg_size,
graph.create_local_wg_size(global_wg_size),
// Input and Outputs
{{in_tensor, vkapi::MemoryAccessType::READ},
{out_staging, vkapi::MemoryAccessType::WRITE}},
Expand All @@ -86,7 +102,8 @@ ValueRef prepack(
const utils::GPUMemoryLayout layout) {
ValueRef v = graph.add_tensor_like(vref, layout);

vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(*graph.get_tensor(v));
vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
*graph.get_tensor(v), graph.int8_buffers_enabled());

vkapi::ParamsBindList ubos({graph.sizes_ubo(v)});
if (graph.is_buffer_storage(v)) {
Expand Down
18 changes: 16 additions & 2 deletions backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,21 +95,35 @@ void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes) {
memset(data_ptr, 0, staging.nbytes());
}

/*
 * Selects the shader used to copy NCHW staging data into `v_dst`.
 *
 * If `v_dst` is an int8 texture tensor and the GPU lacks full int8 buffer
 * support (`int8_buffer_enabled` is false), a fallback shader is returned
 * that manually unpacks 4x 8-bit values from each 32-bit buffer word.
 */
vkapi::ShaderInfo get_nchw_to_tensor_shader(
    const api::vTensor& v_dst,
    const bool int8_buffer_enabled) {
  // Fallback path checked first so the generic kernel-name string is only
  // built when it is actually needed.
  if (v_dst.dtype() == vkapi::kChar &&
      v_dst.storage_type() == utils::kTexture3D && !int8_buffer_enabled) {
    return VK_KERNEL(nchw_to_int8_tensor_noint8);
  }

  std::string kernel_name;
  kernel_name.reserve(kShaderNameReserve);
  kernel_name = "nchw_to_tensor";
  add_dtype_suffix(kernel_name, v_dst);
  add_storage_type_suffix(kernel_name, v_dst);

  return VK_KERNEL_FROM_STR(kernel_name);
}

vkapi::ShaderInfo get_tensor_to_nchw_shader(const api::vTensor& v_src) {
vkapi::ShaderInfo get_tensor_to_nchw_shader(
const api::vTensor& v_src,
bool int8_buffer_enabled) {
std::string kernel_name;
kernel_name.reserve(kShaderNameReserve);

if (v_src.dtype() == vkapi::kChar &&
v_src.storage_type() == utils::kTexture3D && !int8_buffer_enabled) {
return VK_KERNEL(int8_tensor_to_nchw_noint8);
}

kernel_name = "tensor_to_nchw";
add_dtype_suffix(kernel_name, v_src);
add_storage_type_suffix(kernel_name, v_src);
Expand Down
8 changes: 6 additions & 2 deletions backends/vulkan/runtime/graph/ops/utils/StagingUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes);
// Functions to get shaders
//

/*
 * Returns the shader that copies NCHW staging data into `v_dst`. When
 * `int8_buffer_enabled` is false, int8 texture tensors get a fallback shader
 * that does not require 8-bit buffer support.
 */
vkapi::ShaderInfo get_nchw_to_tensor_shader(
    const api::vTensor& v_dst,
    bool int8_buffer_enabled = true);

/*
 * Returns the shader that copies `v_src` into NCHW staging data. When
 * `int8_buffer_enabled` is false, int8 texture tensors get a fallback shader
 * that packs 4x 8-bit values into 32-bit buffer words.
 */
vkapi::ShaderInfo get_tensor_to_nchw_shader(
    const api::vTensor& v_src,
    bool int8_buffer_enabled = true);

} // namespace vkcompute
Loading
Loading