pytorch · dbort · Oct 8, 2024
@@ -10,16 +10,14 @@
 
 #define PRECISION ${PRECISION}
 
-${define_active_storage_type(STORAGE)}
-
 #include "indexing_utils.h"
 
 layout(std430) buffer;
 
 #extension GL_EXT_control_flow_attributes : require
 
 ${layout_declare_buffer(B, "w", "nchw_out", "int")}
-${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_in", "int8", "texture3d")}
 ${layout_declare_ubo(B, "ivec4", "tensor_sizes")}
 ${layout_declare_ubo(B, "ivec4", "axis_map")}
 ${layout_declare_ubo(B, "int", "out_numel")}
@@ -46,7 +44,7 @@ void main() {
     const ivec4 tidx = nchwi_to_tidx(in_buf_idx, tensor_sizes);
     const ivec4 texture_pos = to_texture_elem_pos(
         tidx, tensor_sizes, packed_dim);
-    values[i] = ivec4(load_texel(t_in, texture_pos.xyz))[texture_pos.w];
+    values[i] = load_texel(t_in, texture_pos.xyz)[texture_pos.w];
     in_buf_idx++;
   }
 

@@ -10,17 +10,13 @@
 
 #define PRECISION ${PRECISION}
 
-#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
-
-${define_active_storage_type(STORAGE)}
-
 #include "indexing_utils.h"
 
 layout(std430) buffer;
 
 #extension GL_EXT_control_flow_attributes : require
 
-${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "w", "t_out", "int8", "texture3d")}
 ${layout_declare_buffer(B, "r", "nchw_in", "int")}
 ${layout_declare_ubo(B, "ivec4", "sizes")}
 ${layout_declare_ubo(B, "ivec4", "axis_map")}
@@ -75,5 +71,5 @@ void main() {
     return;
   }
 
-  write_texel(t_out, lpos_to_pos(lpos, axis_map), VEC4_T(read_texel(tidx)));
+  write_texel(t_out, lpos_to_pos(lpos, axis_map), read_texel(tidx));
 }
@@ -80,7 +80,7 @@ void add_tensor_to_staging_node(
   // output buffer. Therefore, the global work group size for this shader will
   // be the number of elements in the output buffer divided by 4, as opposed to
   // the extents of the input texture.
-  if (shader.kernel_name.starts_with("bitw8_image_to_nchw_nobitw8buffer")) {
+  if (shader.kernel_name == "int8_image_to_nchw_noint8") {
     uint32_t buffer_len = graph.get_staging(out_staging)->numel() / 4;
     global_wg_size = {buffer_len, 1, 1};
     ubos.append({graph.numel_ubo(in_tensor)});

@@ -15,23 +15,15 @@
 
 namespace vkcompute {
 
-bool is_bitw8(vkapi::ScalarType dtype) {
-  return dtype == vkapi::kByte || dtype == vkapi::kChar ||
-      dtype == vkapi::kQInt8 || dtype == vkapi::kQUInt8;
-}
-
 vkapi::ShaderInfo get_nchw_to_tensor_shader(
     const api::vTensor& v_dst,
     const bool int8_buffer_enabled) {
   std::string kernel_name;
   kernel_name.reserve(kShaderNameReserve);
 
-  if (is_bitw8(v_dst.dtype()) && v_dst.storage_type() != utils::kBuffer &&
-      !int8_buffer_enabled) {
-    kernel_name = "nchw_to_bitw8_image_nobitw8buffer";
-    add_dtype_suffix(kernel_name, v_dst);
-    add_storage_type_suffix(kernel_name, v_dst);
-    return VK_KERNEL_FROM_STR(kernel_name);
+  if (v_dst.dtype() == vkapi::kChar &&
+      v_dst.storage_type() == utils::kTexture3D && !int8_buffer_enabled) {
+    return VK_KERNEL(nchw_to_int8_image_noint8);
   }
 
   if (v_dst.storage_type() == utils::kBuffer) {
@@ -53,12 +45,9 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader(
   std::string kernel_name;
   kernel_name.reserve(kShaderNameReserve);
 
-  if (is_bitw8(v_src.dtype()) && v_src.storage_type() != utils::kBuffer &&
-      !int8_buffer_enabled) {
-    kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
-    add_dtype_suffix(kernel_name, v_src);
-    add_storage_type_suffix(kernel_name, v_src);
-    return VK_KERNEL_FROM_STR(kernel_name);
+  if (v_src.dtype() == vkapi::kChar &&
+      v_src.storage_type() == utils::kTexture3D && !int8_buffer_enabled) {
+    return VK_KERNEL(int8_image_to_nchw_noint8);
   }
 
   if (v_src.storage_type() == utils::kBuffer) {

@@ -111,20 +111,15 @@ void record_image_to_nchw_op(
       v_src.axis_map_ubo());
 }
 
-void record_bitw8_image_to_nchw_nobitw8buffer_op(
+void record_int8_image_to_nchw_noint8_op(
     api::Context* const context,
     api::vTensor& v_src,
     api::StagingBuffer& dst_buffer) {
   vkapi::PipelineBarrier pipeline_barrier{};
   uint32_t buffer_len = utils::safe_downcast<uint32_t>(dst_buffer.numel() / 4);
   utils::uvec3 global_wg_size = {buffer_len, 1, 1};
-
-  std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer";
-  add_dtype_suffix(kernel_name, v_src);
-  add_storage_type_suffix(kernel_name, v_src);
-
   context->submit_compute_job(
-      VK_KERNEL_FROM_STR(kernel_name),
+      VK_KERNEL(int8_image_to_nchw_noint8),
       pipeline_barrier,
       global_wg_size,
       adaptive_work_group_size(global_wg_size),

@@ -84,7 +84,7 @@ void record_image_to_nchw_op(
     vkcompute::api::vTensor& v_src,
     vkcompute::vkapi::VulkanBuffer& dst_buffer);
 
-void record_bitw8_image_to_nchw_nobitw8buffer_op(
+void record_int8_image_to_nchw_noint8_op(
     vkcompute::api::Context* const context,
     vkcompute::api::vTensor& v_src,
     vkcompute::api::StagingBuffer& dst_buffer);

@@ -2365,8 +2365,7 @@ void run_from_gpu_test(
 
   if (dtype == vkapi::kChar &&
       !context()->adapter_ptr()->has_full_int8_buffers_support()) {
-    record_bitw8_image_to_nchw_nobitw8buffer_op(
-        context(), vten, staging_buffer);
+    record_int8_image_to_nchw_noint8_op(context(), vten, staging_buffer);
   } else {
     record_image_to_nchw_op(context(), vten, staging_buffer.buffer());
   }
@@ -2413,8 +2412,7 @@ void round_trip_test(
   // Copy data in and out of the tensor
   if (dtype == vkapi::kChar &&
       !context()->adapter_ptr()->has_full_int8_buffers_support()) {
-    record_bitw8_image_to_nchw_nobitw8buffer_op(
-        context(), vten, staging_buffer_out);
+    record_int8_image_to_nchw_noint8_op(context(), vten, staging_buffer_out);
   } else {
     record_image_to_nchw_op(context(), vten, staging_buffer_out.buffer());
   }