Skip to content

Commit 41ec7fa

Browse files
authored
[ET-VK] Integrate axis mapping into staging <-> image transfer shaders
Differential Revision: D62210117 Pull Request resolved: #5093
1 parent 9739609 commit 41ec7fa

File tree

9 files changed

+136
-31
lines changed

9 files changed

+136
-31
lines changed

backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)}
2121

2222
layout(std430) buffer;
2323

24-
${layout_declare_buffer(0, "w", "nchw_out", DTYPE)}
25-
${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
26-
${layout_declare_ubo(2, "ivec4", "sizes")}
24+
${layout_declare_buffer(B, "w", "nchw_out", DTYPE)}
25+
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
26+
${layout_declare_ubo(B, "ivec4", "sizes")}
27+
${layout_declare_ubo(B, "ivec4", "axis_mapping")}
2728

2829
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2930

@@ -51,7 +52,7 @@ void write_out_texel(VEC4_T texel, ivec4 tensor_idx) {
5152

5253
void main() {
5354
const ivec3 pos = ivec3(gl_GlobalInvocationID);
54-
const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim);
55+
const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);
5556

5657
if (any(greaterThanEqual(tensor_idx, sizes))) {
5758
return;

backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,42 @@ ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) {
183183
return tensor_idx;
184184
}
185185

186+
/*
 * Derive (w,h,c,n) tensor indices from (x,y,z) texture position using axis
 * mapping.
 */
ivec4 to_tensor_idx(
    ivec3 pos,
    ivec4 sizes,
    const ivec4 axis_mapping,
    const int packed_dim) {
  // Texels are padded along the packed dim; round that size up to the next
  // multiple of 4 so the indices computed below account for the padding.
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  // One texel step along the packed axis spans 4 tensor elements.
  pos[axis_mapping[packed_dim]] *= 4;

  // Gather the W, H, C indices from their mapped texture axes.
  ivec4 tensor_idx;
  tensor_idx.x = pos[axis_mapping.x];
  tensor_idx.y = pos[axis_mapping.y];
  tensor_idx.z = pos[axis_mapping.z];
  tensor_idx.w = 0;

  // When there is more than one batch, batches are concatenated along one of
  // the WHCN dims; axis_mapping[3] names that WHCN dim (it is NOT a texture
  // axis). Unfold the combined coordinate into (batch, within-batch).
  if (sizes.w > 1) {
    const int concat_dim = axis_mapping.w;
    tensor_idx.w = tensor_idx[concat_dim] / sizes[concat_dim];
    tensor_idx[concat_dim] %= sizes[concat_dim];
  }

  return tensor_idx;
}
221+
186222
/*
187223
* Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim
188224
* is packed along a texel
@@ -199,6 +235,34 @@ ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
199235
return pos;
200236
}
201237

238+
/*
 * Derive (x,y,z) texture position from (w,h,c,n) tensor indices using axis
 * mapping. Inverse of the axis-mapping overload of to_tensor_idx.
 */
ivec3 to_texture_pos(
    const ivec4 idx,
    ivec4 sizes,
    const ivec4 axis_mapping,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  // Scatter the W, H, C indices onto their mapped texture axes.
  ivec3 pos;
  for (int dim = 0; dim < 3; ++dim) {
    pos[axis_mapping[dim]] = idx[dim];
  }

  // Adjust batch dim if needed. axis_mapping[3] is the WHCN dim index of the
  // dim batches are concatenated along (not a texture axis), so the batch
  // offset is the batch index times that dim's extent. Using sizes.w (the
  // batch count) here would not invert to_tensor_idx, which recovers the
  // batch index via `/ sizes[axis_mapping[3]]`.
  if (sizes.w > 1) {
    pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes[axis_mapping[3]];
  }

  // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4
  // tensor elements in that dim.
  pos[axis_mapping[packed_dim]] /= 4;
  return pos;
}
265+
202266
/*
203267
* Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim
204268
* is packed along a texel
@@ -218,6 +282,35 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
218282
return pos;
219283
}
220284

285+
/*
 * Derive (x,y,z,i) texel element position from the (w,h,c,n) tensor index
 * using the axis mapping. The .w component is the element's offset within
 * its texel along the packed dim.
 */
ivec4 to_texture_elem_pos(
    const ivec4 idx,
    ivec4 sizes,
    const ivec4 axis_mapping,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  // Scatter the W, H, C indices onto their mapped texture axes.
  ivec4 pos;
  for (int dim = 0; dim < 3; ++dim) {
    pos[axis_mapping[dim]] = idx[dim];
  }

  // Adjust batch dim if needed. The batch offset along the concatenated dim
  // is the batch index times that dim's extent (sizes[axis_mapping[3]]);
  // multiplying by the batch count (sizes.w) would not invert the batch
  // unfold performed by to_tensor_idx.
  if (sizes.w > 1) {
    pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes[axis_mapping[3]];
  }

  // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4
  // tensor elements in that dim.
  pos[axis_mapping[packed_dim]] /= 4;
  pos.w = idx[packed_dim] % 4;
  return pos;
}
313+
221314
//
222315
// Texel Access and Storage
223316
//

backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@ layout(std430) buffer;
1616

1717
#extension GL_EXT_control_flow_attributes : require
1818

19-
${layout_declare_buffer(0, "w", "nchw_out", "int")}
20-
${layout_declare_tensor(1, "r", "t_in", "int8", "texture3d")}
21-
${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
22-
${layout_declare_ubo(3, "int", "out_numel")}
19+
${layout_declare_buffer(B, "w", "nchw_out", "int")}
20+
${layout_declare_tensor(B, "r", "t_in", "int8", "texture3d")}
21+
${layout_declare_ubo(B, "ivec4", "tensor_sizes")}
22+
${layout_declare_ubo(B, "ivec4", "axis_mapping")}
23+
${layout_declare_ubo(B, "int", "out_numel")}
2324

2425
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2526

backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)}
2121

2222
layout(std430) buffer;
2323

24-
${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
25-
${layout_declare_buffer(1, "r", "nchw_in", DTYPE)}
26-
${layout_declare_ubo(2, "ivec4", "sizes")}
24+
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
25+
${layout_declare_buffer(B, "r", "nchw_in", DTYPE)}
26+
${layout_declare_ubo(B, "ivec4", "sizes")}
27+
${layout_declare_ubo(B, "ivec4", "axis_mapping")}
2728

2829
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2930

@@ -53,7 +54,7 @@ VEC4_T read_texel(ivec4 tensor_idx) {
5354

5455
void main() {
5556
const ivec3 pos = ivec3(gl_GlobalInvocationID);
56-
const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim);
57+
const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);
5758
if (any(greaterThanEqual(tensor_idx, sizes))) {
5859
return;
5960
}

backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@ layout(std430) buffer;
1616

1717
#extension GL_EXT_control_flow_attributes : require
1818

19-
${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")}
20-
${layout_declare_buffer(1, "r", "nchw_in", "int")}
21-
${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
19+
${layout_declare_tensor(B, "w", "t_out", "int8", "texture3d")}
20+
${layout_declare_buffer(B, "r", "nchw_in", "int")}
21+
${layout_declare_ubo(B, "ivec4", "sizes")}
22+
${layout_declare_ubo(B, "ivec4", "axis_mapping")}
2223

2324
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2425

@@ -36,7 +37,7 @@ int extend_sign(int x) {
3637

3738
ivec4 read_texel(ivec4 tensor_idx) {
3839
const ivec4 buf_indices = get_texel_nchw_buffer_ixs(
39-
tensor_idx, tensor_sizes, packed_dim);
40+
tensor_idx, sizes, packed_dim);
4041

4142
int shift = (1 << 8) - 1;
4243
ivec4 masks;
@@ -51,7 +52,7 @@ ivec4 read_texel(ivec4 tensor_idx) {
5152
ivec4 out_tex = ivec4(0);
5253

5354
[[unroll]] for (int i = 0; i < 4; ++i) {
54-
if (tensor_idx[packed_dim] + i < tensor_sizes[packed_dim]) {
55+
if (tensor_idx[packed_dim] + i < sizes[packed_dim]) {
5556
int in_texel = nchw_in[buf_indices[i] / 4];
5657
int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4));
5758
extracted_val = extend_sign(extracted_val);
@@ -64,9 +65,9 @@ ivec4 read_texel(ivec4 tensor_idx) {
6465

6566
void main() {
6667
const ivec3 pos = ivec3(gl_GlobalInvocationID);
67-
const ivec4 tensor_idx = to_tensor_idx(pos, tensor_sizes, packed_dim);
68+
const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);
6869

69-
if (any(greaterThanEqual(tensor_idx, tensor_sizes))) {
70+
if (any(greaterThanEqual(tensor_idx, sizes))) {
7071
return;
7172
}
7273

backends/vulkan/runtime/graph/ops/impl/Convolution.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ ValueRef prepack_biases(
106106
graph.create_local_wg_size(v),
107107
vref,
108108
v,
109-
{t->sizes_ubo()},
109+
{t->sizes_ubo(), t->axis_mapping_ubo()},
110110
// Specialization constants
111111
{SV(t->packed_dim_whcn_idx())}));
112112

backends/vulkan/runtime/graph/ops/impl/Staging.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ void add_staging_to_tensor_node(
3131
graph.strides_ubo(out_tensor),
3232
graph.numel_ubo(out_tensor)});
3333
} else {
34-
ubos.append(graph.sizes_ubo(out_tensor));
34+
ubos.append(
35+
{graph.sizes_ubo(out_tensor), graph.axis_mapping_ubo(out_tensor)});
3536
}
3637

3738
graph.execute_nodes().emplace_back(new ExecuteNode(
@@ -69,7 +70,8 @@ void add_tensor_to_staging_node(
6970
graph.strides_ubo(in_tensor),
7071
graph.numel_ubo(in_tensor)});
7172
} else {
72-
ubos.append(graph.sizes_ubo(in_tensor));
73+
ubos.append(
74+
{graph.sizes_ubo(in_tensor), graph.axis_mapping_ubo(in_tensor)});
7375
}
7476

7577
// Normally, the image_to_nchw shader is structured so that each thread reads
@@ -113,7 +115,7 @@ ValueRef prepack(
113115
if (graph.is_buffer_storage(v)) {
114116
ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)});
115117
} else {
116-
ubos.append(graph.sizes_ubo(v));
118+
ubos.append({graph.sizes_ubo(v), graph.axis_mapping_ubo(v)});
117119
}
118120

119121
graph.prepack_nodes().emplace_back(new PrepackNode(

backends/vulkan/test/utils/test_utils.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ void record_nchw_to_image_op(
8585
vkapi::PipelineStage::COMPUTE,
8686
vkapi::MemoryAccessType::WRITE),
8787
src_buffer,
88-
v_dst.sizes_ubo());
88+
v_dst.sizes_ubo(),
89+
v_dst.axis_mapping_ubo());
8990
}
9091

9192
void record_image_to_nchw_op(
@@ -106,7 +107,8 @@ void record_image_to_nchw_op(
106107
0,
107108
dst_buffer,
108109
v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
109-
v_src.sizes_ubo());
110+
v_src.sizes_ubo(),
111+
v_src.axis_mapping_ubo());
110112
}
111113

112114
void record_int8_image_to_nchw_noint8_op(
@@ -127,6 +129,7 @@ void record_int8_image_to_nchw_noint8_op(
127129
dst_buffer.buffer(),
128130
v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
129131
v_src.sizes_ubo(),
132+
v_src.axis_mapping_ubo(),
130133
v_src.numel_ubo());
131134
}
132135

backends/vulkan/test/vulkan_compute_api_test.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,8 +1233,8 @@ TEST(VulkanComputeGraphTest, test_simple_graph) {
12331233
GraphConfig config;
12341234
ComputeGraph graph(config);
12351235

1236-
std::vector<int64_t> size_big = {8, 64, 124};
1237-
std::vector<int64_t> size_small = {8, 1, 124};
1236+
std::vector<int64_t> size_big = {1, 8, 8};
1237+
std::vector<int64_t> size_small = {1, 1, 8};
12381238

12391239
// Build graph
12401240

@@ -1415,8 +1415,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
14151415
/*shared_object_idx = */ 4);
14161416

14171417
// +2: t.sizes_ubo() for each staging shader
1418+
// +2: t.axis_mapping_ubo() for each staging shader
14181419
// +2: staging buffer for each input tensor
1419-
EXPECT_TRUE(get_vma_allocation_count() == 4);
1420+
EXPECT_TRUE(get_vma_allocation_count() == 6);
14201421

14211422
ValueRef c = graph.add_tensor(
14221423
size_big,
@@ -1433,8 +1434,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
14331434

14341435
// +2: alpha UBO, broadcast UBO for arithmetic shader
14351436
// +1: t.sizes_ubo() uniform buffer for staging shader
1437+
// +1: t.axis_mapping_ubo() uniform buffer for staging shader
14361438
// +1: staging buffer for the input tensor
1437-
EXPECT_TRUE(get_vma_allocation_count() == 9);
1439+
EXPECT_TRUE(get_vma_allocation_count() == 12);
14381440

14391441
ValueRef e = graph.add_tensor(
14401442
size_big,
@@ -1450,14 +1452,15 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
14501452

14511453
// +2: alpha UBO, broadcast UBO for arithmetic shader
14521454
// +1: t.sizes_ubo() for staging shader
1455+
// +1: t.axis_mapping_ubo() for staging shader
14531456
// +1 staging buffer for the input tensor
1454-
EXPECT_TRUE(get_vma_allocation_count() == 13);
1457+
EXPECT_TRUE(get_vma_allocation_count() == 17);
14551458

14561459
graph.prepare();
14571460
graph.encode_execute();
14581461

14591462
// +3: shared memory allocations for tensors
1460-
EXPECT_TRUE(get_vma_allocation_count() == 16);
1463+
EXPECT_TRUE(get_vma_allocation_count() == 20);
14611464

14621465
// Run graph
14631466

0 commit comments

Comments
 (0)