Skip to content

Commit f549d97

Browse files
committed
[ET-VK] Integrate axis mapping into staging <-> image transfer shaders
Pull Request resolved: #5093 ## Context Building on the previous diff, this diff integrates axis mapping into staging <-> image transfer shaders. Alternative versions of indexing utility functions are introduced to account for axis mapping. The impact on shader latency of using axis mapping in transfer shaders is examined in the next diff. Differential Revision: [D62210117](https://our.internmc.facebook.com/intern/diff/D62210117/) ghstack-source-id: 241249802
1 parent 8b4db6a commit f549d97

File tree

9 files changed

+136
-31
lines changed

9 files changed

+136
-31
lines changed

backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)}
2121

2222
layout(std430) buffer;
2323

24-
${layout_declare_buffer(0, "w", "nchw_out", DTYPE)}
25-
${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
26-
${layout_declare_ubo(2, "ivec4", "sizes")}
24+
${layout_declare_buffer(B, "w", "nchw_out", DTYPE)}
25+
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
26+
${layout_declare_ubo(B, "ivec4", "sizes")}
27+
${layout_declare_ubo(B, "ivec4", "axis_mapping")}
2728

2829
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2930

@@ -51,7 +52,7 @@ void write_out_texel(VEC4_T texel, ivec4 tensor_idx) {
5152

5253
void main() {
5354
const ivec3 pos = ivec3(gl_GlobalInvocationID);
54-
const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim);
55+
const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);
5556

5657
if (any(greaterThanEqual(tensor_idx, sizes))) {
5758
return;

backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,42 @@ ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) {
183183
return tensor_idx;
184184
}
185185

186+
/*
 * Derive (w,h,c,n) tensor indices from an (x,y,z) texture position using an
 * axis mapping.
 *
 * axis_mapping[0..2] give, for each of the W/H/C tensor dims, the texture axis
 * (0=x, 1=y, 2=z) that the dim is laid out along. axis_mapping[3] gives the
 * WHCN index of the dim that the batch dim is concatenated with (it is a
 * tensor-dim index, not a texture axis).
 *
 * sizes is the (W,H,C,N) size of the tensor; packed_dim is the WHCN index of
 * the dim packed along a texel (4 elements per texel).
 */
ivec4 to_tensor_idx(
    ivec3 pos,
    ivec4 sizes,
    const ivec4 axis_mapping,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  // Packed dim contains 4 elements per texel, so moving 1 unit traverses 4
  // elements in the tensor.
  pos[axis_mapping[packed_dim]] *= 4;

  ivec4 tensor_idx;
  for (int dim = 0; dim < 3; ++dim) {
    tensor_idx[dim] = pos[axis_mapping[dim]];
  }

  // Early return if batch is 1. Batch index will be 0.
  if (sizes.w == 1) {
    tensor_idx.w = 0;
    return tensor_idx;
  }

  // Else, adjust the dim that's concatenated with batch. Note that the axis
  // mapping for the batch dim indicates WHCN dim index of the dim that it is
  // concatenated with, not a texture axis. Each batch occupies
  // sizes[axis_mapping[3]] (padded) elements along that dim, so dividing by
  // that extent recovers the batch index and the remainder is the in-batch
  // index along the concatenated dim.
  tensor_idx.w = tensor_idx[axis_mapping[3]] / sizes[axis_mapping[3]];
  tensor_idx[axis_mapping[3]] %= sizes[axis_mapping[3]];

  return tensor_idx;
}
221+
186222
/*
187223
* Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim
188224
* is packed along a texel
@@ -199,6 +235,34 @@ ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
199235
return pos;
200236
}
201237

238+
/*
 * Derive (x,y,z) texture position from (w,h,c,n) tensor indices using axis
 * mapping. Inverse of the axis-mapping overload of to_tensor_idx().
 *
 * axis_mapping[0..2] give the texture axis for each of the W/H/C dims;
 * axis_mapping[3] gives the WHCN index of the dim that batches are
 * concatenated along. sizes is the (W,H,C,N) tensor size; packed_dim is the
 * WHCN index of the texel-packed dim.
 */
ivec3 to_texture_pos(
    const ivec4 idx,
    ivec4 sizes,
    const ivec4 axis_mapping,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec3 pos;
  for (int dim = 0; dim < 3; ++dim) {
    pos[axis_mapping[dim]] = idx[dim];
  }

  // Adjust batch dim if needed. Each batch occupies sizes[axis_mapping[3]]
  // (padded) elements along the concatenated dim, so offset by the batch
  // index times that extent — NOT times the batch count (sizes.w), which
  // would only coincide when the two happen to be equal. This mirrors
  // to_tensor_idx(), which divides by sizes[axis_mapping[3]].
  if (sizes.w > 1) {
    pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes[axis_mapping[3]];
  }

  // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4
  // tensor elements in that dim.
  pos[axis_mapping[packed_dim]] /= 4;
  return pos;
}
265+
202266
/*
203267
* Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim
204268
* is packed along a texel
@@ -218,6 +282,35 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
218282
return pos;
219283
}
220284

285+
/*
 * Derive (x,y,z,i) texel element position from the (w,h,c,n) tensor index
 * using the axis mapping: (x,y,z) is the texel's texture position and i is
 * the element's index within the texel (0..3 along the packed dim).
 *
 * axis_mapping[0..2] give the texture axis for each of the W/H/C dims;
 * axis_mapping[3] gives the WHCN index of the dim that batches are
 * concatenated along. sizes is the (W,H,C,N) tensor size; packed_dim is the
 * WHCN index of the texel-packed dim.
 */
ivec4 to_texture_elem_pos(
    const ivec4 idx,
    ivec4 sizes,
    const ivec4 axis_mapping,
    const int packed_dim) {
  // Align packed dim to next multiple of 4 to account for texel padding
  sizes[packed_dim] = alignup4(sizes[packed_dim]);

  ivec4 pos;
  for (int dim = 0; dim < 3; ++dim) {
    pos[axis_mapping[dim]] = idx[dim];
  }

  // Adjust batch dim if needed. Each batch occupies sizes[axis_mapping[3]]
  // (padded) elements along the concatenated dim, so offset by the batch
  // index times that extent — NOT times the batch count (sizes.w). This
  // mirrors to_tensor_idx(), which divides by sizes[axis_mapping[3]].
  if (sizes.w > 1) {
    pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes[axis_mapping[3]];
  }

  // Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4
  // tensor elements in that dim; the remainder selects the element within
  // the texel.
  pos[axis_mapping[packed_dim]] /= 4;
  pos.w = idx[packed_dim] % 4;
  return pos;
}
313+
221314
//
222315
// Texel Access and Storage
223316
//

backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@ layout(std430) buffer;
1616

1717
#extension GL_EXT_control_flow_attributes : require
1818

19-
${layout_declare_buffer(0, "w", "nchw_out", "int")}
20-
${layout_declare_tensor(1, "r", "t_in", "int8", "texture3d")}
21-
${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
22-
${layout_declare_ubo(3, "int", "out_numel")}
19+
${layout_declare_buffer(B, "w", "nchw_out", "int")}
20+
${layout_declare_tensor(B, "r", "t_in", "int8", "texture3d")}
21+
${layout_declare_ubo(B, "ivec4", "tensor_sizes")}
22+
${layout_declare_ubo(B, "ivec4", "axis_mapping")}
23+
${layout_declare_ubo(B, "int", "out_numel")}
2324

2425
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2526

backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)}
2121

2222
layout(std430) buffer;
2323

24-
${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
25-
${layout_declare_buffer(1, "r", "nchw_in", DTYPE)}
26-
${layout_declare_ubo(2, "ivec4", "sizes")}
24+
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
25+
${layout_declare_buffer(B, "r", "nchw_in", DTYPE)}
26+
${layout_declare_ubo(B, "ivec4", "sizes")}
27+
${layout_declare_ubo(B, "ivec4", "axis_mapping")}
2728

2829
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2930

@@ -53,7 +54,7 @@ VEC4_T read_texel(ivec4 tensor_idx) {
5354

5455
void main() {
5556
const ivec3 pos = ivec3(gl_GlobalInvocationID);
56-
const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim);
57+
const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);
5758
if (any(greaterThanEqual(tensor_idx, sizes))) {
5859
return;
5960
}

backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@ layout(std430) buffer;
1616

1717
#extension GL_EXT_control_flow_attributes : require
1818

19-
${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")}
20-
${layout_declare_buffer(1, "r", "nchw_in", "int")}
21-
${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
19+
${layout_declare_tensor(B, "w", "t_out", "int8", "texture3d")}
20+
${layout_declare_buffer(B, "r", "nchw_in", "int")}
21+
${layout_declare_ubo(B, "ivec4", "sizes")}
22+
${layout_declare_ubo(B, "ivec4", "axis_mapping")}
2223

2324
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2425

@@ -36,7 +37,7 @@ int extend_sign(int x) {
3637

3738
ivec4 read_texel(ivec4 tensor_idx) {
3839
const ivec4 buf_indices = get_texel_nchw_buffer_ixs(
39-
tensor_idx, tensor_sizes, packed_dim);
40+
tensor_idx, sizes, packed_dim);
4041

4142
int shift = (1 << 8) - 1;
4243
ivec4 masks;
@@ -51,7 +52,7 @@ ivec4 read_texel(ivec4 tensor_idx) {
5152
ivec4 out_tex = ivec4(0);
5253

5354
[[unroll]] for (int i = 0; i < 4; ++i) {
54-
if (tensor_idx[packed_dim] + i < tensor_sizes[packed_dim]) {
55+
if (tensor_idx[packed_dim] + i < sizes[packed_dim]) {
5556
int in_texel = nchw_in[buf_indices[i] / 4];
5657
int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4));
5758
extracted_val = extend_sign(extracted_val);
@@ -64,9 +65,9 @@ ivec4 read_texel(ivec4 tensor_idx) {
6465

6566
void main() {
6667
const ivec3 pos = ivec3(gl_GlobalInvocationID);
67-
const ivec4 tensor_idx = to_tensor_idx(pos, tensor_sizes, packed_dim);
68+
const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);
6869

69-
if (any(greaterThanEqual(tensor_idx, tensor_sizes))) {
70+
if (any(greaterThanEqual(tensor_idx, sizes))) {
7071
return;
7172
}
7273

backends/vulkan/runtime/graph/ops/impl/Convolution.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ ValueRef prepack_biases(
106106
graph.create_local_wg_size(v),
107107
vref,
108108
v,
109-
{t->sizes_ubo()},
109+
{t->sizes_ubo(), t->axis_mapping_ubo()},
110110
// Specialization constants
111111
{SV(t->packed_dim_whcn_idx())}));
112112

backends/vulkan/runtime/graph/ops/impl/Staging.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ void add_staging_to_tensor_node(
3131
graph.strides_ubo(out_tensor),
3232
graph.numel_ubo(out_tensor)});
3333
} else {
34-
ubos.append(graph.sizes_ubo(out_tensor));
34+
ubos.append(
35+
{graph.sizes_ubo(out_tensor), graph.axis_mapping_ubo(out_tensor)});
3536
}
3637

3738
graph.execute_nodes().emplace_back(new ExecuteNode(
@@ -69,7 +70,8 @@ void add_tensor_to_staging_node(
6970
graph.strides_ubo(in_tensor),
7071
graph.numel_ubo(in_tensor)});
7172
} else {
72-
ubos.append(graph.sizes_ubo(in_tensor));
73+
ubos.append(
74+
{graph.sizes_ubo(in_tensor), graph.axis_mapping_ubo(in_tensor)});
7375
}
7476

7577
// Normally, the image_to_nchw shader is structured so that each thread reads
@@ -113,7 +115,7 @@ ValueRef prepack(
113115
if (graph.is_buffer_storage(v)) {
114116
ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)});
115117
} else {
116-
ubos.append(graph.sizes_ubo(v));
118+
ubos.append({graph.sizes_ubo(v), graph.axis_mapping_ubo(v)});
117119
}
118120

119121
graph.prepack_nodes().emplace_back(new PrepackNode(

backends/vulkan/test/utils/test_utils.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ void record_nchw_to_image_op(
8585
vkapi::PipelineStage::COMPUTE,
8686
vkapi::MemoryAccessType::WRITE),
8787
src_buffer,
88-
v_dst.sizes_ubo());
88+
v_dst.sizes_ubo(),
89+
v_dst.axis_mapping_ubo());
8990
}
9091

9192
void record_image_to_nchw_op(
@@ -106,7 +107,8 @@ void record_image_to_nchw_op(
106107
0,
107108
dst_buffer,
108109
v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
109-
v_src.sizes_ubo());
110+
v_src.sizes_ubo(),
111+
v_src.axis_mapping_ubo());
110112
}
111113

112114
void record_int8_image_to_nchw_noint8_op(
@@ -127,6 +129,7 @@ void record_int8_image_to_nchw_noint8_op(
127129
dst_buffer.buffer(),
128130
v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
129131
v_src.sizes_ubo(),
132+
v_src.axis_mapping_ubo(),
130133
v_src.numel_ubo());
131134
}
132135

backends/vulkan/test/vulkan_compute_api_test.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1227,8 +1227,8 @@ TEST(VulkanComputeGraphTest, test_simple_graph) {
12271227
GraphConfig config;
12281228
ComputeGraph graph(config);
12291229

1230-
std::vector<int64_t> size_big = {8, 64, 124};
1231-
std::vector<int64_t> size_small = {8, 1, 124};
1230+
std::vector<int64_t> size_big = {1, 8, 8};
1231+
std::vector<int64_t> size_small = {1, 1, 8};
12321232

12331233
// Build graph
12341234

@@ -1409,8 +1409,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
14091409
/*shared_object_idx = */ 4);
14101410

14111411
// +2: t.sizes_ubo() for each staging shader
1412+
// +2: t.axis_mapping_ubo() for each staging shader
14121413
// +2: staging buffer for each input tensor
1413-
EXPECT_TRUE(get_vma_allocation_count() == 4);
1414+
EXPECT_TRUE(get_vma_allocation_count() == 6);
14141415

14151416
ValueRef c = graph.add_tensor(
14161417
size_big,
@@ -1427,8 +1428,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
14271428

14281429
// +2: alpha UBO, broadcast UBO for arithmetic shader
14291430
// +1: t.sizes_ubo() uniform buffer for staging shader
1431+
// +1: t.axis_mapping_ubo() uniform buffer for staging shader
14301432
// +1: staging buffer for the input tensor
1431-
EXPECT_TRUE(get_vma_allocation_count() == 9);
1433+
EXPECT_TRUE(get_vma_allocation_count() == 12);
14321434

14331435
ValueRef e = graph.add_tensor(
14341436
size_big,
@@ -1444,14 +1446,15 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
14441446

14451447
// +2: alpha UBO, broadcast UBO for arithmetic shader
14461448
// +1: t.sizes_ubo() for staging shader
1449+
// +1: t.axis_mapping_ubo() for staging shader
14471450
// +1 staging buffer for the input tensor
1448-
EXPECT_TRUE(get_vma_allocation_count() == 13);
1451+
EXPECT_TRUE(get_vma_allocation_count() == 17);
14491452

14501453
graph.prepare();
14511454
graph.encode_execute();
14521455

14531456
// +3: shared memory allocations for tensors
1454-
EXPECT_TRUE(get_vma_allocation_count() == 16);
1457+
EXPECT_TRUE(get_vma_allocation_count() == 20);
14551458

14561459
// Run graph
14571460

0 commit comments

Comments
 (0)