Update on "[ET-VK] Modify quantized linear tiling shader to linearly dispatch work to improve thread occupancy and performance."

trivedivivek · trivedivivek · commit 6bac0171ba36 · 2025-04-28T10:52:20.000-07:00
This diff changes the tiled 8-bit quantized linear matmul op to dispatch work linearly, which increases thread occupancy and improves performance. Differential Revision: [D73751979](https://our.internmc.facebook.com/intern/diff/D73751979/) [ghstack-poisoned]
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl
@@ -38,18 +38,21 @@ layout(push_constant) uniform restrict Block {
   ivec4 weight_sizes;
 };
 
+#include "indexing_utils.h"
+
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 shared VEC4_T partial_c[NGROUPS][NWORKERS][TILE_ROWS];
 
 void main() {
-  const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS;
-  const uint out_col = gl_GlobalInvocationID.x << 2;
+  const uint out_width_ntexels = divup4(out_sizes.x);
+  const uint out_col = (gl_GlobalInvocationID.x % out_width_ntexels) << 2;
+  const uint out_row = (gl_GlobalInvocationID.x / out_width_ntexels) * TILE_ROWS;
 
   const int gid = int(gl_LocalInvocationID.x); // group id
   const int wid = int(gl_LocalInvocationID.z); // worker id
 
-  if (out_col >= out_sizes.x || out_row >= out_sizes.y) {
+  if (out_row >= out_sizes.y) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl
@@ -41,9 +41,9 @@ layout(push_constant) uniform restrict Block {
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 void main() {
-  const uint out_size_x_div_4 = divup4(out_sizes.x);
-  const uint out_col = (gl_GlobalInvocationID.x % out_size_x_div_4) << 2;
-  const uint out_row = (gl_GlobalInvocationID.x / out_size_x_div_4) * TILE_ROWS;
+  const uint out_width_ntexels = divup4(out_sizes.x);
+  const uint out_col = (gl_GlobalInvocationID.x % out_width_ntexels) << 2;
+  const uint out_row = (gl_GlobalInvocationID.x / out_width_ntexels) * TILE_ROWS;
 
   if (out_row >= out_sizes.y) {
     return;
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp
@@ -195,12 +195,11 @@ void add_q_8w_linear_tiled_node(
     out_tile_nrows = 4;
   }
 
-  utils::uvec3 global_wg_size = graph.logical_limits_of(out);
-  global_wg_size[1] = global_wg_size[1] / out_tile_nrows;
-  if (!use_coop_algorithm) {
-    global_wg_size[0] *= global_wg_size[1];
-    global_wg_size[1] = 1;
-  }
+  utils::uvec3 out_limits = graph.logical_limits_of(out);
+  utils::uvec3 global_wg_size = {
+      out_limits[0] * (utils::div_up(out_limits[1], out_tile_nrows)),
+      1, // row tiles are folded into x; shaders recover out_row from x
+      out_limits[2]};
 
   utils::uvec3 local_wg_size{64, 1, 1};
   if (use_coop_algorithm) {