Skip to content

Commit f8c5a60

Browse files
Kush Rastogi authored and facebook-github-bot committed
Adding Tiled 2D and 3D Quantizer Linear Base Implementation (#5492)
Summary: Pull Request resolved: #5492 Adding Tiled Implementation of Weight-Only Quantized Linear operator This diff adds Texture Implementation, will add Buffer impl next. # Diff Stack 1. Add Tiled Implementation of Weight-Only Quantized Linear 2. Add Optimized Quantized Linear Shader and code to invoke shader from Quantized Linear CPP operator 3. [Will Not Land] Use Optimized Quantized Linear implementation Differential Revision: D61309097
1 parent 47f4f07 commit f8c5a60

File tree

2 files changed

+115
-0
lines changed

2 files changed

+115
-0
lines changed

backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
#version 450 core
1010

11+
#extension GL_EXT_control_flow_attributes : require
12+
1113
#define PRECISION ${PRECISION}
1214

1315
#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}

backends/vulkan/runtime/graph/ops/glsl/q_linear.h

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,24 @@
1111

1212
#include "indexing_utils.h"
1313

14+
// To convince the SPIR-V compiler to unroll the loops optimally, need this
15+
// macro
16+
#define FOUR 4
17+
18+
#ifdef TILE_ROW_2
19+
#define TILE_ROWS 2
20+
#else
21+
#define TILE_ROWS 4
22+
#endif
23+
24+
struct FloatMatrix_2d {
25+
float data[TILE_ROWS][FOUR];
26+
};
27+
28+
struct FloatMatrix_3d {
29+
float data[TILE_ROWS][FOUR][FOUR];
30+
};
31+
1432
// The functions in this file assume that some variables have been defined as
1533
// descriptors, such as t_mat1, t_qmat2, t_scales, etc.
1634

@@ -77,6 +95,101 @@ VEC4_T q_8w_linear(const ivec3 out_pos, const int K) {
7795
return outtex;
7896
}
7997

98+
/*
 * Computes a TILE_ROWS x 4 tile of the weight-only quantized (8-bit weight)
 * linear output for a 2D input. out_pos selects the tile; K is the reduction
 * length in texels. Reads t_mat1 (activations), t_qmat2 (quantized weights)
 * and t_scales (per-output-channel dequant scales), which must be declared
 * as descriptors by the including shader.
 */
FloatMatrix_2d q_8w_linear_optimized_2d(const ivec3 out_pos, const int K) {
  FloatMatrix_2d acc;

  // Zero the accumulator tile.
  [[unroll]] for (int r = 0; r < TILE_ROWS; r++) {
    [[unroll]] for (int c = 0; c < FOUR; c++) {
      acc.data[r][c] = 0.0f;
    }
  }

  VEC4_T mat1_tile[TILE_ROWS];
  ivec4 qmat2_tile[FOUR];

  for (int k = 0; k < K; k++) {
    // Cache a TILE_ROWS x 4 strip of the activations at reduction step k.
    [[unroll]] for (int r = 0; r < TILE_ROWS; r++) {
      const ivec3 mat1_pos = ivec3(k, out_pos.y * TILE_ROWS + r, 0);
      mat1_tile[r] = load_texel(t_mat1, mat1_pos);
    }
    // Cache the matching 4x4 tile of the quantized weights.
    [[unroll]] for (int c = 0; c < FOUR; c++) {
      const ivec3 qmat2_pos = ivec3(k, (FOUR * out_pos.x) + c, 0);
      qmat2_tile[c] = load_texel(t_qmat2, qmat2_pos);
    }

    // Accumulate the outer product of the two cached strips.
    [[unroll]] for (int r = 0; r < TILE_ROWS; r++) {
      [[unroll]] for (int c = 0; c < FOUR; c++) {
        acc.data[r][c] += dot(mat1_tile[r], qmat2_tile[c]);
      }
    }
  }

  // Apply the per-output-channel dequantization scales.
  const VEC4_T scales = load_texel(t_scales, ivec3(out_pos.x, 0, 0));
  [[unroll]] for (int r = 0; r < TILE_ROWS; r++) {
    [[unroll]] for (int c = 0; c < FOUR; c++) {
      acc.data[r][c] *= scales[c];
    }
  }
  return acc;
}
136+
137+
/*
 * Computes a TILE_ROWS x 4 output tile for up to FOUR batch entries of a
 * batched (3D) weight-only quantized linear. out_pos.z tiles the batch
 * dimension, so this invocation covers batches
 * [FOUR * out_pos.z, FOUR * out_pos.z + 3]; K is the reduction length in
 * texels and batch_size the total number of batches. Reads t_mat1, t_qmat2
 * and t_scales, which must be declared as descriptors by the including
 * shader.
 */
FloatMatrix_3d q_8w_linear_optimized_3d(
    const ivec3 out_pos,
    const int K,
    const int batch_size) {
  FloatMatrix_3d results;

  [[unroll]] for (int i = 0; i < TILE_ROWS; i++) {
    [[unroll]] for (int j = 0; j < FOUR; j++) {
      [[unroll]] for (int k = 0; k < FOUR; k++) {
        results.data[i][j][k] = 0.0f;
      }
    }
  }

  VEC4_T im_mat1_partial_load[TILE_ROWS];
  ivec4 im_mat2_partial_load[FOUR];

  const VEC4_T scales = load_texel(t_scales, ivec3(out_pos.x, 0, 0));

  // The accumulator holds only FOUR batch slots, so iterate at most FOUR
  // times. Bounding by batch_size instead would write results.data[..][..]
  // out of bounds whenever batch_size > 4 (e.g. out_pos.z == 0,
  // batch_size == 8 reaches batch_idx == 7 against a 4-slot array).
  for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) {
    // Tail guard: stop early when this tile extends past the last batch.
    if (FOUR * out_pos.z + batch_idx >= batch_size) {
      break;
    }
    int mat_z = FOUR * out_pos.z + batch_idx;
    for (int mat1_x = 0; mat1_x < K; mat1_x++) {
      [[unroll]] for (int offset = 0; offset < TILE_ROWS; offset++) {
        // read and cache 2x4 (or 4x4) tile of im_mat1
        const int mat1_y = (TILE_ROWS * out_pos.y) + offset;
        const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, mat_z);
        im_mat1_partial_load[offset] = load_texel(t_mat1, mat1_pos);
      }

      [[unroll]] for (int offset = 0; offset < FOUR; offset++) {
        // read and cache 4x4 tile of im_mat2 (weights are shared across
        // batches, hence z == 0)
        const int mat2_y = (FOUR * out_pos.x) + offset;
        const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0);
        im_mat2_partial_load[offset] = load_texel(t_qmat2, mat2_pos);
      }

      [[unroll]] for (int out_row = 0; out_row < TILE_ROWS; out_row++) {
        [[unroll]] for (int out_col = 0; out_col < FOUR; out_col++) {
          results.data[out_row][out_col][batch_idx] +=
              dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]);
        }
      }
    }

    // Apply per-output-channel dequantization scales for this batch slot.
    [[unroll]] for (int i = 0; i < TILE_ROWS; i++) {
      [[unroll]] for (int j = 0; j < FOUR; j++) {
        results.data[i][j][batch_idx] *= scales[j];
      }
    }
  }
  return results;
}
192+
80193
#endif // USING_BUFFER
81194

82195
#endif // Q_LINEAR_H

0 commit comments

Comments
 (0)