vulkan: initial support for IQ3_XXS

remyoudompheng · remyoudompheng · commit c03579df408f · 2025-01-23T21:14:31.000+01:00
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp
@@ -88,6 +88,54 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
 }
 #endif
 
+#if defined(DATA_A_IQ3_XXS)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) { // IQ3_XXS: returns the two values at position iqs of block ib, scaled by the sub-block scale only
+    const uint ib4 = iqs / 4; // index of the grid-index byte in qs (one byte selects a 4-value grid entry)
+    const uint ib32 = iqs / 32; // index of the 32-value sub-block
+    const uint is = QUANT_K / 4 + 4 * ib32; // byte offset of this sub-block's sign/scale word, located after the QUANT_K / 4 grid-index bytes
+    const uint qs = data_a[a_offset + ib].qs[ib4];
+    // Each sub-block has one 32-bit word packed as 7+7+7+7+4 bits: four 7-bit sign fields in bits 0..27 and a 4-bit scale in bits 28..31
+    const uint signs = pack32(u16vec2(data_a_packed16[a_offset + ib].qs[is / 2], // the word is assembled from two 16-bit halves
+        data_a_packed16[a_offset + ib].qs[is / 2 + 1]));
+    const float db = 0.5 * (0.5 + (signs >> 28)); // sub-block scale from the top 4 bits; data_a[].d is not applied here (get_dm() returns it separately)
+    const uint sign7 = bitfieldExtract(signs, 7 * (int(ib4 / 2) % 4), 7); // 7-bit sign field of the 8-value group containing iqs
+    // Rebuild the eighth sign bit: the low bit of the population count (the parity) lands in bit 7, higher count bits land in bits 8..9
+    const uint sign8 = sign7 | (bitCount(sign7) << 7);
+    const uint sign = sign8 >> (iqs % 8); // shift so that bit 0 and bit 1 are the signs of the two returned values
+    const u8vec4 grid = unpack8(iq3xxs_grid[qs] >> (8 * (iqs % 4))); // drop the first iqs % 4 bytes of the grid entry so .x/.y are the magnitudes at iqs
+    bool sign0 = (sign & 1) != 0; // a set sign bit means the value is negative
+    bool sign1 = (sign & 2) != 0;
+    return db * vec2(
+        grid.x * (sign0 ? -1.0 : 1.0),
+        grid.y * (sign1 ? -1.0 : 1.0)
+    );
+}
+vec4 dequantize4(uint ib, uint iqs, uint a_offset) { // IQ3_XXS: returns four values starting at position iqs of block ib, scaled by the sub-block scale only
+    const uint ib4 = iqs / 4; // index of the grid-index byte in qs (one byte selects a 4-value grid entry)
+    const uint ib32 = iqs / 32; // index of the 32-value sub-block
+    const uint is = QUANT_K / 4 + 4 * ib32; // byte offset of this sub-block's sign/scale word, located after the QUANT_K / 4 grid-index bytes
+    const uint qs = data_a[a_offset + ib].qs[ib4];
+    const uint signs = pack32(u16vec2(data_a_packed16[a_offset + ib].qs[is / 2], // 32-bit sign/scale word assembled from two 16-bit halves
+        data_a_packed16[a_offset + ib].qs[is / 2 + 1]));
+    const float db = 0.5 * (0.5 + (signs >> 28)); // sub-block scale from the top 4 bits; data_a[].d is not applied here (get_dm() returns it separately)
+    const uint sign7 = bitfieldExtract(signs, 7 * (int(ib4 / 2) % 4), 7); // 7-bit sign field of the 8-value group containing iqs
+    // Rebuild the eighth sign bit: the low bit of the population count (the parity) lands in bit 7, higher count bits land in bits 8..9
+    const uint sign8 = sign7 | (bitCount(sign7) << 7);
+    const uint sign = sign8 >> (iqs % 8); // shift so that bits 0..3 are the signs of the four returned values
+    const u8vec4 grid = unpack8(iq3xxs_grid[qs]); // the whole grid entry is used unshifted -- assumes iqs is a multiple of 4; TODO confirm against callers
+    bool sign0 = (sign & 1) != 0; // a set sign bit means the value is negative
+    bool sign1 = (sign & 2) != 0;
+    bool sign2 = (sign & 4) != 0;
+    bool sign3 = (sign & 8) != 0;
+    return db * vec4(
+        grid.x * (sign0 ? -1.0 : 1.0),
+        grid.y * (sign1 ? -1.0 : 1.0),
+        grid.z * (sign2 ? -1.0 : 1.0),
+        grid.w * (sign3 ? -1.0 : 1.0)
+    );
+}
+#endif
+
 #if defined(DATA_A_IQ3_S)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     const uint qs = data_a[a_offset + ib].qs[iqs / 4];
@@ -142,7 +190,7 @@ vec2 get_dm(uint ib, uint a_offset) {
 }
 #endif
 
-#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
+#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_NL)
 vec2 get_dm(uint ib, uint a_offset) {
     return vec2(float(data_a[a_offset + ib].d), 0);
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp
@@ -0,0 +1,49 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_iq3_xxs data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() { // dequantizes IQ3_XXS super-blocks from data_a into data_b
+    // Each thread handles 1 scale block (32 values)
+    // 8 threads handle 1 superblock
+    const uint ib = gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x / 8; // 256 invocations / 8 = 32 super-blocks per workgroup
+
+    init_iq3xxs_shmem(); // called before the early return below so that every invocation reaches the barrier() inside it
+
+    if (ib >= p.nel / 256) { // presumably p.nel is the total element count, making this the super-block count -- TODO confirm
+        return;
+    }
+
+    const uint is = gl_LocalInvocationID.x % 8; // sub-block index within the super-block, 0..7
+    const uint b_idx = 256 * ib + 32 * is; // output index of this sub-block's first value
+    const uint s_idx = QUANT_K / 4 + 4 * is; // byte offset of this sub-block's sign/scale word, after the QUANT_K / 4 grid-index bytes
+
+    const float d = float(data_a[ib].d);
+    uint signscale = pack32(u8vec4( // 32-bit word: four 7-bit sign fields in bits 0..27, 4-bit scale in bits 28..31
+        data_a[ib].qs[s_idx + 0],
+        data_a[ib].qs[s_idx + 1],
+        data_a[ib].qs[s_idx + 2],
+        data_a[ib].qs[s_idx + 3]
+    ));
+    const float db = d * (0.5 + (signscale >> 28)) * 0.5; // super-block scale times the sub-block scale from the top 4 bits
+
+    [[unroll]] for (uint l = 0; l < 4; ++l) { // four groups of 8 values per sub-block
+        const uint sign7 = bitfieldExtract(signscale, 7 * int(l), 7); // 7 stored sign bits of group l
+        // Restore parity bit. (Higher bits of the population count land in bits 8..9 and are never tested below.)
+        const uint sign8 = sign7 | (bitCount(sign7) << 7);
+        const u8vec4 grid0 = unpack8(iq3xxs_grid[data_a[ib].qs[8 * is + 2 * l]]); // magnitudes of values 0..3 of the group
+        const u8vec4 grid1 = unpack8(iq3xxs_grid[data_a[ib].qs[8 * is + 2 * l + 1]]); // magnitudes of values 4..7 of the group
+        data_b[b_idx + 8 * l + 0] = D_TYPE(db * grid0.x * ((sign8 & 1) != 0 ? -1.0 : 1.0)); // a set sign bit negates the value
+        data_b[b_idx + 8 * l + 1] = D_TYPE(db * grid0.y * ((sign8 & 2) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 2] = D_TYPE(db * grid0.z * ((sign8 & 4) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 3] = D_TYPE(db * grid0.w * ((sign8 & 8) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 4] = D_TYPE(db * grid1.x * ((sign8 & 16) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 5] = D_TYPE(db * grid1.y * ((sign8 & 32) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 6] = D_TYPE(db * grid1.z * ((sign8 & 64) != 0 ? -1.0 : 1.0));
+        data_b[b_idx + 8 * l + 7] = D_TYPE(db * grid1.w * ((sign8 & 128) != 0 ? -1.0 : 1.0));
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp
@@ -12,7 +12,9 @@ void main() {
     const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
     const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
 
-#if defined(DATA_A_IQ3_S)
+#if defined(DATA_A_IQ3_XXS)
+    init_iq3xxs_shmem();
+#elif defined(DATA_A_IQ3_S)
     init_iq3s_shmem();
 #elif defined(DATA_A_IQ4_NL)
     init_iq4nl_shmem();
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
@@ -133,7 +133,9 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
 void main() {
     const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
 
-#if defined(DATA_A_IQ3_S)
+#if defined(DATA_A_IQ3_XXS)
+    init_iq3xxs_shmem();
+#elif defined(DATA_A_IQ3_S)
     init_iq3s_shmem();
 #elif defined(DATA_A_IQ4_NL)
     init_iq4nl_shmem();
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
@@ -95,7 +95,9 @@ shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
 #endif
 
 void main() {
-#if defined(DATA_A_IQ3_S)
+#if defined(DATA_A_IQ3_XXS)
+    init_iq3xxs_shmem();
+#elif defined(DATA_A_IQ3_S)
     init_iq3s_shmem();
 #elif defined(DATA_A_IQ4_NL)
     init_iq4nl_shmem();
@@ -441,6 +443,31 @@ void main() {
 
             buf_a[buf_idx    ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32));
             buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
+#elif defined(DATA_A_IQ3_XXS)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                  // super-block index: 2 values per idx, 128 idx per 256-value block
+            const uint iqs = (idx % 128) / 2;           // grid-index byte, 0..63 (one byte covers 4 values = 2 idx)
+            const uint is = QUANT_K / 4 + 4 * (iqs / 8); // byte offset of the sign/scale word shared by 8 grid bytes (32 values)
+
+            const float d = float(data_a[ib].d);
+            const uint qs = data_a[ib].qs[iqs];
+            const uint signs = pack32(u8vec4( // 32-bit word: four 7-bit sign fields in bits 0..27, 4-bit scale in bits 28..31
+                data_a[ib].qs[is+0],
+                data_a[ib].qs[is+1],
+                data_a[ib].qs[is+2],
+                data_a[ib].qs[is+3]
+            ));
+            const float db = d * 0.5 * (0.5 + (signs >> 28)); // super-block scale times the sub-block scale from the top 4 bits
+            const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7); // 7 stored sign bits of the 8-value group containing this pair
+            const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); // rebuild bit 7 from the parity, then shift so bits 0/1 are this pair's signs
+            const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); // per component: +1 if the sign bit is clear, -1 if set
+            const uint grid = iq3xxs_grid[qs] >> (16 * (idx & 1)); // low or high 16 bits of the grid entry, chosen by the parity of idx
+            const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
 #elif defined(DATA_A_IQ3_S)
             const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
             const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
@@ -106,7 +106,9 @@ D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem
 #endif
 
 void main() {
-#if defined(DATA_A_IQ3_S)
+#if defined(DATA_A_IQ3_XXS)
+    init_iq3xxs_shmem();
+#elif defined(DATA_A_IQ3_S)
     init_iq3s_shmem();
 #elif defined(DATA_A_IQ4_NL)
     init_iq4nl_shmem();
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp
@@ -294,6 +294,77 @@ struct block_q6_K_packed16
 
 // IQuants
 
+#define QUANT_K_IQ3_XXS 256
+#define QUANT_R_IQ3_XXS 1
+
+struct block_iq3_xxs // one IQ3_XXS super-block of QUANT_K_IQ3_XXS values
+{
+    float16_t d; // super-block scale
+    uint8_t qs[QUANT_K_IQ3_XXS/4 + QUANT_K_IQ3_XXS/8]; // QUANT_K/4 grid-index bytes (4 values each), then QUANT_K/8 bytes of packed 32-bit sign/scale words
+};
+
+struct block_iq3_xxs_packed16 // same byte layout as block_iq3_xxs, with qs exposed as 16-bit words
+{
+    float16_t d; // super-block scale
+    uint16_t qs[QUANT_K_IQ3_XXS/8 + QUANT_K_IQ3_XXS/16]; // element i covers bytes 2*i and 2*i+1 of block_iq3_xxs.qs
+};
+
+#if defined(DATA_A_IQ3_XXS)
+
+const uint32_t iq3xxs_grid_const[256] = { // IQ3_XXS grid: each entry packs four 8-bit magnitudes, selected by one grid-index byte of qs
+    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
+    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
+    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
+    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
+    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
+    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
+    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
+    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
+    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
+    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
+    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
+    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
+    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
+    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
+    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
+    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
+    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
+    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
+    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
+    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
+    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
+    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
+    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
+    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
+    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
+    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
+    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
+    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
+    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
+    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
+    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
+    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
+};
+
+shared uint32_t iq3xxs_grid[256]; // workgroup-shared copy of iq3xxs_grid_const; must have the same number of entries
+
+void init_iq3xxs_shmem() // must be called by every invocation of the workgroup, since it ends in barrier()
+{
+    // The first 32 invocations copy the table into shared memory (stride 32), then the whole workgroup syncs.
+    if (gl_LocalInvocationIndex.x < 32) { // NOTE(review): with fewer than 32 invocations per workgroup part of the table would stay uninitialized
+        for (uint i = gl_LocalInvocationIndex.x; i < iq3xxs_grid.length(); i += 32) { // bounded by the table size so iq3xxs_grid_const is never indexed past its last entry
+            iq3xxs_grid[i] = iq3xxs_grid_const[i];
+        }
+    }
+    barrier();
+}
+
+#define QUANT_K QUANT_K_IQ3_XXS
+#define QUANT_R QUANT_R_IQ3_XXS
+#define A_TYPE block_iq3_xxs
+#define A_TYPE_PACKED16 block_iq3_xxs_packed16
+#endif
+
 #define QUANT_K_IQ3_S 256
 #define QUANT_R_IQ3_S 1
 
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -55,6 +55,7 @@ const std::vector<std::string> type_names = {
     "q4_k",
     "q5_k",
     "q6_k",
+    "iq3_xxs",
     "iq3_s",
     "iq4_nl"
 };
@@ -312,6 +313,7 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool
 
     for (const auto& tname : type_names) {
         if (tname == "iq3_s" && coopmat2) continue;
+        if (tname == "iq3_xxs" && coopmat2) continue;
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
         // For unaligned, load one at a time for f32/f16, or two at a time for quants
         std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : "2";
@@ -367,7 +369,7 @@ void process_shaders() {
             if (tname == "f32") {
                 continue;
             }
-            if (tname == "iq3_s") {
+            if (tname == "iq3_s" || tname == "iq3_xxs") {
                 continue;
             }
 
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -3919,6 +3919,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_K,   GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
         test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q5_K,   GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
         test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q6_K,   GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
+        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_IQ3_XXS,GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
         test_cases.emplace_back(new test_mul_mat(GGML_TYPE_IQ3_S,  GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
         test_cases.emplace_back(new test_mul_mat(GGML_TYPE_IQ4_NL, GGML_TYPE_F32, 16,  i, 256, { 1,  1}, {1, 1}));
     }

Original file line number	Diff line number	Diff line change
`@@ -3919,6 +3919,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {`
`3919`	`3919`	`test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_K, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));`
`3920`	`3920`	`test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q5_K, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));`
`3921`	`3921`	`test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q6_K, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));`
	`3922`	`+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_IQ3_XXS,GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));`
`3922`	`3923`	`test_cases.emplace_back(new test_mul_mat(GGML_TYPE_IQ3_S, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));`
`3923`	`3924`	`test_cases.emplace_back(new test_mul_mat(GGML_TYPE_IQ4_NL, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));`
`3924`	`3925`	`}`