metal : use dequantize_q templates

ggerganov · ggerganov · commit be1542e710eb · 2025-02-22T11:51:36.000+02:00
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -84,7 +84,6 @@ typedef struct {
 } ggml_metal_kargs_repeat;
 
 typedef struct {
-    int64_t  ne;
     int64_t  ne00;
     int64_t  ne01;
     int64_t  ne02;
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
@@ -408,10 +408,15 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,
     GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
     GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F16,
     GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F16,
     GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F16,
     GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F16,
     GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F32,
+    GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F16,
     GGML_METAL_KERNEL_TYPE_CONCAT,
     GGML_METAL_KERNEL_TYPE_SQR,
     GGML_METAL_KERNEL_TYPE_SQRT,
@@ -1018,10 +1023,15 @@ @implementation GGMLMetalClass
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,                  cpy_f32_q5_1,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,                cpy_f32_iq4_nl,                 true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F32,                  cpy_q4_0_f32,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F16,                  cpy_q4_0_f16,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F32,                  cpy_q4_1_f32,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F16,                  cpy_q4_1_f16,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F32,                  cpy_q5_0_f32,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F16,                  cpy_q5_0_f16,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F32,                  cpy_q5_1_f32,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F16,                  cpy_q5_1_f16,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F32,                  cpy_q8_0_f32,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F16,                  cpy_q8_0_f16,                   true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT,                        concat,                         true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR,                           sqr,                            true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT,                          sqrt,                           true);
@@ -1302,7 +1312,13 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                     case GGML_TYPE_Q5_0:
                     case GGML_TYPE_Q5_1:
                     case GGML_TYPE_Q8_0:
-                        return (op->type == GGML_TYPE_F32);
+                        switch (op->type) {
+                            case GGML_TYPE_F32:
+                            case GGML_TYPE_F16:
+                                return true;
+                            default:
+                                return false;
+                        }
                     default:
                         return false;
                 };
@@ -1631,10 +1647,7 @@ static void ggml_metal_encode_node(
 
                     const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
 
-                    const int64_t ne = ggml_nelements(src0);
-
                     ggml_metal_kargs_cpy args = {
-                        /*.ne   =*/ ne,
                         /*.ne00 =*/ ne00,
                         /*.ne01 =*/ ne01,
                         /*.ne02 =*/ ne02,
@@ -3918,7 +3931,6 @@ static void ggml_metal_encode_node(
         case GGML_OP_CPY:
         case GGML_OP_CONT:
             {
-                const int64_t ne = ggml_nelements(src0);
                 id<MTLComputePipelineState> pipeline = nil;
 
                 switch (src0t) {
@@ -3956,29 +3968,49 @@ static void ggml_metal_encode_node(
                             };
                         } break;
                     case GGML_TYPE_Q4_0:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F32].pipeline; break;
+                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F16].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
                     case GGML_TYPE_Q4_1:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F32].pipeline; break;
+                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F16].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
                     case GGML_TYPE_Q5_0:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F32].pipeline; break;
+                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F16].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
                     case GGML_TYPE_Q5_1:
+                        {
+                            switch (dstt) {
+                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F32].pipeline; break;
+                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F16].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
+                        } break;
                     case GGML_TYPE_Q8_0:
                         {
-                           if (dstt == GGML_TYPE_F32) {
-                                switch (src0t) {
-                                    case GGML_TYPE_Q4_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_0_F32].pipeline; break;
-                                    case GGML_TYPE_Q4_1:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q4_1_F32].pipeline; break;
-                                    case GGML_TYPE_Q5_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_0_F32].pipeline; break;
-                                    case GGML_TYPE_Q5_1:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q5_1_F32].pipeline; break;
-                                    case GGML_TYPE_Q8_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F32].pipeline; break;
-                                    default: GGML_ABORT("not implemented");
-                                }
-                           } else {
-                                GGML_ABORT("not implemented");
-                           }
+                            switch (dstt) {
+                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F32].pipeline; break;
+                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_Q8_0_F16].pipeline; break;
+                                default: GGML_ABORT("not implemented");
+                            };
                         } break;
                     default: GGML_ABORT("not implemented");
                 }
 
                 ggml_metal_kargs_cpy args = {
-                    /*.ne   =*/ ne,
                     /*.ne00 =*/ ne00,
                     /*.ne01 =*/ ne01,
                     /*.ne02 =*/ ne02,
@@ -4002,19 +4034,9 @@ static void ggml_metal_encode_node(
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
 
-                int nth;
+                GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+                int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
 
-                if (   src0t == GGML_TYPE_Q4_0
-                    || src0t == GGML_TYPE_Q4_1
-                    || src0t == GGML_TYPE_Q5_0
-                    || src0t == GGML_TYPE_Q5_1
-                    || src0t == GGML_TYPE_Q8_0) {
-                    GGML_ASSERT(dstt == GGML_TYPE_F32);
-                    nth = MIN(1024, ne);
-                } else {
-                    GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
-                    nth = MIN(1024, ne00/ggml_blck_size(src0->type));
-                }
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
 
             } break;
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4341,6 +4341,49 @@ kernel void kernel_cpy_f32_iq4_nl(
     }
 }
 
+template<typename T4x4, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)> // T4x4: 16-value output tile; nl: 16-value parts per block_q (2 in every instantiation below)
+kernel void kernel_cpy_q_f32( // copies quantized src rows into dst tiles; despite the _f32 suffix it is also instantiated with half4x4 for the *_f16 host names
+        constant ggml_metal_kargs_cpy & args, // element counts (ne*) and byte strides (nb*) of src and dst
+        device  const char * src0, // quantized source, addressed in bytes
+        device        char * dst, // destination, addressed in bytes
+        uint3   tgpig[[threadgroup_position_in_grid]], // (i01, i02, i03): the src row handled by this threadgroup
+        ushort3 tpitg[[thread_position_in_threadgroup]], // .x is this thread's first 16-value chunk within the row
+        ushort3   ntg[[threads_per_threadgroup]]) { // .x is the per-thread stride over chunks
+    const int i03 = tgpig[2];
+    const int i02 = tgpig[1];
+    const int i01 = tgpig[0];
+    // flat element index of the first element of src row (i01, i02, i03)
+    const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
+    // unflatten n into dst coordinates (i0..i3) using the dst extents ne0..ne2
+    const int64_t i3 = n/(args.ne2*args.ne1*args.ne0);
+    const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
+    const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
+    const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
+    // base pointers: src advanced by the row strides nb01..nb03, dst by nb0..nb3
+    device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
+    device       T4x4    * dst_data = (device       T4x4    *)(dst  +  i3*args.nb3  +  i2*args.nb2  +  i1*args.nb1 + i0*args.nb0);
+    // threads of the group share the row's ne00/16 chunks, each stepping by ntg.x
+    for (int64_t i00 = tpitg.x; i00 < args.ne00/16; i00 += ntg.x) {
+        T4x4 temp;
+        dequantize_func(src_data + i00/nl, i00%nl, temp); // block i00/nl, part i00%nl; presumably fills temp with that part's 16 dequantized values (helper defined outside this hunk)
+        dst_data[i00] = temp; // chunks land in consecutive T4x4 slots starting at dst_data
+    }
+}
+
+typedef decltype(kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>) cpy_q_f_t; // function type reused to declare every instantiation below
+// instantiations that write float4x4 tiles (host names *_f32)
+template [[host_name("kernel_cpy_q4_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_cpy_q4_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_cpy_q5_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_cpy_q5_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_1, 2, dequantize_q5_1>;
+template [[host_name("kernel_cpy_q8_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q8_0, 2, dequantize_q8_0>;
+// instantiations that write half4x4 tiles (host names *_f16)
+template [[host_name("kernel_cpy_q4_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_cpy_q4_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_1, 2, dequantize_q5_1>;
+template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q8_0, 2, dequantize_q8_0>;
+
 kernel void kernel_concat(
     constant ggml_metal_kargs_concat & args,
     device  const char * src0,
@@ -4372,150 +4415,6 @@ kernel void kernel_concat(
     }
 }
 
-template<typename block_q, short qqk, void (*dequantize_func)(device const block_q *, device float *)>
-kernel void kernel_cpy_q_f32(
-    constant ggml_metal_kargs_cpy & args,
-    device const char * cx   [[ buffer(1) ]],
-    device       char * cdst [[ buffer(2) ]],
-    uint tid                 [[ thread_position_in_grid ]]
-)
-{
-    // Compute the global index multiplied by QK, matching:
-    // i = (blockDim.x*blockIdx.x + threadIdx.x)*qk
-    const int i = int(tid) * qqk;
-
-    // Bounds check
-    if (i >= args.ne) {
-        return;
-    }
-
-    const int i03 = i/(args.ne00 * args.ne01 * args.ne02);
-    const int i02 = (i - i03*args.ne00*args.ne01*args.ne02 )/ (args.ne00*args.ne01);
-    const int i01 = (i - i03*args.ne00*args.ne01*args.ne02  -  i02*args.ne01*args.ne00) / args.ne00;
-    const int i00 = i - i03*args.ne00*args.ne01*args.ne02 - i02*args.ne01*args.ne00 - i01*args.ne00;
-    const int x_offset = (i00/qqk)*args.nb00 + i01*args.nb01 + i02*args.nb02 + i03 * args.nb03;
-
-    const int i13 = i/(args.ne0 * args.ne1 * args.ne2);
-    const int i12 = (i - i13*args.ne0*args.ne1*args.ne2) / (args.ne0*args.ne1);
-    const int i11 = (i - i13*args.ne0*args.ne1*args.ne2 - i12*args.ne0*args.ne1) / args.ne0;
-    const int i10 = i - i13*args.ne0*args.ne1*args.ne2 - i12*args.ne0*args.ne1 - i11*args.ne0;
-    const int dst_offset = i10*args.nb0 + i11*args.nb1 + i12*args.nb2 + i13*args.nb3;
-
-    device const block_q * src_block = (device const block_q *)(cx + x_offset);
-    device float * dst = (device float *)(cdst + dst_offset);
-
-    dequantize_func(src_block, dst);
-}
-
-void dequant_q4_0_f(device const block_q4_0 * src_block, device float * dst) {
-    float d = float(src_block->d);
-    const float shift = 8.0f;
-
-    // Unpack 2 x 4-bit values per byte.
-    #pragma unroll(16)
-    for (int j = 0; j < QK4_0/2; j++) {
-        uint8_t q = src_block->qs[j];
-        uint8_t q0 = q & 0x0F;
-        uint8_t q1 = (q >> 4) & 0x0F;
-        dst[j]             = (float(q0) - shift) * d;
-        dst[j + QK4_0/2]   = (float(q1) - shift) * d;
-    }
-}
-
-void dequant_q4_1_f(device const block_q4_1 * src_block, device float * dst) {
-    float d = float(src_block->d);
-    float vmin = float(src_block->m);
-
-    #pragma unroll(16)
-    for (int j = 0; j < QK4_1/2; j++) {
-        uint8_t q = src_block->qs[j];
-        uint8_t q0 = q & 0x0F;
-        uint8_t q1 = (q >> 4) & 0x0F;
-        dst[j]             = vmin + d * float(q0);
-        dst[j + QK4_1/2]   = vmin + d * float(q1);
-    }
-}
-
-void dequant_q5_0_f(device const block_q5_0 * src_block, device float * dst) {
-    float d = float(src_block->d);
-    const float shift = 16.f;
-
-    // Combine the four qh bytes into a 32-bit value.
-    uint32_t qhVal = 0
-         | ((uint32_t) src_block->qh[0] <<  0)
-         | ((uint32_t) src_block->qh[1] <<  8)
-         | ((uint32_t) src_block->qh[2] << 16)
-         | ((uint32_t) src_block->qh[3] << 24);
-
-    // First half
-    #pragma unroll(16)
-    for (int j = 0; j < QK5_0/2; j++) {
-        uint8_t q = src_block->qs[j];
-        uint8_t lowNib = q & 0x0F;
-        uint8_t highBit = (qhVal >> j) & 0x1;
-        uint8_t qVal = (highBit << 4) | lowNib;
-        dst[j] = (float(qVal) - shift) * d;
-    }
-    // Second half
-    #pragma unroll(16)
-    for (int j = QK5_0/2; j < QK5_0; j++) {
-        int k = j - QK5_0/2;
-        uint8_t q = src_block->qs[k];
-        uint8_t hiNib = (q >> 4) & 0x0F;
-        uint8_t highBit = (qhVal >> j) & 0x1;
-        uint8_t qVal = (highBit << 4) | hiNib;
-        dst[j] = (float(qVal) - shift) * d;
-    }
-}
-
-void dequant_q5_1_f(device const block_q5_1 * src_block, device float * dst) {
-    float d = float(src_block->d);
-    float vmin = float(src_block->m);
-
-    uint32_t qhVal = 0
-         | ((uint32_t) src_block->qh[0] <<  0)
-         | ((uint32_t) src_block->qh[1] <<  8)
-         | ((uint32_t) src_block->qh[2] << 16)
-         | ((uint32_t) src_block->qh[3] << 24);
-
-    // First half
-    #pragma unroll(16)
-    for (int j = 0; j < QK5_1/2; j++) {
-        uint8_t q = src_block->qs[j];
-        uint8_t lowNib = q & 0x0F;
-        uint8_t highBit = (qhVal >> j) & 0x1;
-        uint8_t qVal = (highBit << 4) | lowNib;
-        dst[j] = vmin + d * float(qVal);
-    }
-    // Second half
-    #pragma unroll(16)
-    for (int j = QK5_1/2; j < QK5_1; j++) {
-        int k = j - QK5_1/2;
-        uint8_t q = src_block->qs[k];
-        uint8_t hiNib = (q >> 4) & 0x0F;
-        uint8_t highBit = (qhVal >> j) & 0x1;
-        uint8_t qVal = (highBit << 4) | hiNib;
-        dst[j] = vmin + d * float(qVal);
-    }
-}
-
-void dequant_q8_0_f(device const block_q8_0 * src_block, device float * dst) {
-    const float d = (float)src_block->d;
-
-    #pragma unroll(32)
-    for (int j = 0; j < QK8_0; j++) {
-       dst[j] = src_block->qs[j] * d;
-    }
-}
-
-typedef decltype(kernel_cpy_q_f32<block_q4_0, QK4_0, dequant_q4_0_f>) cpy_q_t;
-
-template [[host_name("kernel_cpy_q4_0_f32")]] kernel cpy_q_t kernel_cpy_q_f32<block_q4_0, QK4_0, dequant_q4_0_f>;
-template [[host_name("kernel_cpy_q4_1_f32")]] kernel cpy_q_t kernel_cpy_q_f32<block_q4_1, QK4_1, dequant_q4_1_f>;
-template [[host_name("kernel_cpy_q5_0_f32")]] kernel cpy_q_t kernel_cpy_q_f32<block_q5_0, QK5_0, dequant_q5_0_f>;
-template [[host_name("kernel_cpy_q5_1_f32")]] kernel cpy_q_t kernel_cpy_q_f32<block_q5_1, QK5_1, dequant_q5_1_f>;
-template [[host_name("kernel_cpy_q8_0_f32")]] kernel cpy_q_t kernel_cpy_q_f32<block_q8_0, QK8_0, dequant_q8_0_f>;
-
 template<typename args_t>
 void kernel_mul_mv_q2_K_f32_impl(
         args_t args,