
Commit 5342dc0

tuning: support k_quants; disabled RoPE shapes (workaround); make cache thread safe; fixed shape comparison
1 parent 21e9379 commit 5342dc0

File tree

4 files changed: +85 −42 lines


examples/mulmat-tune/mulmat-tune.cpp

Lines changed: 2 additions & 2 deletions
@@ -170,8 +170,8 @@ int main(int argc, char **argv) {
             ftype = (enum ggml_ftype)v;
         }
 
-        if (ftype > GGML_FTYPE_MOSTLY_Q5_1) {
-            fprintf(stderr, "k_quants type %d is not implemented\n", ftype);
+        if (ftype == GGML_FTYPE_ALL_F32 || ftype == GGML_FTYPE_MOSTLY_F16) {
+            fprintf(stderr, "none quantized type %d is not supported\n", ftype);
             return 1;
         }
     }
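For reference, the bench tool now rejects non-quantized ftypes up front instead of rejecting k-quants. A minimal standalone sketch of the same gate as a predicate (the helper name is hypothetical; the enum comes from ggml.h):

#include <stdbool.h>
#include "ggml.h"

// Hypothetical helper mirroring the new check in main(): only quantized
// ftypes are benchable, plain F32/F16 model files are rejected.
static bool mulmat_tune_ftype_is_benchable(enum ggml_ftype ftype) {
    return ftype != GGML_FTYPE_ALL_F32 && ftype != GGML_FTYPE_MOSTLY_F16;
}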

ggml-tune.c

Lines changed: 66 additions & 28 deletions
@@ -4,10 +4,11 @@
 #include "ggml-tune.h"
 #include "ggml.h"
 
-// MUL_MAT fine tunning for non-GPU-offloading cases.
+#ifdef GGML_USE_K_QUANTS
+#include "k_quants.h"
+#endif
 
-#define GGML_MULMAT_CACHE_LEN 16
-static struct mm_cache_element default_mm_cache[GGML_MULMAT_CACHE_LEN] = {0};
+// MUL_MAT fine tunning for non-GPU-offloading cases.
 
 #define FNV_OFFSET 14695981039346656037UL
 #define FNV_PRIME 1099511628211UL
@@ -49,9 +50,8 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
     GGML_ASSERT(tune);
 
     // TODO: default_mm_cache is thread-unsafe.
-    struct mm_cache_element *mm_cache = default_mm_cache;
     int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN;
-    struct mm_cache_element *e = &mm_cache[slot];
+    struct ggml_mulmat_tune_cache_ele *e = &tune->cache[slot];
 
     struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0};
 
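For reference, the cache slot is picked by hashing (M, N, K) and reducing it modulo GGML_MULMAT_CACHE_LEN; with the cache now embedded in the tune struct, each tuner instance keeps its own slots. The body of ggml_mulmat_tune_cache_hash is not part of this diff, so the FNV-1a style folding below is only an illustrative sketch built from the FNV constants kept above:

#include <stddef.h>
#include <stdint.h>

#define FNV_OFFSET 14695981039346656037UL
#define FNV_PRIME 1099511628211UL

// Illustrative FNV-1a over the raw bytes of (M, N, K); the real
// ggml_mulmat_tune_cache_hash may fold the ints differently.
static uint64_t mulmat_tune_cache_hash_sketch(int M, int N, int K) {
    int v[3] = {M, N, K};
    const unsigned char *p = (const unsigned char *)v;
    uint64_t h = FNV_OFFSET;
    for (size_t i = 0; i < sizeof(v); i++) {
        h ^= p[i];
        h *= FNV_PRIME;
    }
    return h;
}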

@@ -183,7 +183,7 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
 
     enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype);
 
-    GGML_ASSERT(GGML_MULMAT_N_SHAPES >= 6);
+    GGML_ASSERT(GGML_MULMAT_N_SHAPES == 4 || GGML_MULMAT_N_SHAPES == 6);
     tune->n_shapes = GGML_MULMAT_N_SHAPES;
 
     // Attention layers
@@ -196,11 +196,26 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
         .N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type};
     tune->shapes[3] = (struct ggml_mulmat_tune_shape){
         .N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type};
-    // RoPE
-    tune->shapes[4] = (struct ggml_mulmat_tune_shape){
-        .N = n_rot, .K = 0, .src0_type = rot_src0_type, .src1_type = src1_type};
-    tune->shapes[5] = (struct ggml_mulmat_tune_shape){
-        .N = 0, .K = n_rot, .src0_type = rot_src0_type, .src1_type = src1_type};
+
+    tune->n_shapes = GGML_MULMAT_N_SHAPES;
+
+    if (GGML_MULMAT_N_SHAPES == 6) {
+        // RoPE.
+        // - very small comparing to previous, almost no need to bench.
+        // - an Illegal instruction exception on Github (mac-latest-cmake).
+        // - CL sometimes throws error on localhost.
+        // So temporarily disabled as a workaround.
+        tune->shapes[4] =
+            (struct ggml_mulmat_tune_shape){.N = n_rot,
+                                            .K = 0,
+                                            .src0_type = rot_src0_type,
+                                            .src1_type = src1_type};
+        tune->shapes[5] =
+            (struct ggml_mulmat_tune_shape){.N = 0,
+                                            .K = n_rot,
+                                            .src0_type = rot_src0_type,
+                                            .src1_type = src1_type};
+    }
 
     for (int i = 0; i < tune->n_shapes; i++) {
         struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
@@ -225,6 +240,7 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
 
         shape->m_num = params->m_num;
         shape->arr_m = malloc(shape->m_num * sizeof(int));
+        GGML_ASSERT(shape->arr_m);
         for (int j = 0; j < shape->m_num; j++) {
             shape->arr_m[j] = 1 << j;
         }
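For context, each shape is benchmarked over a doubling series of M values, arr_m[j] = 1 << j. A minimal standalone sketch of that series, assuming m_num = 4 as in tests/test-ggml-tune.c:

#include <stdio.h>

// Prints the M values the bench sweeps for one shape: 1, 2, 4, 8 when
// m_num = 4. A larger m_num simply extends the doubling series.
int main(void) {
    const int m_num = 4;
    for (int j = 0; j < m_num; j++) {
        printf("M[%d] = %d\n", j, 1 << j);
    }
    return 0;
}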
@@ -245,11 +261,13 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) {
         GGML_ASSERT(shape);
 
         // arr_m and items can be NULL only when testing.
-        if (shape->arr_m) {
-            free(shape->arr_m);
-        }
-        if (shape->items) {
-            free(shape->items);
+        if (shape->m_num > 0) {
+            if (shape->arr_m) {
+                free(shape->arr_m);
+            }
+            if (shape->items) {
+                free(shape->items);
+            }
         }
     }
 }
@@ -325,17 +343,19 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
     };
 
     struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES];
+    memset(builtin_profiles, 0, sizeof(builtin_profiles));
+
     int n_profiles = ggml_get_task_profiles(&node, builtin_profiles);
 
     if (n_profiles != shape->n_profiles) {
-        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
+        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(n_profiles)");
         return false;
     }
 
     // TODO: profiles order is relevant, too strict.
     size_t sz = sizeof(struct ggml_task_profile) * n_profiles;
     if (memcmp(builtin_profiles, shape->profiles, sz) != 0) {
-        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
+        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(profiles)");
 
         printf("=== built-in profiles:\n");
         ggml_mulmat_tune_write_profiles(stderr, builtin_profiles,
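The new memset before ggml_get_task_profiles matters for the memcmp that follows: struct padding bytes are indeterminate unless the whole array is zero-filled, so two logically identical profiles can otherwise compare unequal byte-wise. A small sketch of the pitfall, using a hypothetical struct:

#include <string.h>

// Hypothetical struct with padding between the char and the int.
struct demo_profile {
    char stage;
    int  n_threads;
};

// memcmp-based equality is only reliable if both objects were fully
// zeroed (memset or = {0}) before their fields were assigned, because
// the padding bytes are otherwise indeterminate.
static int demo_profiles_equal(const struct demo_profile *a,
                               const struct demo_profile *b) {
    return memcmp(a, b, sizeof(*a)) == 0;
}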
@@ -364,6 +384,9 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
 }
 
 bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
+    GGML_ASSERT(tune);
+    memset(tune, 0, sizeof(struct ggml_mulmat_tune));
+
     int rc = fscanf(fp, "%d", &tune->version);
     if (rc <= 0) {
         return false;
@@ -661,27 +684,42 @@ static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K,
         ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N);
     ggml_set_f32(src0_f32, 0.1f);
 
+    const float *src_data = (const float *)src0_f32->data;
+    int nxk = N * K;
+
     switch (src0_type) {
     case GGML_TYPE_Q4_0:
-        ggml_quantize_q4_0((const float *)src0_f32->data, src0->data, N * K,
-                           K, hist);
+        ggml_quantize_q4_0(src_data, src0->data, nxk, K, hist);
         break;
     case GGML_TYPE_Q4_1:
-        ggml_quantize_q4_1((const float *)src0_f32->data, src0->data, N * K,
-                           K, hist);
+        ggml_quantize_q4_1(src_data, src0->data, nxk, K, hist);
         break;
     case GGML_TYPE_Q5_0:
-        ggml_quantize_q5_0((const float *)src0_f32->data, src0->data, N * K,
-                           K, hist);
+        ggml_quantize_q5_0(src_data, src0->data, nxk, K, hist);
         break;
     case GGML_TYPE_Q5_1:
-        ggml_quantize_q5_1((const float *)src0_f32->data, src0->data, N * K,
-                           K, hist);
+        ggml_quantize_q5_1(src_data, src0->data, nxk, K, hist);
         break;
     case GGML_TYPE_Q8_0:
-        ggml_quantize_q8_0((const float *)src0_f32->data, src0->data, N * K,
-                           K, hist);
+        ggml_quantize_q8_0(src_data, src0->data, nxk, K, hist);
+        break;
+#ifdef GGML_USE_K_QUANTS
+    case GGML_TYPE_Q2_K:
+        ggml_quantize_q2_K(src_data, src0->data, nxk, K, hist);
         break;
+    case GGML_TYPE_Q3_K:
+        ggml_quantize_q3_K(src_data, src0->data, nxk, K, hist);
+        break;
+    case GGML_TYPE_Q4_K:
+        ggml_quantize_q4_K(src_data, src0->data, nxk, K, hist);
+        break;
+    case GGML_TYPE_Q5_K:
+        ggml_quantize_q5_K(src_data, src0->data, nxk, K, hist);
+        break;
+    case GGML_TYPE_Q6_K:
+        ggml_quantize_q6_K(src_data, src0->data, nxk, K, hist);
+        break;
+#endif
     default:
        GGML_ASSERT(false);
     }
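For reference, the ggml_quantize_* helpers used above share the same call shape: quantize N*K f32 values (row length K) into a caller-provided buffer while accumulating bucket counts in hist. A minimal sketch for the Q4_0 case, assuming the caller has sized dst appropriately (the wrapper name is made up):

#include <stddef.h>
#include <stdint.h>
#include "ggml.h"

// Quantize N*K f32 values, K per row, into dst as Q4_0 blocks.
// hist[16] receives per-bucket counts; the returned byte count is
// ignored here, just as the bench ignores it.
static void quantize_rows_q4_0_sketch(const float *src, void *dst, int N, int K) {
    int64_t hist[16] = {0};
    size_t n_bytes = ggml_quantize_q4_0(src, dst, N * K, K, hist);
    (void)n_bytes;
}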

ggml-tune.h

Lines changed: 13 additions & 9 deletions
@@ -11,7 +11,8 @@ extern "C" {
 #endif
 
 #define GGML_MULMAT_TUNE_VERSION 8
-#define GGML_MULMAT_N_SHAPES 6
+#define GGML_MULMAT_N_SHAPES 4
+#define GGML_MULMAT_CACHE_LEN 16
 
 #define GGML_MULMAT_MAX_PASS 3
 
@@ -54,6 +55,14 @@ struct ggml_mulmat_tune_shape {
     struct ggml_mulmat_tune_m *items;
 };
 
+struct ggml_mulmat_tune_cache_ele {
+    int M;
+    int N;
+    int K;
+    const struct ggml_task_profile *profile;
+    int stages_time[3];
+};
+
 struct ggml_mulmat_tune {
     int version;
 
@@ -66,6 +75,9 @@ struct ggml_mulmat_tune {
     struct ggml_mulmat_tune_shape shapes[GGML_MULMAT_N_SHAPES];
 
     int n_threads;
+
+    // Cache for time estimating.
+    struct ggml_mulmat_tune_cache_ele cache[GGML_MULMAT_CACHE_LEN];
 };
 
 struct ggml_mulmat_tune_time {
@@ -74,14 +86,6 @@ struct ggml_mulmat_tune_time {
     int total_time;
 };
 
-struct mm_cache_element {
-    int M;
-    int N;
-    int K;
-    const struct ggml_task_profile *profile;
-    int stages_time[3];
-};
-
 // params for tune/bench.
 struct ggml_mulmat_tune_params {
     struct ggml_mulmat_tune_model model;
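With the single global mm_cache_element array replaced by a per-instance cache inside struct ggml_mulmat_tune, concurrent tuners no longer share slots. A hypothetical sketch of how such a slot can be probed (it mirrors the lookup in ggml_mulmat_tune_select_task_profile but is not its actual code; the hash function is passed in because the internal ggml_mulmat_tune_cache_hash is not declared in the header):

#include <stdint.h>
#include "ggml-tune.h"

// Return the cached profile on an exact (M, N, K) hit, NULL on a miss;
// on a miss the caller re-selects a profile and overwrites this slot.
static const struct ggml_task_profile *
tune_cache_probe(struct ggml_mulmat_tune *tune, int M, int N, int K,
                 uint64_t (*hash_fn)(int, int, int)) {
    struct ggml_mulmat_tune_cache_ele *e =
        &tune->cache[hash_fn(M, N, K) % GGML_MULMAT_CACHE_LEN];
    if (e->M == M && e->N == N && e->K == K) {
        return e->profile;
    }
    return NULL;
}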

tests/test-ggml-tune.c

Lines changed: 4 additions & 3 deletions
@@ -70,15 +70,16 @@ static int bench(void) {
     // NULL) error -30 at /Users/mqy/tools/AI/llama.cpp/ggml-opencl.cpp:838
     enum ggml_ftype ftypes[] = {
         // GGML_FTYPE_ALL_F32,
-        GGML_FTYPE_MOSTLY_F16,
+        // GGML_FTYPE_MOSTLY_F16,
         GGML_FTYPE_MOSTLY_Q4_0,
+        GGML_FTYPE_MOSTLY_Q4_K,
     };
 
     int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]);
 
     const int m_num = 4;
 
-    // Don't use n_threads larger than 2 because Github build hots has limited
+    // Don't use n_threads larger than 2 because Github build hosts has limited
     // resource quota.
     int threads_arr[] = {1, 2};
     int thread_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
@@ -124,7 +125,7 @@ ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
 }
 
 int estimate_time_non_zero_NK(void) {
-    printf("test-ggml-tune: %s\n", __func__);
+    printf("[test-ggml-tune] %s\n", __func__);
 
     struct test_data_t {
         int M;
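A rough count of what the updated bench test now exercises, assuming every ftype is run for every thread count, shape, and M value (pass counts and per-profile stages are ignored in this sketch):

#include <stdio.h>

// 2 ftypes (Q4_0, Q4_K) x 2 thread configs {1, 2} x 4 shapes x 4 M values.
int main(void) {
    const int n_ftypes = 2, n_thread_cfgs = 2, n_shapes = 4, m_num = 4;
    printf("bench points: %d\n", n_ftypes * n_thread_cfgs * n_shapes * m_num);
    return 0;
}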
