
Commit 21e9379

tuning: add f16, todo: f32 failed with CL
1 parent 7c05049 commit 21e9379

File tree: 2 files changed, +60 −30 lines


ggml-tune.c

Lines changed: 11 additions & 13 deletions
@@ -103,10 +103,9 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
             names[i] = ggml_mulmat_tune_task_backend_name(
                 prof->stages[i].backend);
         }
-        printf(
-            "\n[tune] M: %3d, N: %5d, K: %5d, backends of the "
-            "fastest profile: %s %s %s\n",
-            M, N, K, names[0], names[1], names[2]);
+        printf("\n[tune] M: %3d, N: %5d, K: %5d, backends of the "
+               "fastest profile: %s %s %s\n",
+               M, N, K, names[0], names[1], names[2]);
 #endif
     }
 }
@@ -707,8 +706,7 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) {
     void *buf = malloc(sz);
 
     if (!buf) {
-        fprintf(stderr,
-                "[tune] error: failed to allocate %zu MiB memory",
+        fprintf(stderr, "[tune] error: failed to allocate %zu MiB memory",
                 sz / 1024 / 1024);
         return 0;
     }
@@ -835,8 +833,9 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
             stages_time[j] = 0;
         }
 
-        /*enum ggml_compute_error err = */
-        ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize);
+        enum ggml_compute_error err = ggml_threading_compute_tensor(
+            thrd_ctx, node, wdata, wsize);
+        GGML_ASSERT(err == GGML_COMPUTE_OK);
 
         for (int i = 0; i < 3; i++) {
             int v = (int)stages_time[i];
@@ -892,11 +891,10 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
             fprintf(stdout, "[tune] data was written to `%s`\n",
                     params->fname);
         } else {
-            fprintf(
-                stderr,
-                "[tune] warn: failed to write file `%s`, print to "
-                "console instead\n\n",
-                params->fname);
+            fprintf(stderr,
+                    "[tune] warn: failed to write file `%s`, print to "
+                    "console instead\n\n",
+                    params->fname);
             params->output_console = 1;
         }
     }
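
The @@ -835 hunk above carries the one behavioral change in this file: the compute status is no longer discarded, so a failing backend now aborts the benchmark through GGML_ASSERT instead of recording timings for a tensor that was never computed. A minimal standalone sketch of the same fail-fast pattern (the enum and compute function below are stand-ins, not the real ggml API):

#include <assert.h>
#include <stdio.h>

/* Stand-ins for ggml_compute_error / ggml_threading_compute_tensor(). */
enum compute_error { COMPUTE_OK = 0, COMPUTE_FALLBACK = 1 };

static enum compute_error compute_tensor(void) {
    return COMPUTE_OK; /* pretend the kernel ran fine */
}

int main(void) {
    /* Keep the status and assert on it: a broken backend stops the run
       immediately instead of producing meaningless stage timings. */
    enum compute_error err = compute_tensor();
    assert(err == COMPUTE_OK);
    printf("compute ok\n");
    return 0;
}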

tests/test-ggml-tune.c

Lines changed: 49 additions & 17 deletions
@@ -8,20 +8,21 @@
 static int bench(void);
 static int estimate_time_non_zero_NK(void);
 
-static void init_params(struct ggml_mulmat_tune_params *params, int m_num) {
+static void init_params(struct ggml_mulmat_tune_params *params,
+                        enum ggml_ftype ftype, int m_num, int n_threads) {
     *params = (struct ggml_mulmat_tune_params){
         .model =
             (struct ggml_mulmat_tune_model){
-                .name = "3B", // fake
-                .ftype = GGML_FTYPE_MOSTLY_Q4_0,
+                .name = "xB", // fake model name
+                .ftype = ftype,
                 .n_vocab = 4096,
                 .n_embd = 1024,
                 .n_ff = 2048,
                 .n_rot = 128,
             },
         .m_num = m_num,
         .n_pass = 1,
-        .n_threads = 1,
+        .n_threads = n_threads,
         .progress = false,
         .output_console = true,
         .fname = NULL};
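
One detail of the new init_params(): assigning a compound literal zero-initializes every member not named in the initializer (C99 6.7.8), so the memset added before the call in bench() below is redundant but harmless belt-and-braces. A tiny illustration of that guarantee (hypothetical struct, not from this commit):

#include <stdio.h>

struct opts { int a; int b; int c; };

int main(void) {
    struct opts o;
    /* Members omitted from the compound literal (.c) are zeroed. */
    o = (struct opts){ .a = 1, .b = 2 };
    printf("c = %d\n", o.c); /* prints 0 */
    return 0;
}
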
@@ -45,13 +46,11 @@ int main(void) {
 }
 
 static int bench(void) {
-    printf("test: %s\n", __func__);
-
     {
         enum ggml_task_backend backends[16];
         int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
         if (n_backends < 2) {
-            printf("test: %s, skipped because no BLAS\n", __func__);
+            printf("[test-ggml-tune] skipped because no BLAS\n");
             return 0;
         }
     }
@@ -67,16 +66,48 @@ static int bench(void) {
         ggml_free(ctx);
     }
 
-    struct ggml_mulmat_tune tune;
+    // F32: ggml_opencl: ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02,
+    // NULL) error -30 at /Users/mqy/tools/AI/llama.cpp/ggml-opencl.cpp:838
+    enum ggml_ftype ftypes[] = {
+        // GGML_FTYPE_ALL_F32,
+        GGML_FTYPE_MOSTLY_F16,
+        GGML_FTYPE_MOSTLY_Q4_0,
+    };
 
-    struct ggml_mulmat_tune_params params;
+    int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]);
 
-    init_params(&params, /*m_num*/ 4);
+    const int m_num = 4;
 
-    bool ok = ggml_mulmat_tune_bench(&tune, &params);
-    ggml_mulmat_tune_free(&tune);
+    // Don't use n_threads larger than 2 because GitHub build hosts have a
+    // limited resource quota.
+    int threads_arr[] = {1, 2};
+    int thread_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
+
+    int n_passed = 0;
+    int n_tests = 0;
+
+    for (int i = 0; i < n_ftypes; i++) {
+        for (int j = 0; j < thread_arr_len; j++) {
+            printf("\n");
+
+            int n_threads = threads_arr[j];
+            struct ggml_mulmat_tune tune;
+
+            struct ggml_mulmat_tune_params params;
+            memset(&params, 0, sizeof(struct ggml_mulmat_tune_params));
+            init_params(&params, ftypes[i], m_num, n_threads);
+
+            ++n_tests;
+            bool ok = ggml_mulmat_tune_bench(&tune, &params);
+            if (ok) {
+                ++n_passed;
+            }
+            ggml_mulmat_tune_free(&tune);
+        }
+    }
 
-    return ok ? 0 : 1;
+    printf("[test-ggml-tune] %d / %d passed\n", n_passed, n_tests);
+    return (n_passed == n_tests) ? 0 : 1;
 }
 
 // implement `ggml_task_profiles_provider`
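
Aside on the disabled F32 case: the OpenCL status -30 quoted in the hunk above is CL_INVALID_VALUE from CL/cl.h. A small lookup helper along these lines (hypothetical, not part of this commit or of ggml-opencl.cpp) makes such failures easier to read while debugging:

#include <CL/cl.h>

/* Map a few common OpenCL status codes to names; -30 is CL_INVALID_VALUE. */
static const char *cl_status_name(cl_int st) {
    switch (st) {
    case CL_SUCCESS:             return "CL_SUCCESS";             /*   0 */
    case CL_OUT_OF_HOST_MEMORY:  return "CL_OUT_OF_HOST_MEMORY";  /*  -6 */
    case CL_INVALID_VALUE:       return "CL_INVALID_VALUE";       /* -30 */
    case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE"; /* -61 */
    default:                     return "unknown CL status";
    }
}
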
@@ -93,7 +124,7 @@ ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
 }
 
 int estimate_time_non_zero_NK(void) {
-    printf("test: %s\n", __func__);
+    printf("test-ggml-tune: %s\n", __func__);
 
     struct test_data_t {
         int M;
@@ -106,9 +137,10 @@ int estimate_time_non_zero_NK(void) {
     };
 
     const int m_num = 2;
+    const int n_threads = 1; // not used by this test
 
     struct ggml_mulmat_tune_params params;
-    init_params(&params, m_num);
+    init_params(&params, tune.ftype, m_num, n_threads);
 
     ggml_mulmat_tune_init(&tune, &params, ggml_task_profiles_mock_qxx_provider);
 
@@ -123,8 +155,8 @@ int estimate_time_non_zero_NK(void) {
     GGML_ASSERT(shape->n_profiles == 2);
     GGML_ASSERT(ggml_is_quantized(shape->src0_type));
 
-    printf("shape: N: %d, K: %d, n_profiles: %d\n", shape->N, shape->K,
-           shape->n_profiles);
+    printf("[test-ggml-tune] %s, shape: N: %d, K: %d, n_profiles: %d\n",
+           __func__, shape->N, shape->K, shape->n_profiles);
 
     {
         shape->items[0] =
