
Commit 6b83a3e

Try to make CL run without tuning, but with -ngl it gets stuck and produces no output. Had to add a task runner and a profile id; many changes, see the code.

1 parent 5342dc0 commit 6b83a3e
File tree

10 files changed: 433 additions, 279 deletions

examples/mulmat-tune/mulmat-tune.cpp

Lines changed: 23 additions & 4 deletions

@@ -11,6 +11,10 @@
 
 #define UNUSED(x) (void)(x)
 
+// F16 has an pending Illegal Instruction error on macos-latest-cmake.
+// So the workaround is to disable non-quantized ftypes.
+// #define SUPPORT_NONE_Q_TYPE 1
+
 static void print_build_tips(void) {
     const char *a = "LLAMA_NO_ACCELERATE";
     fprintf(stderr, "Tips on how to build with various backend vendors:\n\n");
@@ -62,11 +66,12 @@ static void usage(char *prog) {
         "--model MODEL 3B | 7B | 13B | 30B | 65B",
         "              default 7B",
         "--ftype FTYPE ggml ftype:",
+#ifdef SUPPORT_NONE_Q_TYPE
         "              0: all F32",
         "              1: mostly F16",
+#endif
         "              2: mostly Q4_0",
         "              3: mostly Q4_1",
-        "              4: mostly Q4_1, some F16",
         "              7: mostly Q8_0",
         "              8: mostly Q5_0",
         "              9: mostly Q5_1",
@@ -84,7 +89,7 @@ static void usage(char *prog) {
         "              requires: between [1, 3]",
         "--n_threads NTH bench with this number of threads",
         "              requires: between [1, 16]",
-        "              default 1",
+        "              default 4",
         "--file FILE   data file to write",
         "              default stdout",
         "-y            always answer \"yes\" to all prompts",
@@ -170,8 +175,22 @@ int main(int argc, char **argv) {
            ftype = (enum ggml_ftype)v;
        }
 
+#ifndef SUPPORT_NONE_Q_TYPE
        if (ftype == GGML_FTYPE_ALL_F32 || ftype == GGML_FTYPE_MOSTLY_F16) {
-            fprintf(stderr, "none quantized type %d is not supported\n", ftype);
+            fprintf(stderr, "error: none quantized type %d is not supported\n",
+                    ftype);
+            return 1;
+        }
+#endif
+
+        bool cond_1 = ftype >= GGML_FTYPE_MOSTLY_Q4_0 &&
+                      ftype <= GGML_FTYPE_MOSTLY_Q4_1;
+        bool cond_2 =
+            ftype >= GGML_FTYPE_MOSTLY_Q8_0 && ftype <= GGML_FTYPE_MOSTLY_Q6_K;
+
+        if (!(cond_1 || cond_2)) {
+            fprintf(stderr, "error: type %d is not a known ggml ftype.\n",
+                    ftype);
            return 1;
        }
    }
@@ -223,7 +242,7 @@ int main(int argc, char **argv) {
        }
    }
 
-    int n_threads = 1;
+    int n_threads = 4;
    {
        if (arg_n_threads != NULL) {
            int v = atoi(arg_n_threads);
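
Note: the new check only accepts ftypes in the two quantized ranges [Q4_0, Q4_1] and [Q8_0, Q6_K]. Below is a minimal standalone sketch of that check; the enum values are assumed to mirror ggml.h (the bench itself uses ggml's real enum ggml_ftype, not this local copy).

/* Sketch only: standalone version of the accepted-ftype check.
 * Enum values are assumptions based on ggml.h, not authoritative. */
#include <stdbool.h>
#include <stdio.h>

enum sketch_ftype {
    SKETCH_FTYPE_MOSTLY_Q4_0 = 2,
    SKETCH_FTYPE_MOSTLY_Q4_1 = 3,
    /* 4 (Q4_1 with some F16) falls outside both ranges on purpose. */
    SKETCH_FTYPE_MOSTLY_Q8_0 = 7,
    SKETCH_FTYPE_MOSTLY_Q6_K = 14, /* assumed: last K-quant ftype */
};

/* Accept only [Q4_0, Q4_1] or [Q8_0, Q6_K]. */
static bool ftype_is_supported(int ftype) {
    bool cond_1 = ftype >= SKETCH_FTYPE_MOSTLY_Q4_0 &&
                  ftype <= SKETCH_FTYPE_MOSTLY_Q4_1;
    bool cond_2 = ftype >= SKETCH_FTYPE_MOSTLY_Q8_0 &&
                  ftype <= SKETCH_FTYPE_MOSTLY_Q6_K;
    return cond_1 || cond_2;
}

int main(void) {
    for (int t = 0; t <= 15; t++) {
        printf("ftype %2d -> %s\n", t, ftype_is_supported(t) ? "ok" : "rejected");
    }
    return 0;
}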

ggml-opencl.cpp

Lines changed: 1 addition & 1 deletion

@@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_
 }
 
 void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
-    // GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
+    GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
 
     if (src0->type == GGML_TYPE_F32) {
        ggml_cl_mul_mat_f32(src0, src1, dst);

ggml-threading.c

Lines changed: 28 additions & 9 deletions

@@ -170,7 +170,8 @@ struct ggml_compute_state_shared {
    atomic_bool wait_on_done;
    atomic_bool stop;
 
-    ggml_threading_task_runner *task_runner;
+    // Default task runner, can be overriden by node.task_profile.runner.
+    ggml_task_runner *task_runner;
 
    struct ggml_threading_context *ctx;
 };
@@ -391,8 +392,10 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
        }
 
        if (shared->n_tasks > 0 && state->has_work) {
-            enum ggml_compute_error err =
-                shared->task_runner(&state->params, state->node);
+            ggml_task_runner *runner = state->node->task_profile.runner
+                                           ? state->node->task_profile.runner
+                                           : shared->task_runner;
+            enum ggml_compute_error err = runner(&state->params, state->node);
 
            GGML_ASSERT(err == GGML_COMPUTE_OK);
 
@@ -427,8 +430,13 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
                              size_t wsize) {
    GGML_ASSERT(ctx);
    GGML_ASSERT(node);
-
    GGML_ASSERT(ctx->shared.task_runner);
+
+    ggml_task_runner *runner = ctx->shared.task_runner;
+    if (node->task_profile.runner) {
+        runner = node->task_profile.runner;
+    }
+
    struct ggml_compute_state_shared *state_shared = &ctx->shared;
 
    // This is the params for main thread.
@@ -491,7 +499,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
        params.wsize = wsize;
        params.wdata = wdata;
 
-        err = state_shared->task_runner(&params, node);
+        err = runner(&params, node);
    }
 
    // wait for tasks done.
@@ -509,11 +517,21 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
 
    if (err != GGML_COMPUTE_OK) {
        if (err == GGML_COMPUTE_FALLBACK) {
+            PRINT_DEBUG("[main] fallback from profile, id=%d\n",
+                        node->task_profile.id);
+            GGML_ASSERT(node->task_profile.stages[1].backend >
+                        GGML_TASK_BACKEND_CPU);
+
            struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
            int n = ggml_get_task_profiles(node, profiles);
            GGML_ASSERT(n > 0);
+            GGML_ASSERT(profiles[0].stages[1].backend ==
+                        GGML_TASK_BACKEND_CPU);
+
            memcpy(&node->task_profile, &profiles[0],
                   sizeof(struct ggml_task_profile));
+            runner = ctx->shared.task_runner;
+
            goto START;
        }
        return err;
@@ -525,12 +543,13 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
 
 struct ggml_threading_context *
 ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner,
-                     ggml_threading_task_runner *task_stage_runner,
+                     ggml_task_runner *task_runner,
                     enum ggml_threading_features features,
                     int64_t stages_time[3]) {
    GGML_ASSERT(n_threads > 0);
-    GGML_ASSERT(thread_runner);
-    GGML_ASSERT(task_stage_runner);
+    if (thread_runner == NULL) {
+        thread_runner = ggml_threading_graph_compute_thread;
+    }
 
    size_t ctx_sz = sizeof(struct ggml_threading_context);
    struct ggml_threading_context *ctx = malloc(ctx_sz);
@@ -545,7 +564,7 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner,
        .wait_now = false,
        .wait_on_done = false,
        .stop = false,
-        .task_runner = task_stage_runner,
+        .task_runner = task_runner,
        .ctx = ctx,
    };
 
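
Note: the core change here is that a node's task profile may now carry its own runner, with the shared task_runner kept as the default; on GGML_COMPUTE_FALLBACK the node is rewritten to the CPU-only profile and the default runner is restored. A minimal sketch of the selection pattern follows, with hypothetical stand-in types rather than ggml's real structs.

/* Sketch only: per-node runner override with a shared default.
 * The types below are stand-ins, not ggml's structs. */
#include <stddef.h>
#include <stdio.h>

typedef int (task_runner_t)(const char *what);

struct profile_sketch { task_runner_t *runner; };      /* cf. node.task_profile.runner */
struct node_sketch    { struct profile_sketch task_profile; };
struct shared_sketch  { task_runner_t *task_runner; }; /* default runner */

static int default_runner(const char *what)  { printf("default runner: %s\n", what);  return 0; }
static int override_runner(const char *what) { printf("override runner: %s\n", what); return 0; }

/* Prefer the node's profile runner; fall back to the shared default. */
static int run_node(struct shared_sketch *shared, struct node_sketch *node) {
    task_runner_t *runner = node->task_profile.runner
                                ? node->task_profile.runner
                                : shared->task_runner;
    return runner("mul_mat");
}

int main(void) {
    struct shared_sketch shared  = { default_runner };
    struct node_sketch   plain   = { { NULL } };
    struct node_sketch   special = { { override_runner } };
    run_node(&shared, &plain);   /* prints "default runner: mul_mat"  */
    run_node(&shared, &special); /* prints "override runner: mul_mat" */
    return 0;
}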

ggml-threading.h

Lines changed: 8 additions & 14 deletions

@@ -21,27 +21,21 @@ enum ggml_threading_features {
    GGML_THREADING_FEATURE_PERF = 1 << 1,
 };
 
-// Compute errors.
-enum ggml_compute_error {
-    GGML_COMPUTE_OK = 0,
-    GGML_COMPUTE_FALLBACK = 1,
-};
-
-// The task runner to be called by main thread and workers.
-typedef enum ggml_compute_error(ggml_threading_task_runner)(
-    struct ggml_compute_params *params, struct ggml_tensor *node);
-
 // The thread runner to feed into OS threads.
 typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data);
 
 // Init and start underlying workers if n_threads > 1.
 //
-// features: optional for configure threading additional features.
-// see `ggml_threading_feature`, default 0.
+// thread: optional OS thread runner, default value:
+// `ggml_threading_graph_compute_thread`.
+//
+// features: optional for configure
+// threading additional features. see `ggml_threading_feature`, default 0.
+//
 // stages_time: optional for collecting per-stage wall clock time.
 struct ggml_threading_context *
 ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread,
-                     ggml_threading_task_runner *task_stage_runner,
+                     ggml_task_runner *task_runner,
                     enum ggml_threading_features features,
                     int64_t stages_time[3]);
 
@@ -60,7 +54,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
 
 // This is an experimental functionality for mulmat tune, as a thin wrapper.
 enum ggml_compute_error
-ggml_compute_forward_wrapper(struct ggml_compute_params *params,
+ggml_compute_forward_wrapper(const struct ggml_compute_params *params,
                             struct ggml_tensor *tensor);
 
 #ifdef __cplusplus
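
Note: with `ggml_threading_task_runner` and `enum ggml_compute_error` removed from this header, callers pass a `ggml_task_runner` (assumed to now be declared elsewhere, e.g. ggml.h) and may pass NULL for the thread runner to get the default. A rough usage sketch, mirroring the updated call site in ggml-tune.c; include paths, error handling, and shutdown are omitted.

/* Sketch only: starting the threading context after this API change.
 * Assumes ggml-threading.h and its dependencies are on the include path. */
#include "ggml-threading.h"

static struct ggml_threading_context *start_for_tune(int n_threads,
                                                      int64_t stages_time[3]) {
    return ggml_threading_start(
        n_threads,
        /* thread runner */ NULL, /* default: ggml_threading_graph_compute_thread */
        /* task runner   */ ggml_compute_forward_wrapper,
        GGML_THREADING_FEATURE_WAIT_ON_DONE,
        stages_time);
}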

ggml-tune.c

Lines changed: 55 additions & 15 deletions

@@ -44,9 +44,12 @@ ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) {
    }
 }
 
-const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
-    struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t,
-    enum ggml_type src1_t, int stages_time[3]) {
+// NOTE: we can not use the profile from tune because the profiles do not
+// contain fields such as runner, get_size.
+int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
+                                         int N, int K, enum ggml_type src0_t,
+                                         enum ggml_type src1_t,
+                                         int stages_time[3]) {
    GGML_ASSERT(tune);
 
    // TODO: default_mm_cache is thread-unsafe.
@@ -103,15 +106,15 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
                    names[i] = ggml_mulmat_tune_task_backend_name(
                        prof->stages[i].backend);
                }
-                printf("\n[tune] M: %3d, N: %5d, K: %5d, backends of the "
-                       "fastest profile: %s %s %s\n",
-                       M, N, K, names[0], names[1], names[2]);
+                printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, "
+                       "backends: %s %s %s\n",
+                       M, N, K, prof->id, names[0], names[1], names[2]);
 #endif
            }
        }
    }
 
-    return prof;
+    return prof->id;
 }
 
 void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
@@ -264,10 +267,13 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) {
        if (shape->m_num > 0) {
            if (shape->arr_m) {
                free(shape->arr_m);
+                shape->arr_m = NULL;
            }
            if (shape->items) {
                free(shape->items);
+                shape->items = NULL;
            }
+            shape->m_num = 0;
        }
    }
 }
@@ -277,6 +283,11 @@ static bool ggml_mulmat_tune_write_profiles(
    int rc;
    for (int i = 0; i < n_profiles; i++) {
        const struct ggml_task_profile *profile = &profiles[i];
+        rc = fprintf(fp, "%d ", profile->id);
+        if (rc <= 0) {
+            return false;
+        }
+
        for (int j = 0; j < 3; j++) {
            const struct ggml_task_stage *ts = &profile->stages[j];
            rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0,
@@ -304,7 +315,6 @@ static bool
 ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
                                   const char *model, int ftype, int n_threads,
                                   char *errbuf, int errbuf_len) {
-
    if (tune->version != GGML_MULMAT_TUNE_VERSION) {
        snprintf(errbuf, errbuf_len - 1,
                 "version mismatch, built-in: %d, "
@@ -348,14 +358,28 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
    int n_profiles = ggml_get_task_profiles(&node, builtin_profiles);
 
    if (n_profiles != shape->n_profiles) {
-        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(n_profiles)");
+        snprintf(errbuf, errbuf_len - 1,
+                 "task profiles mismatch (n_profiles)");
        return false;
    }
 
    // TODO: profiles order is relevant, too strict.
-    size_t sz = sizeof(struct ggml_task_profile) * n_profiles;
-    if (memcmp(builtin_profiles, shape->profiles, sz) != 0) {
-        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(profiles)");
+    // Only validate stages!
+    size_t sz = sizeof(struct ggml_task_stage) * 3;
+    bool matched = true;
+    for (int j = 0; j < n_profiles; j++) {
+        if (builtin_profiles[j].id != shape->profiles[j].id) {
+            return false;
+        }
+        if (memcmp(builtin_profiles[j].stages, shape->profiles[j].stages,
+                   sz) != 0) {
+            matched = false;
+            break;
+        }
+    }
+    if (!matched) {
+        snprintf(errbuf, errbuf_len - 1,
+                 "task profiles mismatch (profiles)");
 
        printf("=== built-in profiles:\n");
        ggml_mulmat_tune_write_profiles(stderr, builtin_profiles,
@@ -426,6 +450,12 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
 
        for (int ip = 0; ip < shape->n_profiles; ip++) {
            struct ggml_task_profile *profile = &shape->profiles[ip];
+
+            rc = fscanf(fp, "%d ", &profile->id);
+            if (rc <= 0) {
+                return false;
+            }
+
            for (int j = 0; j < 3; j++) {
                struct ggml_task_stage *ts = &profile->stages[j];
                int backend;
@@ -777,6 +807,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
    GGML_ASSERT(params);
    GGML_ASSERT(params->model.name);
 
+    memset(tune, 0, sizeof(struct ggml_mulmat_tune));
+
    enum ggml_task_backend backends[16];
    int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
    if (n_backends < 2) {
@@ -785,6 +817,15 @@
        return false;
    }
 
+    if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K &&
+        params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) {
+#if defined(GGML_USE_CLBLAST)
+        printf("[tune] error: cl implementation does not support k_quants at "
+               "the time of writing this code, skip.\n");
+        return false;
+#endif
+    }
+
    bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
    if (!ok) {
        return false;
@@ -816,9 +857,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
    int64_t t0 = ggml_time_ms();
 
    struct ggml_threading_context *thrd_ctx = ggml_threading_start(
-        tune->n_threads, ggml_threading_graph_compute_thread,
-        ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE,
-        stages_time);
+        tune->n_threads, NULL, ggml_compute_forward_wrapper,
+        GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);
 
    for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
        const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
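
Note: two of the changes above have to agree with each other: ggml_mulmat_tune_write_profiles now emits the profile id before the three task stages, and ggml_mulmat_tune_read_data reads it back in the same order. A self-contained sketch of that id-first round trip follows, using hypothetical minimal structs rather than ggml's; the exact whitespace of the real file format may differ.

/* Sketch only: profile id written before the three stages, read back in the
 * same order. Stand-in structs; separators approximate the real format. */
#include <stdbool.h>
#include <stdio.h>

struct stage_sketch   { int backend; int parallel; int wait; };
struct profile_sketch { int id; struct stage_sketch stages[3]; };

static bool write_profile(FILE *fp, const struct profile_sketch *p) {
    if (fprintf(fp, "%d ", p->id) <= 0) return false; /* id first */
    for (int j = 0; j < 3; j++) {
        const struct stage_sketch *ts = &p->stages[j];
        if (fprintf(fp, "%2d %d %d ", ts->backend, ts->parallel, ts->wait) <= 0)
            return false;
    }
    return fprintf(fp, "\n") > 0;
}

static bool read_profile(FILE *fp, struct profile_sketch *p) {
    if (fscanf(fp, "%d", &p->id) != 1) return false; /* id first */
    for (int j = 0; j < 3; j++) {
        struct stage_sketch *ts = &p->stages[j];
        if (fscanf(fp, "%d %d %d", &ts->backend, &ts->parallel, &ts->wait) != 3)
            return false;
    }
    return true;
}

int main(void) {
    struct profile_sketch out = { 2, { { 1, 0, 0 }, { 3, 1, 0 }, { 0, 0, 0 } } };
    struct profile_sketch in  = { 0 };
    FILE *fp = tmpfile();
    if (fp == NULL || !write_profile(fp, &out)) return 1;
    rewind(fp);
    if (!read_profile(fp, &in)) return 1;
    printf("round trip: id=%d, stage1 backend=%d\n", in.id, in.stages[1].backend);
    fclose(fp);
    return 0;
}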
