Skip to content

Commit 48016f6

Browse files
committed
bulk refactored task profile to support complete fallback; enable tune by default for ease of dev
1 parent 1b041d7 commit 48016f6

File tree

15 files changed

+457
-371
lines changed

15 files changed

+457
-371
lines changed

CMakeLists.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants"
7878
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
7979
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
8080
option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
81-
option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF)
81+
option(LLAMA_TUNE "llama: mulmat tune" ON)
8282

8383
#
8484
# Build info header
@@ -278,9 +278,9 @@ if (LLAMA_METAL)
278278
)
279279
endif()
280280

281-
if (LLAMA_MULMAT_TUNE)
282-
add_compile_definitions(GGML_USE_MULMAT_TUNE)
283-
add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG)
281+
if (LLAMA_TUNE)
282+
add_compile_definitions(GGML_USE_TUNE)
283+
add_compile_definitions(GGML_TUNE_NDEBUG)
284284
endif()
285285

286286
if (LLAMA_K_QUANTS)

Makefile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -231,14 +231,14 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
231231
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
232232
endif
233233

234-
ifdef LLAMA_NO_K_QUANTS
234+
ifndef LLAMA_NO_K_QUANTS
235235
k_quants.o: k_quants.c k_quants.h
236236
$(CC) $(CFLAGS) -c $< -o $@
237237
endif # LLAMA_NO_K_QUANTS
238238

239-
ifdef LLAMA_MULMAT_TUNE
240-
CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG
241-
CXXFLAGS += -DGGML_USE_MULMAT_TUNE
239+
ifndef LLAMA_NO_TUNE
240+
CFLAGS += -DGGML_USE_TUNE -DGGML_TUNE_NDEBUG
241+
CXXFLAGS += -DGGML_USE_TUNE
242242
endif
243243

244244
#

examples/common.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
345345
params.mem_test = true;
346346
} else if (arg == "--export") {
347347
params.export_cgraph = true;
348-
#ifdef GGML_USE_MULMAT_TUNE
348+
#ifdef GGML_USE_TUNE
349349
} else if (arg == "--tune") {
350350
params.tune = true;
351351
} else if (arg == "--tune-file") {
@@ -354,7 +354,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
354354
break;
355355
}
356356
params.tune_file = argv[i];
357-
#endif // GGML_USE_MULMAT_TUNE
357+
#endif // GGML_USE_TUNE
358358
} else if (arg == "--verbose-prompt") {
359359
params.verbose_prompt = true;
360360
} else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -508,7 +508,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
508508
#endif
509509
fprintf(stderr, " --mtest compute maximum memory usage\n");
510510
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
511-
#ifdef GGML_USE_MULMAT_TUNE
511+
#ifdef GGML_USE_TUNE
512512
fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n");
513513
fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n");
514514
#endif

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ int main(int argc, char ** argv) {
117117
return 1;
118118
}
119119

120-
#ifdef GGML_USE_MULMAT_TUNE
120+
#ifdef GGML_USE_TUNE
121121
if (params.tune || !params.tune_file.empty()) {
122122
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
123123
if (!ok || (params.tune && !params.tune_file.empty())) {

examples/mulmat-tune/README.md

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ run bench ahead of time (saving tens of seconds), but there are two shortcomings
2323

2424
Makefile:
2525
```
26-
make clean && LLAMA_MULMAT_TUNE=1 make
26+
make clean && make
2727
```
2828

2929
CMake (with BLAS):
3030
```
3131
cmake --build . --target clean
32-
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
32+
cmake .. -DLLAMA_BLAS=ON
3333
cmake --build . --config Release
3434
```
3535

@@ -52,13 +52,13 @@ Run examples:
5252

5353
Makefile:
5454
```
55-
make clean && LLAMA_MULMAT_TUNE=1 make
55+
make clean && make
5656
```
5757

5858
CMake (with BLAS)
5959
```
6060
cmake --build . --target clean
61-
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
61+
cmake .. -DLLAMA_BLAS=ON
6262
cmake --build . --config Release
6363
```
6464

@@ -103,22 +103,29 @@ setup properly.
103103
General steps:
104104

105105
1. run `./mulmat-tune -h` to see how to build for misc vendors.
106-
you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the the debug, e.g:
106+
To enable debugging, comment out `-DGGML_TUNE_NDEBUG` in the Makefile, then run:
107+
107108
```
108-
make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make
109+
make clean; make
109110
```
111+
110112
On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along
111113
with `CUDA` or `CL`, you may not see `CUDA` or `CL` from debug because `CPU`
112-
or `CPU_BLAS` is more faster (as of the estimation from mulmat tune).
114+
or `CPU_BLAS` is faster (according to the mulmat tune estimation); try running
115+
with `-t 1`.
113116
2. create a small prompt file:
117+
114118
```
115119
head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw
116120
```
121+
117122
3. run any of the following example commands.
123+
118124
```
119125
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32
120126
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64
121127
```
128+
122129
* `--mlock` is recommended for `macOS`, you may not want to use it.
123130
* don't change `-c 128`: a context size that is too large causes a 0-perplexity chunk.
124131
* `-t` is the number of threads, recommend `1`, `2`, `4` or `6`.

examples/mulmat-tune/mulmat-tune.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -262,8 +262,6 @@ int main(int argc, char **argv) {
262262
struct ggml_mulmat_tune_params params;
263263
memset(&params, 0, sizeof(struct ggml_mulmat_tune_params));
264264

265-
ggml_mulmat_init_task_profiles();
266-
267265
ggml_mulmat_tune_model_init(&params.model, model_name, ftype);
268266
params.m_num = m_num;
269267
params.n_pass = n_pass;

examples/perplexity/perplexity.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
158158
return 1;
159159
}
160160

161-
#ifdef GGML_USE_MULMAT_TUNE
161+
#ifdef GGML_USE_TUNE
162162
if (params.tune || !params.tune_file.empty()){
163163
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
164164
if (!ok || (params.tune && !params.tune_file.empty())) {

ggml-threading.c

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
394394
enum ggml_compute_error err =
395395
shared->task_runner(&state->params, state->node);
396396

397-
GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);
397+
GGML_ASSERT(err == GGML_COMPUTE_OK);
398398

399399
ggml_spin_lock(&shared->spin);
400400

@@ -433,7 +433,11 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
433433

434434
// This is the params for main thread.
435435
struct ggml_compute_params params;
436-
enum ggml_compute_error err;
436+
enum ggml_compute_error err = GGML_COMPUTE_OK;
437+
438+
START:
439+
440+
memset(&params, 0, sizeof(struct ggml_compute_params));
437441

438442
for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
439443
if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) {
@@ -504,11 +508,19 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
504508
}
505509

506510
if (err != GGML_COMPUTE_OK) {
511+
if (err == GGML_COMPUTE_FALLBACK) {
512+
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
513+
int n = ggml_get_task_profiles(node, profiles);
514+
GGML_ASSERT(n > 0);
515+
memcpy(&node->task_profile, &profiles[0],
516+
sizeof(struct ggml_task_profile));
517+
goto START;
518+
}
507519
return err;
508520
}
509521
}
510522

511-
return GGML_COMPUTE_OK;
523+
return err;
512524
}
513525

514526
struct ggml_threading_context *

ggml-tune.c

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
5555

5656
struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0};
5757

58-
struct ggml_task_profile *prof = NULL;
58+
const struct ggml_task_profile *prof = NULL;
5959

6060
if (e->M == M && e->N == N && e->K == K) {
6161
prof = e->profile;
@@ -97,10 +97,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
9797
e->N = N;
9898
e->K = K;
9999

100-
// to disable this, build with
101-
// `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1
102-
// make`
103-
#if !defined(GGML_MULMAT_TUNE_NDEBUG)
100+
#ifndef GGML_TUNE_NDEBUG
104101
const char *names[3];
105102
for (int i = 0; i < 3; i++) {
106103
names[i] = ggml_mulmat_tune_task_backend_name(
@@ -163,8 +160,8 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
163160

164161
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
165162
struct ggml_mulmat_tune_params *params,
166-
struct ggml_task_profile_factory *pf) {
167-
163+
ggml_task_profiles_provider *profiles_provider) {
164+
GGML_ASSERT(profiles_provider);
168165
struct ggml_mulmat_tune_model *model = &params->model;
169166

170167
memset(tune, 0, sizeof(struct ggml_mulmat_tune));
@@ -208,8 +205,20 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
208205

209206
for (int i = 0; i < tune->n_shapes; i++) {
210207
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
211-
shape->n_profiles = ggml_mulmat_get_task_profiles(
212-
pf, shape->src0_type, shape->src1_type, &shape->profiles);
208+
209+
struct ggml_tensor src0 = {
210+
.type = shape->src0_type,
211+
};
212+
struct ggml_tensor src1 = {
213+
.type = shape->src1_type,
214+
};
215+
struct ggml_tensor node = {
216+
.op = GGML_OP_MUL_MAT,
217+
.src0 = &src0,
218+
.src1 = &src1,
219+
};
220+
221+
shape->n_profiles = profiles_provider(&node, shape->profiles);
213222
if (shape->n_profiles == 0) {
214223
// allowed for testing.
215224
continue;
@@ -304,9 +313,20 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
304313
for (int i = 0; i < tune->n_shapes; i++) {
305314
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
306315

307-
struct ggml_task_profile *builtin_profiles = NULL;
308-
int n_profiles = ggml_mulmat_get_task_profiles(
309-
NULL, shape->src0_type, shape->src1_type, &builtin_profiles);
316+
struct ggml_tensor src0 = {
317+
.type = shape->src0_type,
318+
};
319+
struct ggml_tensor src1 = {
320+
.type = shape->src1_type,
321+
};
322+
struct ggml_tensor node = {
323+
.op = GGML_OP_MUL_MAT,
324+
.src0 = &src0,
325+
.src1 = &src1,
326+
};
327+
328+
struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES];
329+
int n_profiles = ggml_get_task_profiles(&node, builtin_profiles);
310330

311331
if (n_profiles != shape->n_profiles) {
312332
snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
@@ -382,13 +402,6 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
382402
memset(shape->items, 0, item_size);
383403
}
384404

385-
{
386-
size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles;
387-
shape->profiles = malloc(sz);
388-
GGML_ASSERT(shape->profiles);
389-
memset(shape->profiles, 0, sz);
390-
}
391-
392405
for (int ip = 0; ip < shape->n_profiles; ip++) {
393406
struct ggml_task_profile *profile = &shape->profiles[ip];
394407
for (int j = 0; j < 3; j++) {
@@ -468,7 +481,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
468481
}
469482
}
470483

471-
struct ggml_task_profile *profile = &shape->profiles[ip];
484+
const struct ggml_task_profile *profile = &shape->profiles[ip];
472485
for (int k = 0; k < 3; k++) {
473486
if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) {
474487
rc = fprintf(fp, "%9d", item->stages_time[k]);
@@ -537,7 +550,7 @@ void ggml_mulmat_tune_estimate_time(
537550
const int max_m = shape->items[m_num - 1].M;
538551

539552
for (int ip = 0; ip < shape->n_profiles; ip++) {
540-
struct ggml_task_profile *profile = &shape->profiles[ip];
553+
const struct ggml_task_profile *profile = &shape->profiles[ip];
541554
profile_time[ip].total_time = 0;
542555
profile_time[ip].profile = profile;
543556

@@ -573,7 +586,7 @@ void ggml_mulmat_tune_estimate_time(
573586
GGML_ASSERT(p0 && p1);
574587

575588
for (int i_stage = 0; i_stage < 3; i_stage++) {
576-
struct ggml_task_stage *stage = &profile->stages[i_stage];
589+
const struct ggml_task_stage *stage = &profile->stages[i_stage];
577590
if (stage->backend == GGML_TASK_BACKEND_NONE) {
578591
continue;
579592
}
@@ -736,7 +749,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
736749
return false;
737750
}
738751

739-
bool ok = ggml_mulmat_tune_init(tune, params, NULL);
752+
bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
740753
if (!ok) {
741754
return false;
742755
}

ggml-tune.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ struct ggml_mulmat_tune_shape {
4646
enum ggml_type src1_type;
4747

4848
int n_profiles;
49-
struct ggml_task_profile *profiles;
49+
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
5050

5151
int m_num;
5252
int *arr_m;
@@ -69,7 +69,7 @@ struct ggml_mulmat_tune {
6969
};
7070

7171
struct ggml_mulmat_tune_time {
72-
struct ggml_task_profile *profile;
72+
const struct ggml_task_profile *profile;
7373
int stage_time[3];
7474
int total_time;
7575
};
@@ -78,7 +78,7 @@ struct mm_cache_element {
7878
int M;
7979
int N;
8080
int K;
81-
struct ggml_task_profile *profile;
81+
const struct ggml_task_profile *profile;
8282
int stages_time[3];
8383
};
8484

@@ -108,7 +108,7 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
108108

109109
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
110110
struct ggml_mulmat_tune_params *params,
111-
struct ggml_task_profile_factory *profile_factory);
111+
ggml_task_profiles_provider *profiles_provider);
112112

113113
void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);
114114

0 commit comments

Comments
 (0)