Skip to content

Commit 48016f6

Browse files
committed
bulk refactored task profile to support complete fallback; enable tune by default for ease of dev
1 parent 1b041d7 commit 48016f6

File tree

15 files changed

+457
-371
lines changed

15 files changed

+457
-371
lines changed

CMakeLists.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants"
7878
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
7979
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
8080
option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
81-
option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF)
81+
option(LLAMA_TUNE "llama: mulmat tune" ON)
8282

8383
#
8484
# Build info header
@@ -278,9 +278,9 @@ if (LLAMA_METAL)
278278
)
279279
endif()
280280

281-
if (LLAMA_MULMAT_TUNE)
282-
add_compile_definitions(GGML_USE_MULMAT_TUNE)
283-
add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG)
281+
if (LLAMA_TUNE)
282+
add_compile_definitions(GGML_USE_TUNE)
283+
add_compile_definitions(GGML_TUNE_NDEBUG)
284284
endif()
285285

286286
if (LLAMA_K_QUANTS)

Makefile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -231,14 +231,14 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
231231
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
232232
endif
233233

234-
ifdef LLAMA_NO_K_QUANTS
234+
ifndef LLAMA_NO_K_QUANTS
235235
k_quants.o: k_quants.c k_quants.h
236236
$(CC) $(CFLAGS) -c $< -o $@
237237
endif # LLAMA_NO_K_QUANTS
238238

239-
ifdef LLAMA_MULMAT_TUNE
240-
CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG
241-
CXXFLAGS += -DGGML_USE_MULMAT_TUNE
239+
ifndef LLAMA_NO_TUNE
240+
CFLAGS += -DGGML_USE_TUNE -DGGML_TUNE_NDEBUG
241+
CXXFLAGS += -DGGML_USE_TUNE
242242
endif
243243

244244
#

examples/common.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
345345
params.mem_test = true;
346346
} else if (arg == "--export") {
347347
params.export_cgraph = true;
348-
#ifdef GGML_USE_MULMAT_TUNE
348+
#ifdef GGML_USE_TUNE
349349
} else if (arg == "--tune") {
350350
params.tune = true;
351351
} else if (arg == "--tune-file") {
@@ -354,7 +354,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
354354
break;
355355
}
356356
params.tune_file = argv[i];
357-
#endif // GGML_USE_MULMAT_TUNE
357+
#endif // GGML_USE_TUNE
358358
} else if (arg == "--verbose-prompt") {
359359
params.verbose_prompt = true;
360360
} else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -508,7 +508,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
508508
#endif
509509
fprintf(stderr, " --mtest compute maximum memory usage\n");
510510
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
511-
#ifdef GGML_USE_MULMAT_TUNE
511+
#ifdef GGML_USE_TUNE
512512
fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n");
513513
fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n");
514514
#endif

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ int main(int argc, char ** argv) {
117117
return 1;
118118
}
119119

120-
#ifdef GGML_USE_MULMAT_TUNE
120+
#ifdef GGML_USE_TUNE
121121
if (params.tune || !params.tune_file.empty()) {
122122
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
123123
if (!ok || (params.tune && !params.tune_file.empty())) {

examples/mulmat-tune/README.md

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ run bench ahead of time (saving tens of seconds), but there are two shortcomings
2323

2424
Makefile:
2525
```
26-
make clean && LLAMA_MULMAT_TUNE=1 make
26+
make clean && make
2727
```
2828

2929
CMake (with BLAS):
3030
```
3131
cmake --build . --target clean
32-
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
32+
cmake .. -DLLAMA_BLAS=ON
3333
cmake --build . --config Release
3434
```
3535

@@ -52,13 +52,13 @@ Run examples:
5252

5353
Makefile:
5454
```
55-
make clean && LLAMA_MULMAT_TUNE=1 make
55+
make clean && make
5656
```
5757

5858
CMake (with BLAS)
5959
```
6060
cmake --build . --target clean
61-
cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
61+
cmake .. -DLLAMA_BLAS=ON
6262
cmake --build . --config Release
6363
```
6464

@@ -103,22 +103,29 @@ setup properly.
103103
General steps:
104104

105105
1. run `./mulmat-tune -h` to see how to build for misc vendors.
106-
you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the the debug, e.g:
106+
To enable debugging, comment out `-DGGML_TUNE_NDEBUG` in the Makefile, then run:
107+
107108
```
108-
make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make
109+
make clean; make
109110
```
111+
110112
On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along
111113
with `CUDA` or `CL`, you may not see `CUDA` or `CL` from debug because `CPU`
112-
or `CPU_BLAS` is more faster (as of the estimation from mulmat tune).
114+
or `CPU_BLAS` is faster (according to the mulmat tune estimation); try running
115+
with `-t 1`.
113116
2. create a small prompt file:
117+
114118
```
115119
head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw
116120
```
121+
117122
3. run any of the following example commands.
123+
118124
```
119125
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32
120126
./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64
121127
```
128+
122129
* `--mlock` is recommended for `macOS`, you may not want to use it.
123130
* don't change `-c 128`: a context size that is too large causes a 0-perplexity chunk.
124131
* `-t` is the number of threads, recommend `1`, `2`, `4` or `6`.

examples/mulmat-tune/mulmat-tune.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -262,8 +262,6 @@ int main(int argc, char **argv) {
262262
struct ggml_mulmat_tune_params params;
263263
memset(&params, 0, sizeof(struct ggml_mulmat_tune_params));
264264

265-
ggml_mulmat_init_task_profiles();
266-
267265
ggml_mulmat_tune_model_init(&params.model, model_name, ftype);
268266
params.m_num = m_num;
269267
params.n_pass = n_pass;

examples/perplexity/perplexity.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
158158
return 1;
159159
}
160160

161-
#ifdef GGML_USE_MULMAT_TUNE
161+
#ifdef GGML_USE_TUNE
162162
if (params.tune || !params.tune_file.empty()){
163163
bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str());
164164
if (!ok || (params.tune && !params.tune_file.empty())) {

ggml-threading.c

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
394394
enum ggml_compute_error err =
395395
shared->task_runner(&state->params, state->node);
396396

397-
GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);
397+
GGML_ASSERT(err == GGML_COMPUTE_OK);
398398

399399
ggml_spin_lock(&shared->spin);
400400

@@ -433,7 +433,11 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
433433

434434
// This is the params for main thread.
435435
struct ggml_compute_params params;
436-
enum ggml_compute_error err;
436+
enum ggml_compute_error err = GGML_COMPUTE_OK;
437+
438+
START:
439+
440+
memset(&params, 0, sizeof(struct ggml_compute_params));
437441

438442
for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
439443
if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) {
@@ -504,11 +508,19 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
504508
}
505509

506510
if (err != GGML_COMPUTE_OK) {
511+
if (err == GGML_COMPUTE_FALLBACK) {
512+
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
513+
int n = ggml_get_task_profiles(node, profiles);
514+
GGML_ASSERT(n > 0);
515+
memcpy(&node->task_profile, &profiles[0],
516+
sizeof(struct ggml_task_profile));
517+
goto START;
518+
}
507519
return err;
508520
}
509521
}
510522

511-
return GGML_COMPUTE_OK;
523+
return err;
512524
}
513525

514526
struct ggml_threading_context *

ggml-tune.c

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
5555

5656
struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0};
5757

58-
struct ggml_task_profile *prof = NULL;
58+
const struct ggml_task_profile *prof = NULL;
5959

6060
if (e->M == M && e->N == N && e->K == K) {
6161
prof = e->profile;
@@ -97,10 +97,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
9797
e->N = N;
9898
e->K = K;
9999

100-
// to disable this, build with
101-
// `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1
102-
// make`
103-
#if !defined(GGML_MULMAT_TUNE_NDEBUG)
100+
#ifndef GGML_TUNE_NDEBUG
104101
const char *names[3];
105102
for (int i = 0; i < 3; i++) {
106103
names[i] = ggml_mulmat_tune_task_backend_name(
@@ -163,8 +160,8 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
163160

164161
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
165162
struct ggml_mulmat_tune_params *params,
166-
struct ggml_task_profile_factory *pf) {
167-
163+
ggml_task_profiles_provider *profiles_provider) {
164+
GGML_ASSERT(profiles_provider);
168165
struct ggml_mulmat_tune_model *model = &params->model;
169166

170167
memset(tune, 0, sizeof(struct ggml_mulmat_tune));
@@ -208,8 +205,20 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
208205

209206
for (int i = 0; i < tune->n_shapes; i++) {
210207
struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
211-
shape->n_profiles = ggml_mulmat_get_task_profiles(
212-
pf, shape->src0_type, shape->src1_type, &shape->profiles);
208+
209+
struct ggml_tensor src0 = {
210+
.type = shape->src0_type,
211+
};
212+
struct ggml_tensor src1 = {
213+
.type = shape->src1_type,
214+
};
215+
struct ggml_tensor node = {
216+
.op = GGML_OP_MUL_MAT,
217+
.src0 = &src0,
218+
.src1 = &src1,
219+
};
220+
221+
shape->n_profiles = profiles_provider(&node, shape->profiles);
213222
if (shape->n_profiles == 0) {
214223
// allowed for testing.
215224
continue;
@@ -304,9 +313,20 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
304313
for (int i = 0; i < tune->n_shapes; i++) {
305314
const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
306315

307-
struct ggml_task_profile *builtin_profiles = NULL;
308-
int n_profiles = ggml_mulmat_get_task_profiles(
309-
NULL, shape->src0_type, shape->src1_type, &builtin_profiles);
316+
struct ggml_tensor src0 = {
317+
.type = shape->src0_type,
318+
};
319+
struct ggml_tensor src1 = {
320+
.type = shape->src1_type,
321+
};
322+
struct ggml_tensor node = {
323+
.op = GGML_OP_MUL_MAT,
324+
.src0 = &src0,
325+
.src1 = &src1,
326+
};
327+
328+
struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES];
329+
int n_profiles = ggml_get_task_profiles(&node, builtin_profiles);
310330

311331
if (n_profiles != shape->n_profiles) {
312332
snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
@@ -382,13 +402,6 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
382402
memset(shape->items, 0, item_size);
383403
}
384404

385-
{
386-
size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles;
387-
shape->profiles = malloc(sz);
388-
GGML_ASSERT(shape->profiles);
389-
memset(shape->profiles, 0, sz);
390-
}
391-
392405
for (int ip = 0; ip < shape->n_profiles; ip++) {
393406
struct ggml_task_profile *profile = &shape->profiles[ip];
394407
for (int j = 0; j < 3; j++) {
@@ -468,7 +481,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune,
468481
}
469482
}
470483

471-
struct ggml_task_profile *profile = &shape->profiles[ip];
484+
const struct ggml_task_profile *profile = &shape->profiles[ip];
472485
for (int k = 0; k < 3; k++) {
473486
if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) {
474487
rc = fprintf(fp, "%9d", item->stages_time[k]);
@@ -537,7 +550,7 @@ void ggml_mulmat_tune_estimate_time(
537550
const int max_m = shape->items[m_num - 1].M;
538551

539552
for (int ip = 0; ip < shape->n_profiles; ip++) {
540-
struct ggml_task_profile *profile = &shape->profiles[ip];
553+
const struct ggml_task_profile *profile = &shape->profiles[ip];
541554
profile_time[ip].total_time = 0;
542555
profile_time[ip].profile = profile;
543556

@@ -573,7 +586,7 @@ void ggml_mulmat_tune_estimate_time(
573586
GGML_ASSERT(p0 && p1);
574587

575588
for (int i_stage = 0; i_stage < 3; i_stage++) {
576-
struct ggml_task_stage *stage = &profile->stages[i_stage];
589+
const struct ggml_task_stage *stage = &profile->stages[i_stage];
577590
if (stage->backend == GGML_TASK_BACKEND_NONE) {
578591
continue;
579592
}
@@ -736,7 +749,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
736749
return false;
737750
}
738751

739-
bool ok = ggml_mulmat_tune_init(tune, params, NULL);
752+
bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
740753
if (!ok) {
741754
return false;
742755
}

ggml-tune.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ struct ggml_mulmat_tune_shape {
4646
enum ggml_type src1_type;
4747

4848
int n_profiles;
49-
struct ggml_task_profile *profiles;
49+
struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
5050

5151
int m_num;
5252
int *arr_m;
@@ -69,7 +69,7 @@ struct ggml_mulmat_tune {
6969
};
7070

7171
struct ggml_mulmat_tune_time {
72-
struct ggml_task_profile *profile;
72+
const struct ggml_task_profile *profile;
7373
int stage_time[3];
7474
int total_time;
7575
};
@@ -78,7 +78,7 @@ struct mm_cache_element {
7878
int M;
7979
int N;
8080
int K;
81-
struct ggml_task_profile *profile;
81+
const struct ggml_task_profile *profile;
8282
int stages_time[3];
8383
};
8484

@@ -108,7 +108,7 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
108108

109109
bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
110110
struct ggml_mulmat_tune_params *params,
111-
struct ggml_task_profile_factory *profile_factory);
111+
ggml_task_profiles_provider *profiles_provider);
112112

113113
void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune);
114114

0 commit comments

Comments
 (0)