
Commit 6b83a3e

Try to make CL run without tuning, but with -ngl it gets stuck and produces no output. Had to add a task runner and a profile id; many changes, see the code.

1 parent 5342dc0 commit 6b83a3e
File tree

10 files changed: 433 additions, 279 deletions

examples/mulmat-tune/mulmat-tune.cpp

Lines changed: 23 additions & 4 deletions

@@ -11,6 +11,10 @@
 
 #define UNUSED(x) (void)(x)
 
+// F16 has an pending Illegal Instruction error on macos-latest-cmake.
+// So the workaround is to disable non-quantized ftypes.
+// #define SUPPORT_NONE_Q_TYPE 1
+
 static void print_build_tips(void) {
     const char *a = "LLAMA_NO_ACCELERATE";
     fprintf(stderr, "Tips on how to build with various backend vendors:\n\n");
@@ -62,11 +66,12 @@ static void usage(char *prog) {
         "--model MODEL 3B | 7B | 13B | 30B | 65B",
         "              default 7B",
         "--ftype FTYPE ggml ftype:",
+#ifdef SUPPORT_NONE_Q_TYPE
         "              0: all F32",
         "              1: mostly F16",
+#endif
         "              2: mostly Q4_0",
         "              3: mostly Q4_1",
-        "              4: mostly Q4_1, some F16",
         "              7: mostly Q8_0",
         "              8: mostly Q5_0",
         "              9: mostly Q5_1",
@@ -84,7 +89,7 @@ static void usage(char *prog) {
         "              requires: between [1, 3]",
         "--n_threads NTH bench with this number of threads",
         "              requires: between [1, 16]",
-        "              default 1",
+        "              default 4",
         "--file FILE   data file to write",
         "              default stdout",
         "-y            always answer \"yes\" to all prompts",
@@ -170,8 +175,22 @@ int main(int argc, char **argv) {
            ftype = (enum ggml_ftype)v;
        }
 
+#ifndef SUPPORT_NONE_Q_TYPE
        if (ftype == GGML_FTYPE_ALL_F32 || ftype == GGML_FTYPE_MOSTLY_F16) {
-            fprintf(stderr, "none quantized type %d is not supported\n", ftype);
+            fprintf(stderr, "error: none quantized type %d is not supported\n",
+                    ftype);
+            return 1;
+        }
+#endif
+
+        bool cond_1 = ftype >= GGML_FTYPE_MOSTLY_Q4_0 &&
+                      ftype <= GGML_FTYPE_MOSTLY_Q4_1;
+        bool cond_2 =
+            ftype >= GGML_FTYPE_MOSTLY_Q8_0 && ftype <= GGML_FTYPE_MOSTLY_Q6_K;
+
+        if (!(cond_1 || cond_2)) {
+            fprintf(stderr, "error: type %d is not a known ggml ftype.\n",
+                    ftype);
            return 1;
        }
    }
@@ -223,7 +242,7 @@ int main(int argc, char **argv) {
        }
    }
 
-    int n_threads = 1;
+    int n_threads = 4;
    {
        if (arg_n_threads != NULL) {
            int v = atoi(arg_n_threads);
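
Note: the new check only accepts ftypes in the two quantized ranges [Q4_0, Q4_1] and [Q8_0, Q6_K]. Below is a minimal standalone sketch of that check; the enum values are assumed to mirror ggml.h (the bench itself uses ggml's real enum ggml_ftype, not this local copy).

/* Sketch only: standalone version of the accepted-ftype check.
 * Enum values are assumptions based on ggml.h, not authoritative. */
#include <stdbool.h>
#include <stdio.h>

enum sketch_ftype {
    SKETCH_FTYPE_MOSTLY_Q4_0 = 2,
    SKETCH_FTYPE_MOSTLY_Q4_1 = 3,
    /* 4 (Q4_1 with some F16) falls outside both ranges on purpose. */
    SKETCH_FTYPE_MOSTLY_Q8_0 = 7,
    SKETCH_FTYPE_MOSTLY_Q6_K = 14, /* assumed: last K-quant ftype */
};

/* Accept only [Q4_0, Q4_1] or [Q8_0, Q6_K]. */
static bool ftype_is_supported(int ftype) {
    bool cond_1 = ftype >= SKETCH_FTYPE_MOSTLY_Q4_0 &&
                  ftype <= SKETCH_FTYPE_MOSTLY_Q4_1;
    bool cond_2 = ftype >= SKETCH_FTYPE_MOSTLY_Q8_0 &&
                  ftype <= SKETCH_FTYPE_MOSTLY_Q6_K;
    return cond_1 || cond_2;
}

int main(void) {
    for (int t = 0; t <= 15; t++) {
        printf("ftype %2d -> %s\n", t, ftype_is_supported(t) ? "ok" : "rejected");
    }
    return 0;
}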

ggml-opencl.cpp

Lines changed: 1 addition & 1 deletion

@@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_
 }
 
 void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
-    // GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
+    GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
 
     if (src0->type == GGML_TYPE_F32) {
        ggml_cl_mul_mat_f32(src0, src1, dst);

ggml-threading.c

Lines changed: 28 additions & 9 deletions

@@ -170,7 +170,8 @@ struct ggml_compute_state_shared {
    atomic_bool wait_on_done;
    atomic_bool stop;
 
-    ggml_threading_task_runner *task_runner;
+    // Default task runner, can be overriden by node.task_profile.runner.
+    ggml_task_runner *task_runner;
 
    struct ggml_threading_context *ctx;
 };
@@ -391,8 +392,10 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
        }
 
        if (shared->n_tasks > 0 && state->has_work) {
-            enum ggml_compute_error err =
-                shared->task_runner(&state->params, state->node);
+            ggml_task_runner *runner = state->node->task_profile.runner
+                                           ? state->node->task_profile.runner
+                                           : shared->task_runner;
+            enum ggml_compute_error err = runner(&state->params, state->node);
 
            GGML_ASSERT(err == GGML_COMPUTE_OK);
 
@@ -427,8 +430,13 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
                              size_t wsize) {
    GGML_ASSERT(ctx);
    GGML_ASSERT(node);
-
    GGML_ASSERT(ctx->shared.task_runner);
+
+    ggml_task_runner *runner = ctx->shared.task_runner;
+    if (node->task_profile.runner) {
+        runner = node->task_profile.runner;
+    }
+
    struct ggml_compute_state_shared *state_shared = &ctx->shared;
 
    // This is the params for main thread.
@@ -491,7 +499,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
        params.wsize = wsize;
        params.wdata = wdata;
 
-        err = state_shared->task_runner(&params, node);
+        err = runner(&params, node);
    }
 
    // wait for tasks done.
@@ -509,11 +517,21 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
 
    if (err != GGML_COMPUTE_OK) {
        if (err == GGML_COMPUTE_FALLBACK) {
+            PRINT_DEBUG("[main] fallback from profile, id=%d\n",
+                        node->task_profile.id);
+            GGML_ASSERT(node->task_profile.stages[1].backend >
+                        GGML_TASK_BACKEND_CPU);
+
            struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
            int n = ggml_get_task_profiles(node, profiles);
            GGML_ASSERT(n > 0);
+            GGML_ASSERT(profiles[0].stages[1].backend ==
+                        GGML_TASK_BACKEND_CPU);
+
            memcpy(&node->task_profile, &profiles[0],
                   sizeof(struct ggml_task_profile));
+            runner = ctx->shared.task_runner;
+
            goto START;
        }
        return err;
@@ -525,12 +543,13 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
 
 struct ggml_threading_context *
 ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner,
-                     ggml_threading_task_runner *task_stage_runner,
+                     ggml_task_runner *task_runner,
                     enum ggml_threading_features features,
                     int64_t stages_time[3]) {
    GGML_ASSERT(n_threads > 0);
-    GGML_ASSERT(thread_runner);
-    GGML_ASSERT(task_stage_runner);
+    if (thread_runner == NULL) {
+        thread_runner = ggml_threading_graph_compute_thread;
+    }
 
    size_t ctx_sz = sizeof(struct ggml_threading_context);
    struct ggml_threading_context *ctx = malloc(ctx_sz);
@@ -545,7 +564,7 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner,
        .wait_now = false,
        .wait_on_done = false,
        .stop = false,
-        .task_runner = task_stage_runner,
+        .task_runner = task_runner,
        .ctx = ctx,
    };
 
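
Note: the core change here is that a node's task profile may now carry its own runner, with the shared task_runner kept as the default; on GGML_COMPUTE_FALLBACK the node is rewritten to the CPU-only profile and the default runner is restored. A minimal sketch of the selection pattern follows, with hypothetical stand-in types rather than ggml's real structs.

/* Sketch only: per-node runner override with a shared default.
 * The types below are stand-ins, not ggml's structs. */
#include <stddef.h>
#include <stdio.h>

typedef int (task_runner_t)(const char *what);

struct profile_sketch { task_runner_t *runner; };      /* cf. node.task_profile.runner */
struct node_sketch    { struct profile_sketch task_profile; };
struct shared_sketch  { task_runner_t *task_runner; }; /* default runner */

static int default_runner(const char *what)  { printf("default runner: %s\n", what);  return 0; }
static int override_runner(const char *what) { printf("override runner: %s\n", what); return 0; }

/* Prefer the node's profile runner; fall back to the shared default. */
static int run_node(struct shared_sketch *shared, struct node_sketch *node) {
    task_runner_t *runner = node->task_profile.runner
                                ? node->task_profile.runner
                                : shared->task_runner;
    return runner("mul_mat");
}

int main(void) {
    struct shared_sketch shared  = { default_runner };
    struct node_sketch   plain   = { { NULL } };
    struct node_sketch   special = { { override_runner } };
    run_node(&shared, &plain);   /* prints "default runner: mul_mat"  */
    run_node(&shared, &special); /* prints "override runner: mul_mat" */
    return 0;
}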

ggml-threading.h

Lines changed: 8 additions & 14 deletions

@@ -21,27 +21,21 @@ enum ggml_threading_features {
    GGML_THREADING_FEATURE_PERF = 1 << 1,
 };
 
-// Compute errors.
-enum ggml_compute_error {
-    GGML_COMPUTE_OK = 0,
-    GGML_COMPUTE_FALLBACK = 1,
-};
-
-// The task runner to be called by main thread and workers.
-typedef enum ggml_compute_error(ggml_threading_task_runner)(
-    struct ggml_compute_params *params, struct ggml_tensor *node);
-
 // The thread runner to feed into OS threads.
 typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data);
 
 // Init and start underlying workers if n_threads > 1.
 //
-// features: optional for configure threading additional features.
-// see `ggml_threading_feature`, default 0.
+// thread: optional OS thread runner, default value:
+// `ggml_threading_graph_compute_thread`.
+//
+// features: optional for configure
+// threading additional features. see `ggml_threading_feature`, default 0.
+//
 // stages_time: optional for collecting per-stage wall clock time.
 struct ggml_threading_context *
 ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread,
-                     ggml_threading_task_runner *task_stage_runner,
+                     ggml_task_runner *task_runner,
                     enum ggml_threading_features features,
                     int64_t stages_time[3]);
 
@@ -60,7 +54,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
 
 // This is an experimental functionality for mulmat tune, as a thin wrapper.
 enum ggml_compute_error
-ggml_compute_forward_wrapper(struct ggml_compute_params *params,
+ggml_compute_forward_wrapper(const struct ggml_compute_params *params,
                             struct ggml_tensor *tensor);
 
 #ifdef __cplusplus
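
Note: with `ggml_threading_task_runner` and `enum ggml_compute_error` removed from this header, callers pass a `ggml_task_runner` (assumed to now be declared elsewhere, e.g. ggml.h) and may pass NULL for the thread runner to get the default. A rough usage sketch, mirroring the updated call site in ggml-tune.c; include paths, error handling, and shutdown are omitted.

/* Sketch only: starting the threading context after this API change.
 * Assumes ggml-threading.h and its dependencies are on the include path. */
#include "ggml-threading.h"

static struct ggml_threading_context *start_for_tune(int n_threads,
                                                      int64_t stages_time[3]) {
    return ggml_threading_start(
        n_threads,
        /* thread runner */ NULL, /* default: ggml_threading_graph_compute_thread */
        /* task runner   */ ggml_compute_forward_wrapper,
        GGML_THREADING_FEATURE_WAIT_ON_DONE,
        stages_time);
}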

ggml-tune.c

Lines changed: 55 additions & 15 deletions

@@ -44,9 +44,12 @@ ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) {
    }
 }
 
-const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
-    struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t,
-    enum ggml_type src1_t, int stages_time[3]) {
+// NOTE: we can not use the profile from tune because the profiles do not
+// contain fields such as runner, get_size.
+int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M,
+                                         int N, int K, enum ggml_type src0_t,
+                                         enum ggml_type src1_t,
+                                         int stages_time[3]) {
    GGML_ASSERT(tune);
 
    // TODO: default_mm_cache is thread-unsafe.
@@ -103,15 +106,15 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
                    names[i] = ggml_mulmat_tune_task_backend_name(
                        prof->stages[i].backend);
                }
-                printf("\n[tune] M: %3d, N: %5d, K: %5d, backends of the "
-                       "fastest profile: %s %s %s\n",
-                       M, N, K, names[0], names[1], names[2]);
+                printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, "
+                       "backends: %s %s %s\n",
+                       M, N, K, prof->id, names[0], names[1], names[2]);
 #endif
            }
        }
    }
 
-    return prof;
+    return prof->id;
 }
 
 void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model,
@@ -264,10 +267,13 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) {
        if (shape->m_num > 0) {
            if (shape->arr_m) {
                free(shape->arr_m);
+                shape->arr_m = NULL;
            }
            if (shape->items) {
                free(shape->items);
+                shape->items = NULL;
            }
+            shape->m_num = 0;
        }
    }
 }
@@ -277,6 +283,11 @@ static bool ggml_mulmat_tune_write_profiles(
    int rc;
    for (int i = 0; i < n_profiles; i++) {
        const struct ggml_task_profile *profile = &profiles[i];
+        rc = fprintf(fp, "%d ", profile->id);
+        if (rc <= 0) {
+            return false;
+        }
+
        for (int j = 0; j < 3; j++) {
            const struct ggml_task_stage *ts = &profile->stages[j];
            rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0,
@@ -304,7 +315,6 @@ static bool
 ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
                                   const char *model, int ftype, int n_threads,
                                   char *errbuf, int errbuf_len) {
-
    if (tune->version != GGML_MULMAT_TUNE_VERSION) {
        snprintf(errbuf, errbuf_len - 1,
                 "version mismatch, built-in: %d, "
@@ -348,14 +358,28 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
    int n_profiles = ggml_get_task_profiles(&node, builtin_profiles);
 
    if (n_profiles != shape->n_profiles) {
-        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(n_profiles)");
+        snprintf(errbuf, errbuf_len - 1,
+                 "task profiles mismatch (n_profiles)");
        return false;
    }
 
    // TODO: profiles order is relevant, too strict.
-    size_t sz = sizeof(struct ggml_task_profile) * n_profiles;
-    if (memcmp(builtin_profiles, shape->profiles, sz) != 0) {
-        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(profiles)");
+    // Only validate stages!
+    size_t sz = sizeof(struct ggml_task_stage) * 3;
+    bool matched = true;
+    for (int j = 0; j < n_profiles; j++) {
+        if (builtin_profiles[j].id != shape->profiles[j].id) {
+            return false;
+        }
+        if (memcmp(builtin_profiles[j].stages, shape->profiles[j].stages,
+                   sz) != 0) {
+            matched = false;
+            break;
+        }
+    }
+    if (!matched) {
+        snprintf(errbuf, errbuf_len - 1,
+                 "task profiles mismatch (profiles)");
 
        printf("=== built-in profiles:\n");
        ggml_mulmat_tune_write_profiles(stderr, builtin_profiles,
@@ -426,6 +450,12 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
 
        for (int ip = 0; ip < shape->n_profiles; ip++) {
            struct ggml_task_profile *profile = &shape->profiles[ip];
+
+            rc = fscanf(fp, "%d ", &profile->id);
+            if (rc <= 0) {
+                return false;
+            }
+
            for (int j = 0; j < 3; j++) {
                struct ggml_task_stage *ts = &profile->stages[j];
                int backend;
@@ -777,6 +807,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
    GGML_ASSERT(params);
    GGML_ASSERT(params->model.name);
 
+    memset(tune, 0, sizeof(struct ggml_mulmat_tune));
+
    enum ggml_task_backend backends[16];
    int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends);
    if (n_backends < 2) {
@@ -785,6 +817,15 @@
        return false;
    }
 
+    if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K &&
+        params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) {
+#if defined(GGML_USE_CLBLAST)
+        printf("[tune] error: cl implementation does not support k_quants at "
+               "the time of writing this code, skip.\n");
+        return false;
+#endif
+    }
+
    bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles);
    if (!ok) {
        return false;
@@ -816,9 +857,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
    int64_t t0 = ggml_time_ms();
 
    struct ggml_threading_context *thrd_ctx = ggml_threading_start(
-        tune->n_threads, ggml_threading_graph_compute_thread,
-        ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE,
-        stages_time);
+        tune->n_threads, NULL, ggml_compute_forward_wrapper,
+        GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);
 
    for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
        const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
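
Note: two of the changes above have to agree with each other: ggml_mulmat_tune_write_profiles now emits the profile id before the three task stages, and ggml_mulmat_tune_read_data reads it back in the same order. A self-contained sketch of that id-first round trip follows, using hypothetical minimal structs rather than ggml's; the exact whitespace of the real file format may differ.

/* Sketch only: profile id written before the three stages, read back in the
 * same order. Stand-in structs; separators approximate the real format. */
#include <stdbool.h>
#include <stdio.h>

struct stage_sketch   { int backend; int parallel; int wait; };
struct profile_sketch { int id; struct stage_sketch stages[3]; };

static bool write_profile(FILE *fp, const struct profile_sketch *p) {
    if (fprintf(fp, "%d ", p->id) <= 0) return false; /* id first */
    for (int j = 0; j < 3; j++) {
        const struct stage_sketch *ts = &p->stages[j];
        if (fprintf(fp, "%2d %d %d ", ts->backend, ts->parallel, ts->wait) <= 0)
            return false;
    }
    return fprintf(fp, "\n") > 0;
}

static bool read_profile(FILE *fp, struct profile_sketch *p) {
    if (fscanf(fp, "%d", &p->id) != 1) return false; /* id first */
    for (int j = 0; j < 3; j++) {
        struct stage_sketch *ts = &p->stages[j];
        if (fscanf(fp, "%d %d %d", &ts->backend, &ts->parallel, &ts->wait) != 3)
            return false;
    }
    return true;
}

int main(void) {
    struct profile_sketch out = { 2, { { 1, 0, 0 }, { 3, 1, 0 }, { 0, 0, 0 } } };
    struct profile_sketch in  = { 0 };
    FILE *fp = tmpfile();
    if (fp == NULL || !write_profile(fp, &out)) return 1;
    rewind(fp);
    if (!read_profile(fp, &in)) return 1;
    printf("round trip: id=%d, stage1 backend=%d\n", in.id, in.stages[1].backend);
    fclose(fp);
    return 0;
}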
