
Commit 06b0082

Bulk refactoring of the task profile and related code to run CL GPU offloading.

* Removed ggml_task_backend in favour of ggml_task_profile.runner plus the newly added id and name fields.
* Extracted the mul_mat BLAS code into ggml_compute_forward_mul_mat_blas, aligning it a bit more with the CUDA/CL paths and making it easier to fix profiles and run tune.
* Rewrote the task profile and updated/added some CUDA/CL code, finally making CL GPU offloading work.
* Miscellaneous minor fixes/updates to tune; the data format was changed.
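For orientation while reading the diffs below, here is a minimal sketch of the task-profile shape this commit moves to, reconstructed only from the hunks on this page (the runner, id and name fields, and the valid/parallel/wait stage flags). The real declarations live in ggml.h / ggml-threading.h; field order, sizes and the exact runner signature are assumptions here, marked with a _sketch suffix.

```c
#include <stdbool.h>

struct ggml_compute_params;   // defined in ggml.h
struct ggml_tensor;           // defined in ggml.h

// Per-profile task runner; mirrors the runner(&state->params, state->node)
// call in ggml-threading.c (return type simplified to int in this sketch).
typedef int (ggml_task_runner_sketch)(const struct ggml_compute_params *params,
                                      struct ggml_tensor *node);

// One entry per stage: GGML_TASK_INIT, GGML_TASK_COMPUTE, GGML_TASK_FINALIZE.
struct ggml_task_stage_sketch {
    bool valid;     // replaces the old per-stage ggml_task_backend enum
    bool parallel;
    bool wait;
};

struct ggml_task_profile_sketch {
    int  id;                                  // id 1 is the pure-CPU profile used as fallback
    char name[16];                            // e.g. "CPU", "BLAS"
    struct ggml_task_stage_sketch stages[3];
    ggml_task_runner_sketch *runner;          // optional; overrides the shared task runner
};
```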
1 parent 6b83a3e commit 06b0082

15 files changed: +664 −764 lines changed

examples/mulmat-tune/README.md

Lines changed: 19 additions & 26 deletions
@@ -214,26 +214,19 @@ The following results are generated with Accelerate compiled.
 **Example**

 ```
-5 3B 2 6 1
-
-3200 3200 2 0 3 10
-16 0 0 0 16 1 0 1 0 0 0 0
-16 1 0 2 17 0 1 0 0 0 0 0
-0 0 0 0 34 0 1 0 0 0 0 0
-1 1 793 0 9103 2102 0 0 6014 0
-2 2 1591 0 8034 2305 0 0 30982 0
-4 4 2236 0 6476 2484 0 0 31388 0
-8 7 4161 0 6623 2389 0 0 29204 0
-16 15 8339 0 6434 2752 0 0 34303 0
-32 32 16919 0 6915 3651 0 0 42511 0
-64 200 34270 0 6574 4528 0 0 68212 0
-128 188 69400 0 6325 6839 0 0 74437 0
-256 303 134597 0 6168 11544 0 0 110180 0
-512 687 279685 0 6337 29712 0 0 159728 0
-
-3200 8640 2 0 2 10
-
-...
+[tune] done, elapsed time: 0 seconds.
+10 xB 12 4 2
+
+1024 1024 12 0 2 4
+100 110 000 1 CPU
+110 101 000 2 BLAS
+1 11 309 0 1234 90 0
+2 23 654 0 1359 215 0
+4 44 1283 0 1362 421 0
+8 85 2341 0 1357 347 0
+
+1024 2048 12 0 2 4
+...

 ```

@@ -249,17 +242,17 @@ shape+
 # head
 version: 1
 model: "3B" | "7B" | "13B" | "30B" | "65B"
-ggml_ftype: 0 - 4, 7 - 14
+ggml_ftype: 0 - 3, 7 - 14
 n_shapes: number of shapes
 n_threads: number of threads

-shape := N K m_num n_profiles
-task_conf_profile+
+shape := N K src0_ggml_type src1_ggml_type n_profiles m_num
+task_profile+
 bench_item+

-task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize)
-stage_conf: backend parallel wait
-backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL)
+task_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) id name
+stage_conf(bitmap): valid parallel wait
+valid: 0 (false) | 1 (true)
 parallel: 0 (false) | 1 (true)
 wait: 0 (false) | 1 (true)
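As an illustration of the grammar above, here is a small hedged sketch of how one task_profile record from this format (for example `110 101 000 2 BLAS`) could be parsed: each stage_conf is three digit characters read as the valid/parallel/wait flags. This is not the actual mulmat-tune parser, only a reading of the format described here; all names are illustrative.

```c
#include <stdbool.h>
#include <stdio.h>

// stage_conf(bitmap): valid parallel wait
struct stage_conf_sketch {
    bool valid;
    bool parallel;
    bool wait;
};

// task_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) id name
struct task_profile_sketch {
    struct stage_conf_sketch stages[3];
    int  id;
    char name[16];
};

static bool parse_stage_conf(const char *s, struct stage_conf_sketch *c) {
    if (!s || !s[0] || !s[1] || !s[2]) {
        return false;
    }
    c->valid    = (s[0] == '1');
    c->parallel = (s[1] == '1');
    c->wait     = (s[2] == '1');
    return true;
}

static bool parse_task_profile(const char *line, struct task_profile_sketch *p) {
    char conf[3][4];
    if (sscanf(line, "%3s %3s %3s %d %15s",
               conf[0], conf[1], conf[2], &p->id, p->name) != 5) {
        return false;
    }
    for (int i = 0; i < 3; i++) {
        if (!parse_stage_conf(conf[i], &p->stages[i])) {
            return false;
        }
    }
    return true;
}

int main(void) {
    struct task_profile_sketch p;
    if (parse_task_profile("110 101 000 2 BLAS", &p)) {
        printf("profile %d (%s): compute parallel=%d wait=%d\n",
               p.id, p.name, p.stages[1].parallel, p.stages[1].wait);
    }
    return 0;
}
```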

examples/mulmat-tune/mulmat-tune.cpp

Lines changed: 5 additions & 0 deletions
@@ -111,6 +111,11 @@ static void usage(char *prog) {
 }

 int main(int argc, char **argv) {
+    if (!ggml_cpu_has_blas()) {
+        fprintf(stderr, "error: this program is not built with BLAS.\n");
+        return 1;
+    }
+
     if (argc == 2) {
         if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
             usage(argv[0]);

ggml-cuda.cu

Lines changed: 13 additions & 12 deletions
@@ -2207,17 +2207,12 @@ void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
 }

-bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
+// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run.
+static bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
     // TODO: find the optimal values for these
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
-        dst->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
+        dst->type == GGML_TYPE_F32) {
         return true;
     }

@@ -2539,11 +2534,17 @@ void ggml_cuda_free_scratch() {
     g_scratch_buffer = nullptr;
 }

-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
-    ggml_cuda_func_t func;
-    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
+bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor);
+    GGML_ASSERT(tensor->src0);
+    return tensor->backend == GGML_BACKEND_GPU
         || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
         || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
+}
+
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+    ggml_cuda_func_t func;
+    const bool any_on_device = ggml_cuda_is_gpu_offloading(tensor);

     switch (tensor->op) {
         case GGML_OP_ADD:
@@ -2571,7 +2572,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             func = ggml_cuda_rms_norm;
             break;
         case GGML_OP_MUL_MAT:
-            if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) {
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
                 return false;
             }
             func = ggml_cuda_mul_mat;

ggml-cuda.h

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ void ggml_init_cublas(void);
 void ggml_cuda_set_tensor_split(const float * tensor_split);

 void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor);
 size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

ggml-opencl.cpp

Lines changed: 8 additions & 9 deletions
@@ -1589,18 +1589,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     }
 }

+bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor);
+    return (tensor->src0 && tensor->src0->backend == GGML_BACKEND_GPU) ||
+           (tensor->src1 && tensor->src1->backend == GGML_BACKEND_GPU);
+}

-bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
+// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run.
+static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
-        dst->type == GGML_TYPE_F32 /*&&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)*/) {
+        dst->type == GGML_TYPE_F32) {
         return true;
     }

ggml-opencl.h

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ extern "C" {
 void ggml_cl_init(void);

 void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor);
 size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

ggml-threading.c

Lines changed: 8 additions & 9 deletions
@@ -376,7 +376,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {

     struct ggml_compute_state_shared *shared = state->shared;
     GGML_ASSERT(shared);
-    GGML_ASSERT(shared->task_runner);
+    // GGML_ASSERT(shared->task_runner);

     shared->n_ready++;

@@ -397,7 +397,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
                              : shared->task_runner;
        enum ggml_compute_error err = runner(&state->params, state->node);

-       GGML_ASSERT(err == GGML_COMPUTE_OK);
+       GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK);

        ggml_spin_lock(&shared->spin);

@@ -430,7 +430,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
                               size_t wsize) {
     GGML_ASSERT(ctx);
     GGML_ASSERT(node);
-    GGML_ASSERT(ctx->shared.task_runner);
+    // GGML_ASSERT(ctx->shared.task_runner);

     ggml_task_runner *runner = ctx->shared.task_runner;
     if (node->task_profile.runner) {
@@ -448,7 +448,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
     memset(&params, 0, sizeof(struct ggml_compute_params));

     for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) {
-        if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) {
+        if (!node->task_profile.stages[type].valid) {
            continue;
        }

@@ -519,18 +519,17 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
        if (err == GGML_COMPUTE_FALLBACK) {
            PRINT_DEBUG("[main] fallback from profile, id=%d\n",
                        node->task_profile.id);
-           GGML_ASSERT(node->task_profile.stages[1].backend >
-                       GGML_TASK_BACKEND_CPU);
+           GGML_ASSERT(node->task_profile.id > 1);

            struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
            int n = ggml_get_task_profiles(node, profiles);
            GGML_ASSERT(n > 0);
-           GGML_ASSERT(profiles[0].stages[1].backend ==
-                       GGML_TASK_BACKEND_CPU);
+           GGML_ASSERT(profiles[0].id == 1);

            memcpy(&node->task_profile, &profiles[0],
-                  sizeof(struct ggml_task_profile));
+                  sizeof(struct ggml_task_profile));
            runner = ctx->shared.task_runner;
+           GGML_ASSERT(runner);

            goto START;
        }

ggml-threading.h

Lines changed: 3 additions & 7 deletions
@@ -29,7 +29,9 @@ typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data);
 // thread: optional OS thread runner, default value:
 //         `ggml_threading_graph_compute_thread`.
 //
-// features: optional for configure
+// task_runner: default task runner, nullable when tensor.runner is not NULL.
+//              Overridden by tensor.runner.
+// features: configure threading behaviour, optional.
 //           threading additional features. see `ggml_threading_feature`, default 0.
 //
 // stages_time: optional for collecting per-stage wall clock time.
@@ -51,12 +53,6 @@ enum ggml_compute_error
 ggml_threading_compute_tensor(struct ggml_threading_context *ctx,
                               struct ggml_tensor *node, void *wdata,
                               size_t wsize);
-
-// This is an experimental functionality for mulmat tune, as a thin wrapper.
-enum ggml_compute_error
-ggml_compute_forward_wrapper(const struct ggml_compute_params *params,
-                             struct ggml_tensor *tensor);
-
 #ifdef __cplusplus
 }
 #endif
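Taken together, the threading changes above amount to the following control flow: the per-node profile runner, if set, overrides the shared task runner, and a GGML_COMPUTE_FALLBACK result from a non-CPU profile (id > 1) resets the node to the first, pure-CPU profile and retries. Below is a hedged paraphrase, not the literal ggml-threading.c code; all names with a _sketch suffix are simplified stand-ins.

```c
// Simplified paraphrase of ggml_threading_compute_tensor()'s dispatch and
// fallback logic after this commit.
enum compute_error_sketch { COMPUTE_OK_SKETCH, COMPUTE_FALLBACK_SKETCH };

typedef enum compute_error_sketch (task_runner_sketch)(void *params, void *node);

struct node_sketch {
    struct {
        int id;                       // 1 == pure-CPU profile
        task_runner_sketch *runner;   // optional per-profile runner
    } task_profile;
};

enum compute_error_sketch
compute_tensor_sketch(struct node_sketch *node,
                      task_runner_sketch *shared_runner,
                      void *params) {
    // The per-profile runner, when present, overrides the shared task runner.
    task_runner_sketch *runner = shared_runner;
    if (node->task_profile.runner) {
        runner = node->task_profile.runner;
    }

START:;
    enum compute_error_sketch err = runner(params, node);
    if (err == COMPUTE_FALLBACK_SKETCH && node->task_profile.id > 1) {
        // Only non-CPU profiles (id > 1) may fall back: switch the node to the
        // first (CPU) profile and retry with the shared runner.
        node->task_profile.id     = 1;
        node->task_profile.runner = 0;
        runner = shared_runner;
        goto START;
    }
    return err;
}
```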
