
Commit 5342dc0

tuning: support k_quants; disabled RoPE shapes (workaround); make cache thread safe; fixed shape comparison
1 parent 21e9379 commit 5342dc0

File tree

4 files changed: +85 −42 lines


examples/mulmat-tune/mulmat-tune.cpp

Lines changed: 2 additions & 2 deletions
@@ -170,8 +170,8 @@ int main(int argc, char **argv) {
             ftype = (enum ggml_ftype)v;
         }
 
-        if (ftype > GGML_FTYPE_MOSTLY_Q5_1) {
-            fprintf(stderr, "k_quants type %d is not implemented\n", ftype);
+        if (ftype == GGML_FTYPE_ALL_F32 || ftype == GGML_FTYPE_MOSTLY_F16) {
+            fprintf(stderr, "none quantized type %d is not supported\n", ftype);
             return 1;
         }
     }
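For reference, the bench tool now rejects non-quantized ftypes up front instead of rejecting k-quants. A minimal standalone sketch of the same gate as a predicate (the helper name is hypothetical; the enum comes from ggml.h):

#include <stdbool.h>
#include "ggml.h"

// Hypothetical helper mirroring the new check in main(): only quantized
// ftypes are benchable, plain F32/F16 model files are rejected.
static bool mulmat_tune_ftype_is_benchable(enum ggml_ftype ftype) {
    return ftype != GGML_FTYPE_ALL_F32 && ftype != GGML_FTYPE_MOSTLY_F16;
}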

ggml-tune.c

Lines changed: 66 additions & 28 deletions
@@ -4,10 +4,11 @@
 #include "ggml-tune.h"
 #include "ggml.h"
 
-// MUL_MAT fine tunning for non-GPU-offloading cases.
+#ifdef GGML_USE_K_QUANTS
+#include "k_quants.h"
+#endif
 
-#define GGML_MULMAT_CACHE_LEN 16
-static struct mm_cache_element default_mm_cache[GGML_MULMAT_CACHE_LEN] = {0};
+// MUL_MAT fine tunning for non-GPU-offloading cases.
 
 #define FNV_OFFSET 14695981039346656037UL
 #define FNV_PRIME 1099511628211UL
@@ -49,9 +50,8 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
     GGML_ASSERT(tune);
 
     // TODO: default_mm_cache is thread-unsafe.
-    struct mm_cache_element *mm_cache = default_mm_cache;
     int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN;
-    struct mm_cache_element *e = &mm_cache[slot];
+    struct ggml_mulmat_tune_cache_ele *e = &tune->cache[slot];
 
     struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0};
 
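For reference, the cache slot is picked by hashing (M, N, K) and reducing it modulo GGML_MULMAT_CACHE_LEN; with the cache now embedded in the tune struct, each tuner instance keeps its own slots. The body of ggml_mulmat_tune_cache_hash is not part of this diff, so the FNV-1a style folding below is only an illustrative sketch built from the FNV constants kept above:

#include <stddef.h>
#include <stdint.h>

#define FNV_OFFSET 14695981039346656037UL
#define FNV_PRIME 1099511628211UL

// Illustrative FNV-1a over the raw bytes of (M, N, K); the real
// ggml_mulmat_tune_cache_hash may fold the ints differently.
static uint64_t mulmat_tune_cache_hash_sketch(int M, int N, int K) {
    int v[3] = {M, N, K};
    const unsigned char *p = (const unsigned char *)v;
    uint64_t h = FNV_OFFSET;
    for (size_t i = 0; i < sizeof(v); i++) {
        h ^= p[i];
        h *= FNV_PRIME;
    }
    return h;
}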

@@ -183,7 +183,7 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
 
     enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype);
 
-    GGML_ASSERT(GGML_MULMAT_N_SHAPES >= 6);
+    GGML_ASSERT(GGML_MULMAT_N_SHAPES == 4 || GGML_MULMAT_N_SHAPES == 6);
     tune->n_shapes = GGML_MULMAT_N_SHAPES;
 
     // Attention layers
@@ -196,11 +196,26 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
         .N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type};
     tune->shapes[3] = (struct ggml_mulmat_tune_shape){
         .N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type};
-    // RoPE
-    tune->shapes[4] = (struct ggml_mulmat_tune_shape){
-        .N = n_rot, .K = 0, .src0_type = rot_src0_type, .src1_type = src1_type};
-    tune->shapes[5] = (struct ggml_mulmat_tune_shape){
-        .N = 0, .K = n_rot, .src0_type = rot_src0_type, .src1_type = src1_type};
+
+    tune->n_shapes = GGML_MULMAT_N_SHAPES;
+
+    if (GGML_MULMAT_N_SHAPES == 6) {
+        // RoPE.
+        // - very small comparing to previous, almost no need to bench.
+        // - an Illegal instruction exception on Github (mac-latest-cmake).
+        // - CL sometimes throws error on localhost.
+        // So temporarily disabled as a workaround.
+        tune->shapes[4] =
+            (struct ggml_mulmat_tune_shape){.N = n_rot,
+                                            .K = 0,
+                                            .src0_type = rot_src0_type,
+                                            .src1_type = src1_type};
+        tune->shapes[5] =
+            (struct ggml_mulmat_tune_shape){.N = 0,
+                                            .K = n_rot,
+                                            .src0_type = rot_src0_type,
+                                            .src1_type = src1_type};
+    }
 
     for (int i = 0; i < tune->n_shapes; i++) {
         struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
@@ -225,6 +240,7 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
 
         shape->m_num = params->m_num;
         shape->arr_m = malloc(shape->m_num * sizeof(int));
+        GGML_ASSERT(shape->arr_m);
         for (int j = 0; j < shape->m_num; j++) {
             shape->arr_m[j] = 1 << j;
         }
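For context, each shape is benchmarked over a doubling series of M values, arr_m[j] = 1 << j. A minimal standalone sketch of that series, assuming m_num = 4 as in tests/test-ggml-tune.c:

#include <stdio.h>

// Prints the M values the bench sweeps for one shape: 1, 2, 4, 8 when
// m_num = 4. A larger m_num simply extends the doubling series.
int main(void) {
    const int m_num = 4;
    for (int j = 0; j < m_num; j++) {
        printf("M[%d] = %d\n", j, 1 << j);
    }
    return 0;
}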
@@ -245,11 +261,13 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) {
         GGML_ASSERT(shape);
 
         // arr_m and items can be NULL only when testing.
-        if (shape->arr_m) {
-            free(shape->arr_m);
-        }
-        if (shape->items) {
-            free(shape->items);
+        if (shape->m_num > 0) {
+            if (shape->arr_m) {
+                free(shape->arr_m);
+            }
+            if (shape->items) {
+                free(shape->items);
+            }
         }
     }
 }
@@ -325,17 +343,19 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
     };
 
     struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES];
+    memset(builtin_profiles, 0, sizeof(builtin_profiles));
+
     int n_profiles = ggml_get_task_profiles(&node, builtin_profiles);
 
     if (n_profiles != shape->n_profiles) {
-        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
+        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(n_profiles)");
         return false;
     }
 
     // TODO: profiles order is relevant, too strict.
     size_t sz = sizeof(struct ggml_task_profile) * n_profiles;
     if (memcmp(builtin_profiles, shape->profiles, sz) != 0) {
-        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
+        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(profiles)");
 
         printf("=== built-in profiles:\n");
         ggml_mulmat_tune_write_profiles(stderr, builtin_profiles,
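The new memset before ggml_get_task_profiles matters for the memcmp that follows: struct padding bytes are indeterminate unless the whole array is zero-filled, so two logically identical profiles can otherwise compare unequal byte-wise. A small sketch of the pitfall, using a hypothetical struct:

#include <string.h>

// Hypothetical struct with padding between the char and the int.
struct demo_profile {
    char stage;
    int  n_threads;
};

// memcmp-based equality is only reliable if both objects were fully
// zeroed (memset or = {0}) before their fields were assigned, because
// the padding bytes are otherwise indeterminate.
static int demo_profiles_equal(const struct demo_profile *a,
                               const struct demo_profile *b) {
    return memcmp(a, b, sizeof(*a)) == 0;
}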
@@ -364,6 +384,9 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
 }
 
 bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
+    GGML_ASSERT(tune);
+    memset(tune, 0, sizeof(struct ggml_mulmat_tune));
+
     int rc = fscanf(fp, "%d", &tune->version);
     if (rc <= 0) {
         return false;
@@ -661,27 +684,42 @@ static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K,
         ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N);
     ggml_set_f32(src0_f32, 0.1f);
 
+    const float *src_data = (const float *)src0_f32->data;
+    int nxk = N * K;
+
     switch (src0_type) {
     case GGML_TYPE_Q4_0:
-        ggml_quantize_q4_0((const float *)src0_f32->data, src0->data, N * K,
-                           K, hist);
+        ggml_quantize_q4_0(src_data, src0->data, nxk, K, hist);
         break;
     case GGML_TYPE_Q4_1:
-        ggml_quantize_q4_1((const float *)src0_f32->data, src0->data, N * K,
-                           K, hist);
+        ggml_quantize_q4_1(src_data, src0->data, nxk, K, hist);
         break;
     case GGML_TYPE_Q5_0:
-        ggml_quantize_q5_0((const float *)src0_f32->data, src0->data, N * K,
-                           K, hist);
+        ggml_quantize_q5_0(src_data, src0->data, nxk, K, hist);
         break;
     case GGML_TYPE_Q5_1:
-        ggml_quantize_q5_1((const float *)src0_f32->data, src0->data, N * K,
-                           K, hist);
+        ggml_quantize_q5_1(src_data, src0->data, nxk, K, hist);
         break;
     case GGML_TYPE_Q8_0:
-        ggml_quantize_q8_0((const float *)src0_f32->data, src0->data, N * K,
-                           K, hist);
+        ggml_quantize_q8_0(src_data, src0->data, nxk, K, hist);
+        break;
+#ifdef GGML_USE_K_QUANTS
+    case GGML_TYPE_Q2_K:
+        ggml_quantize_q2_K(src_data, src0->data, nxk, K, hist);
         break;
+    case GGML_TYPE_Q3_K:
+        ggml_quantize_q3_K(src_data, src0->data, nxk, K, hist);
+        break;
+    case GGML_TYPE_Q4_K:
+        ggml_quantize_q4_K(src_data, src0->data, nxk, K, hist);
+        break;
+    case GGML_TYPE_Q5_K:
+        ggml_quantize_q5_K(src_data, src0->data, nxk, K, hist);
+        break;
+    case GGML_TYPE_Q6_K:
+        ggml_quantize_q6_K(src_data, src0->data, nxk, K, hist);
+        break;
+#endif
     default:
        GGML_ASSERT(false);
     }
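For reference, the ggml_quantize_* helpers used above share the same call shape: quantize N*K f32 values (row length K) into a caller-provided buffer while accumulating bucket counts in hist. A minimal sketch for the Q4_0 case, assuming the caller has sized dst appropriately (the wrapper name is made up):

#include <stddef.h>
#include <stdint.h>
#include "ggml.h"

// Quantize N*K f32 values, K per row, into dst as Q4_0 blocks.
// hist[16] receives per-bucket counts; the returned byte count is
// ignored here, just as the bench ignores it.
static void quantize_rows_q4_0_sketch(const float *src, void *dst, int N, int K) {
    int64_t hist[16] = {0};
    size_t n_bytes = ggml_quantize_q4_0(src, dst, N * K, K, hist);
    (void)n_bytes;
}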

ggml-tune.h

Lines changed: 13 additions & 9 deletions
@@ -11,7 +11,8 @@ extern "C" {
 #endif
 
 #define GGML_MULMAT_TUNE_VERSION 8
-#define GGML_MULMAT_N_SHAPES 6
+#define GGML_MULMAT_N_SHAPES 4
+#define GGML_MULMAT_CACHE_LEN 16
 
 #define GGML_MULMAT_MAX_PASS 3
 
@@ -54,6 +55,14 @@ struct ggml_mulmat_tune_shape {
     struct ggml_mulmat_tune_m *items;
 };
 
+struct ggml_mulmat_tune_cache_ele {
+    int M;
+    int N;
+    int K;
+    const struct ggml_task_profile *profile;
+    int stages_time[3];
+};
+
 struct ggml_mulmat_tune {
     int version;
 
@@ -66,6 +75,9 @@ struct ggml_mulmat_tune {
     struct ggml_mulmat_tune_shape shapes[GGML_MULMAT_N_SHAPES];
 
     int n_threads;
+
+    // Cache for time estimating.
+    struct ggml_mulmat_tune_cache_ele cache[GGML_MULMAT_CACHE_LEN];
 };
 
 struct ggml_mulmat_tune_time {
@@ -74,14 +86,6 @@ struct ggml_mulmat_tune_time {
     int total_time;
 };
 
-struct mm_cache_element {
-    int M;
-    int N;
-    int K;
-    const struct ggml_task_profile *profile;
-    int stages_time[3];
-};
-
 // params for tune/bench.
 struct ggml_mulmat_tune_params {
     struct ggml_mulmat_tune_model model;
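With the single global mm_cache_element array replaced by a per-instance cache inside struct ggml_mulmat_tune, concurrent tuners no longer share slots. A hypothetical sketch of how such a slot can be probed (it mirrors the lookup in ggml_mulmat_tune_select_task_profile but is not its actual code; the hash function is passed in because the internal ggml_mulmat_tune_cache_hash is not declared in the header):

#include <stdint.h>
#include "ggml-tune.h"

// Return the cached profile on an exact (M, N, K) hit, NULL on a miss;
// on a miss the caller re-selects a profile and overwrites this slot.
static const struct ggml_task_profile *
tune_cache_probe(struct ggml_mulmat_tune *tune, int M, int N, int K,
                 uint64_t (*hash_fn)(int, int, int)) {
    struct ggml_mulmat_tune_cache_ele *e =
        &tune->cache[hash_fn(M, N, K) % GGML_MULMAT_CACHE_LEN];
    if (e->M == M && e->N == N && e->K == K) {
        return e->profile;
    }
    return NULL;
}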

tests/test-ggml-tune.c

Lines changed: 4 additions & 3 deletions
@@ -70,15 +70,16 @@ static int bench(void) {
     // NULL) error -30 at /Users/mqy/tools/AI/llama.cpp/ggml-opencl.cpp:838
     enum ggml_ftype ftypes[] = {
         // GGML_FTYPE_ALL_F32,
-        GGML_FTYPE_MOSTLY_F16,
+        // GGML_FTYPE_MOSTLY_F16,
         GGML_FTYPE_MOSTLY_Q4_0,
+        GGML_FTYPE_MOSTLY_Q4_K,
     };
 
     int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]);
 
     const int m_num = 4;
 
-    // Don't use n_threads larger than 2 because Github build hots has limited
+    // Don't use n_threads larger than 2 because Github build hosts has limited
     // resource quota.
     int threads_arr[] = {1, 2};
     int thread_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
@@ -124,7 +125,7 @@ ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node,
 }
 
 int estimate_time_non_zero_NK(void) {
-    printf("test-ggml-tune: %s\n", __func__);
+    printf("[test-ggml-tune] %s\n", __func__);
 
     struct test_data_t {
         int M;
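A rough count of what the updated bench test now exercises, assuming every ftype is run for every thread count, shape, and M value (pass counts and per-profile stages are ignored in this sketch):

#include <stdio.h>

// 2 ftypes (Q4_0, Q4_K) x 2 thread configs {1, 2} x 4 shapes x 4 M values.
int main(void) {
    const int n_ftypes = 2, n_thread_cfgs = 2, n_shapes = 4, m_num = 4;
    printf("bench points: %d\n", n_ftypes * n_thread_cfgs * n_shapes * m_num);
    return 0;
}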
