@@ -4,10 +4,11 @@
 #include "ggml-tune.h"
 #include "ggml.h"
 
-// MUL_MAT fine tunning for non-GPU-offloading cases.
+#ifdef GGML_USE_K_QUANTS
+#include "k_quants.h"
+#endif
 
-#define GGML_MULMAT_CACHE_LEN 16
-static struct mm_cache_element default_mm_cache[GGML_MULMAT_CACHE_LEN] = {0};
+// MUL_MAT fine tuning for non-GPU-offloading cases.
 
 #define FNV_OFFSET 14695981039346656037UL
 #define FNV_PRIME 1099511628211UL
@@ -49,9 +50,8 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile(
     GGML_ASSERT(tune);
 
     // TODO: default_mm_cache is thread-unsafe.
-    struct mm_cache_element *mm_cache = default_mm_cache;
     int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN;
-    struct mm_cache_element *e = &mm_cache[slot];
+    struct ggml_mulmat_tune_cache_ele *e = &tune->cache[slot];
 
     struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0};
 
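For context: FNV_OFFSET and FNV_PRIME above are the standard 64-bit FNV-1a basis and prime. Below is a minimal sketch of how ggml_mulmat_tune_cache_hash plausibly maps an (M, N, K) triple to a cache slot; the real implementation is not part of this diff, so the byte-wise hashing shown here is an assumption, not the PR's code.

    #include <stddef.h>
    #include <stdint.h>

    // Assumed shape of the hash: FNV-1a over the raw bytes of M, N, K.
    static uint64_t mulmat_cache_hash_sketch(int M, int N, int K) {
        int data[3] = {M, N, K};
        const unsigned char *p = (const unsigned char *)data;
        uint64_t h = 14695981039346656037UL; // FNV_OFFSET
        for (size_t i = 0; i < sizeof(data); i++) {
            h ^= p[i];
            h *= 1099511628211UL; // FNV_PRIME
        }
        return h; // the caller reduces this modulo GGML_MULMAT_CACHE_LEN
    }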
@@ -183,7 +183,7 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
 
     enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype);
 
-    GGML_ASSERT(GGML_MULMAT_N_SHAPES >= 6);
+    GGML_ASSERT(GGML_MULMAT_N_SHAPES == 4 || GGML_MULMAT_N_SHAPES == 6);
     tune->n_shapes = GGML_MULMAT_N_SHAPES;
 
     // Attention layers
@@ -196,11 +196,26 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
         .N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type};
     tune->shapes[3] = (struct ggml_mulmat_tune_shape){
         .N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type};
-    // RoPE
-    tune->shapes[4] = (struct ggml_mulmat_tune_shape){
-        .N = n_rot, .K = 0, .src0_type = rot_src0_type, .src1_type = src1_type};
-    tune->shapes[5] = (struct ggml_mulmat_tune_shape){
-        .N = 0, .K = n_rot, .src0_type = rot_src0_type, .src1_type = src1_type};
+
+    tune->n_shapes = GGML_MULMAT_N_SHAPES;
+
+    if (GGML_MULMAT_N_SHAPES == 6) {
+        // RoPE.
+        // - Very small compared to the shapes above, almost no need to bench.
+        // - Triggers an illegal instruction exception on GitHub CI (mac-latest-cmake).
+        // - CL sometimes throws errors on localhost.
+        // So these are temporarily gated as a workaround.
+        tune->shapes[4] =
+            (struct ggml_mulmat_tune_shape){.N = n_rot,
+                                            .K = 0,
+                                            .src0_type = rot_src0_type,
+                                            .src1_type = src1_type};
+        tune->shapes[5] =
+            (struct ggml_mulmat_tune_shape){.N = 0,
+                                            .K = n_rot,
+                                            .src0_type = rot_src0_type,
+                                            .src1_type = src1_type};
+    }
 
     for (int i = 0; i < tune->n_shapes; i++) {
         struct ggml_mulmat_tune_shape *shape = &tune->shapes[i];
@@ -225,6 +240,7 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune,
 
         shape->m_num = params->m_num;
         shape->arr_m = malloc(shape->m_num * sizeof(int));
+        GGML_ASSERT(shape->arr_m);
         for (int j = 0; j < shape->m_num; j++) {
             shape->arr_m[j] = 1 << j;
         }
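The arr_m loop above enumerates the benchmarked M values as powers of two. For illustration (m_num = 8 is an assumed value, not one taken from this diff):

    // With shape->m_num == 8, the benched M values are 1, 2, 4, ..., 128:
    for (int j = 0; j < 8; j++) {
        printf("%d ", 1 << j); // prints: 1 2 4 8 16 32 64 128
    }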
@@ -245,11 +261,13 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) {
         GGML_ASSERT(shape);
 
         // arr_m and items can be NULL only when testing.
-        if (shape->arr_m) {
-            free(shape->arr_m);
-        }
-        if (shape->items) {
-            free(shape->items);
+        if (shape->m_num > 0) {
+            if (shape->arr_m) {
+                free(shape->arr_m);
+            }
+            if (shape->items) {
+                free(shape->items);
+            }
         }
     }
 }
@@ -325,17 +343,19 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune,
     };
 
     struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES];
+    memset(builtin_profiles, 0, sizeof(builtin_profiles));
+
     int n_profiles = ggml_get_task_profiles(&node, builtin_profiles);
 
     if (n_profiles != shape->n_profiles) {
-        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
+        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch (n_profiles)");
         return false;
     }
 
     // TODO: profiles order is relevant, too strict.
     size_t sz = sizeof(struct ggml_task_profile) * n_profiles;
     if (memcmp(builtin_profiles, shape->profiles, sz) != 0) {
-        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch");
+        snprintf(errbuf, errbuf_len - 1, "task profiles mismatch (profiles)");
 
         printf("=== built-in profiles:\n");
         ggml_mulmat_tune_write_profiles(stderr, builtin_profiles,
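The new memset of builtin_profiles matters because the memcmp below compares raw bytes: struct ggml_task_profile can contain padding bytes, and uninitialized padding makes two logically identical profiles compare unequal. A self-contained sketch of that failure mode (the struct here is illustrative, not a ggml type):

    #include <stdio.h>
    #include <string.h>

    struct padded { char a; int b; }; // typically 3 padding bytes after 'a'

    int main(void) {
        struct padded x, y; // padding bytes left uninitialized
        x.a = 1; x.b = 2;
        y.a = 1; y.b = 2;
        // May print "differ" even though every named field matches:
        printf("%s\n", memcmp(&x, &y, sizeof x) == 0 ? "equal" : "differ");

        // Zeroing first makes the byte-wise comparison deterministic:
        memset(&x, 0, sizeof x); x.a = 1; x.b = 2;
        memset(&y, 0, sizeof y); y.a = 1; y.b = 2;
        printf("%s\n", memcmp(&x, &y, sizeof x) == 0 ? "equal" : "differ");
        return 0;
    }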
@@ -364,6 +384,9 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune,
 }
 
 bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
+    GGML_ASSERT(tune);
+    memset(tune, 0, sizeof(struct ggml_mulmat_tune));
+
     int rc = fscanf(fp, "%d", &tune->version);
     if (rc <= 0) {
         return false;
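Zeroing the whole tune struct before parsing is defensive: if fscanf fails partway through, every pointer in the struct is NULL and every count is zero, so later cleanup (e.g. ggml_mulmat_tune_free walking the shapes) never touches garbage. A minimal sketch of the same pattern, with hypothetical names:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct config { int version; int *data; }; // hypothetical reader state

    bool read_config(struct config *c, FILE *fp) {
        memset(c, 0, sizeof *c); // pointers NULL, counts 0
        if (fscanf(fp, "%d", &c->version) != 1) {
            return false; // partial failure: free(c->data) is still safe
        }
        return true;
    }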
@@ -661,27 +684,42 @@ static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K,
             ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N);
         ggml_set_f32(src0_f32, 0.1f);
 
+        const float *src_data = (const float *)src0_f32->data;
+        int nxk = N * K;
+
         switch (src0_type) {
         case GGML_TYPE_Q4_0:
-            ggml_quantize_q4_0((const float *)src0_f32->data, src0->data, N * K,
-                               K, hist);
+            ggml_quantize_q4_0(src_data, src0->data, nxk, K, hist);
             break;
         case GGML_TYPE_Q4_1:
-            ggml_quantize_q4_1((const float *)src0_f32->data, src0->data, N * K,
-                               K, hist);
+            ggml_quantize_q4_1(src_data, src0->data, nxk, K, hist);
             break;
         case GGML_TYPE_Q5_0:
-            ggml_quantize_q5_0((const float *)src0_f32->data, src0->data, N * K,
-                               K, hist);
+            ggml_quantize_q5_0(src_data, src0->data, nxk, K, hist);
             break;
         case GGML_TYPE_Q5_1:
-            ggml_quantize_q5_1((const float *)src0_f32->data, src0->data, N * K,
-                               K, hist);
+            ggml_quantize_q5_1(src_data, src0->data, nxk, K, hist);
             break;
         case GGML_TYPE_Q8_0:
-            ggml_quantize_q8_0((const float *)src0_f32->data, src0->data, N * K,
-                               K, hist);
+            ggml_quantize_q8_0(src_data, src0->data, nxk, K, hist);
+            break;
+#ifdef GGML_USE_K_QUANTS
+        case GGML_TYPE_Q2_K:
+            ggml_quantize_q2_K(src_data, src0->data, nxk, K, hist);
             break;
+        case GGML_TYPE_Q3_K:
+            ggml_quantize_q3_K(src_data, src0->data, nxk, K, hist);
+            break;
+        case GGML_TYPE_Q4_K:
+            ggml_quantize_q4_K(src_data, src0->data, nxk, K, hist);
+            break;
+        case GGML_TYPE_Q5_K:
+            ggml_quantize_q5_K(src_data, src0->data, nxk, K, hist);
+            break;
+        case GGML_TYPE_Q6_K:
+            ggml_quantize_q6_K(src_data, src0->data, nxk, K, hist);
+            break;
+#endif
         default:
             GGML_ASSERT(false);
         }
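All the ggml_quantize_* helpers in this switch share one calling convention: quantize n floats laid out as rows of k elements, and accumulate bucket counts into hist. A hedged usage sketch for Q4_0; the signature and the 16-entry histogram match the ggml API of this PR's vintage, to the best of my knowledge:

    #include <stdint.h>
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // Two rows of K = 32 floats; Q4_0 needs K to be a multiple of its
        // 32-element block size.
        enum { N = 2, K = 32 };
        float src[N * K];
        for (int i = 0; i < N * K; i++) src[i] = 0.1f * (float)(i % 7);

        uint8_t dst[1024];      // generously sized; real code sizes this exactly
        int64_t hist[16] = {0}; // per-value counts of the packed 4-bit codes

        size_t n_bytes = ggml_quantize_q4_0(src, dst, N * K, K, hist);
        printf("wrote %zu bytes\n", n_bytes);
        return 0;
    }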