@@ -16381,11 +16381,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
};
// difquant_fl_more_tensors has a broad 26-29% bump to the upper quant. Ex : 9/32
auto difquant_fl_more_tensors = [](int i_layer, int n_layers) -> bool {
- return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8;
+ return i_layer <= n_layers/8 || i_layer == 2*n_layers/8 || i_layer > 7*n_layers/8;
};
// difquant_three_eights_tensors has a broad 37.5% bump to the upper quant. Ex : 12/32
auto difquant_three_eights_tensors = [](int i_layer, int n_layers) -> bool {
- return i_layer <= n_layers/8 || i_layer >= 7 *n_layers/8 || ( i_layer > 2 *n_layers/8 && i_layer < 3 *n_layers/8) ;
+ return i_layer <= n_layers/8 || ( i_layer >= 2 *n_layers/8 && i_layer < 3 *n_layers/8) || i_layer > 7 *n_layers/8;
};
// original formula use_more_bits :
// return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
@@ -16394,15 +16394,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
// In the case of a 40 layers model, layers 6-9 and layers 15-20 are always skipped.
// difquant_half_tensors replaces it and keeps the broad 50% bump to the upper quant. Ex : 16/32
auto difquant_half_tensors = [](int i_layer, int n_layers) -> bool {
- return i_layer <= n_layers/8 || i_layer > 6 *n_layers/8 || ( i_layer >= 2 *n_layers/8 && i_layer < 3 *n_layers/8) ;
+ return i_layer <= n_layers/8 || ( i_layer >= 2 *n_layers/8 && i_layer < 3 *n_layers/8) || i_layer > 6 *n_layers/8;
};
// difquant_five_eights_tensors has a broad 62.5% bump to the upper quant. Ex : 20/32
auto difquant_five_eights_tensors = [](int i_layer, int n_layers) -> bool {
- return i_layer <= n_layers/8 || i_layer > 5 *n_layers/8 || ( i_layer >= 2 *n_layers/8 && i_layer < 3 *n_layers/8) ;
+ return i_layer <= n_layers/8 || ( i_layer >= 2 *n_layers/8 && i_layer < 3 *n_layers/8) || i_layer > 5 *n_layers/8;
};
// difquant_six_eights_tensors has a broad 75% bump to the upper quant. Ex : 24/32
auto difquant_six_eights_tensors = [](int i_layer, int n_layers) -> bool {
- return i_layer <= n_layers/8 || i_layer > 4 *n_layers/8 || ( i_layer >= 2 *n_layers/8 && i_layer < 3 *n_layers/8) ;
+ return i_layer <= n_layers/8 || ( i_layer >= 2 *n_layers/8 && i_layer < 3 *n_layers/8) || i_layer > 4 *n_layers/8;
};
// difquant_all_tensors has a broad 100% bump to the upper quant. Ex : 32/32. This, for easy mass edit purpose during tests.
auto difquant_all_tensors = [](int i_layer, int n_layers) -> bool {
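For reference, here is a minimal standalone sketch (not part of the commit) that enumerates which layer indices each post-patch difquant selector bumps to the upper quant for a hypothetical 32-layer model; the counts should match the 9/32, 12/32, 16/32, 20/32 and 24/32 figures quoted in the comments above:

```cpp
// Standalone illustration only: enumerates the layers each difquant
// selector (as committed above) bumps to the upper quant for n_layers = 32.
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

int main() {
    const int n_layers = 32;
    const std::vector<std::pair<const char *, std::function<bool(int, int)>>> selectors = {
        {"fl_more",      [](int i, int n) { return i <= n/8 || i == 2*n/8 || i > 7*n/8; }},
        {"three_eights", [](int i, int n) { return i <= n/8 || (i >= 2*n/8 && i < 3*n/8) || i > 7*n/8; }},
        {"half",         [](int i, int n) { return i <= n/8 || (i >= 2*n/8 && i < 3*n/8) || i > 6*n/8; }},
        {"five_eights",  [](int i, int n) { return i <= n/8 || (i >= 2*n/8 && i < 3*n/8) || i > 5*n/8; }},
        {"six_eights",   [](int i, int n) { return i <= n/8 || (i >= 2*n/8 && i < 3*n/8) || i > 4*n/8; }},
    };
    for (const auto & [name, bump] : selectors) {
        int count = 0;
        printf("%-13s:", name);
        for (int i = 0; i < n_layers; ++i) {
            if (bump(i, n_layers)) { printf(" %2d", i); ++count; }
        }
        printf("  -> %d/%d\n", count, n_layers);
    }
    return 0;
}
```

Expected output: fl_more selects layers 0-4, 8 and 29-31 (9/32); the three_eights through six_eights variants all share the head (0-4) and the 2/8-3/8 band (8-11) and differ only in how far back the trailing `i_layer > k*n_layers/8` window reaches.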
@@ -16532,20 +16532,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
else if (qs.model.hparams.n_expert >= 8)
- new_type = difquant_five_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
+ new_type = difquant_half_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
else if (qs.model.hparams.n_expert >= 4)
- new_type = difquant_five_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+ new_type = difquant_half_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if (qs.model.hparams.n_expert >= 2)
- new_type = difquant_five_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ new_type = difquant_half_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if (qs.model.hparams.n_gqa() >= 4)
new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if (qs.model.hparams.n_gqa() >= 2)
new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
- else new_type = difquant_five_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+ else new_type = difquant_half_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
@@ -16700,20 +16700,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
else if (qs.model.hparams.n_expert >= 8)
- new_type = difquant_five_eights_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
+ new_type = difquant_half_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
else if (qs.model.hparams.n_expert >= 4)
- new_type = difquant_five_eights_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+ new_type = difquant_half_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if (qs.model.hparams.n_expert >= 2)
- new_type = difquant_five_eights_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ new_type = difquant_half_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if (qs.model.hparams.n_gqa() >= 4)
new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if (qs.model.hparams.n_gqa() >= 2)
new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
- else new_type = difquant_five_eights_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+ else new_type = difquant_half_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
@@ -16916,14 +16916,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
new_type = difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if (qs.model.hparams.n_expert >= 8)
- new_type = difquant_five_eights_tensors (qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ new_type = difquant_half_tensors (qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
new_type = difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
else if (qs.model.hparams.n_expert >= 2)
- new_type = difquant_five_eights_tensors (qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+ new_type = difquant_half_tensors (qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
else if (qs.model.hparams.n_gqa() >= 2)
new_type = difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
- else new_type = difquant_five_eights_tensors (qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+ else new_type = difquant_half_tensors (qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
@@ -17036,7 +17036,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
- else new_type = (difquant_five_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17174,18 +17174,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
else if (qs.model.hparams.n_expert >= 8)
- new_type = difquant_five_eights_tensors (qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+ new_type = difquant_half_tensors (qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if (qs.model.hparams.n_expert >= 4)
- new_type = difquant_five_eights_tensors (qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ new_type = difquant_half_tensors (qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
else if (qs.model.hparams.n_expert >= 2)
- new_type = difquant_five_eights_tensors (qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+ new_type = difquant_half_tensors (qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
else if (qs.model.hparams.n_gqa() >= 2)
new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
- else new_type = difquant_five_eights_tensors (qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+ else new_type = difquant_half_tensors (qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
@@ -17309,20 +17309,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
else if (qs.model.hparams.n_expert >= 8)
- new_type = difquant_five_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
+ new_type = difquant_half_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
else if (qs.model.hparams.n_expert >= 4)
- new_type = difquant_five_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+ new_type = difquant_half_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if (qs.model.hparams.n_expert >= 2)
- new_type = difquant_five_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+ new_type = difquant_half_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if (qs.model.hparams.n_gqa() >= 4)
new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
else if (qs.model.hparams.n_gqa() >= 2)
new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
- else new_type = difquant_five_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+ else new_type = difquant_half_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
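Taken together, the attention hunks all apply the same edit: every tier keyed on n_expert alone, plus the final fallback, drops its bump selector from difquant_five_eights_tensors (62.5% of layers) to difquant_half_tensors (50%). Below is a minimal sketch of the attn_v ladder as it reads after the patch; the condensed structure and string quant names are my own (the committed code assigns GGML_TYPE_* values directly inside llama_tensor_get_type):

```cpp
// Condensed sketch (not the committed code) of the post-patch attn_v ladder
// from the @@ -16532 hunk above. Tiers marked "was five_eights" are the ones
// this commit changed to the half selector.
#include <cstdio>
#include <utility>

struct Choice { bool half; const char *hi, *lo; };  // selector kind + quant pair

static Choice attn_v_tier(int n_expert, int n_gqa) {
    if ((n_expert >= 8 && n_gqa >= 2) || n_gqa >= 12) return {false, "Q8_0", "Q6_K"};
    if (n_expert >= 8)                                return {true,  "Q8_0", "Q6_K"}; // was five_eights
    if ((n_expert >= 4 && n_gqa >= 2) || n_gqa >= 8)  return {false, "Q6_K", "Q5_K"};
    if (n_expert >= 4)                                return {true,  "Q6_K", "Q5_K"}; // was five_eights
    if ((n_expert >= 2 && n_gqa >= 2) || n_gqa >= 7)  return {false, "Q5_K", "Q4_K"};
    if (n_expert >= 2)                                return {true,  "Q5_K", "Q4_K"}; // was five_eights
    if (n_gqa >= 4)                                   return {false, "Q5_K", "Q4_K"};
    if (n_gqa >= 2)                                   return {false, "Q4_K", "Q3_K"};
    return {true, "Q4_K", "Q3_K"};                                                    // was five_eights
}

int main() {
    // e.g. a dense 8-way-GQA model vs. an 8-expert MoE without GQA
    for (auto [ne, ng] : { std::pair{0, 8}, std::pair{8, 1} }) {
        const Choice c = attn_v_tier(ne, ng);
        printf("n_expert=%d n_gqa=%d -> %s bump, %s over %s\n",
               ne, ng, c.half ? "difquant_half" : "difquant_three_eights", c.hi, c.lo);
    }
    return 0;
}
```

The net effect is a slightly smaller file for models landing in the expert-only or fallback tiers, since the upper quant now covers 16/32 rather than 20/32 of the layers there.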