
Commit ee4ccb3

Shrink Q2_K a bit when GQA<2,
and optimize difquants_first_last and fl_more
1 parent ea0690b

File tree

1 file changed (+25 −25 lines)

src/llama.cpp

Lines changed: 25 additions & 25 deletions
@@ -16381,11 +16381,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     };
     // difquant_fl_more_tensors has a broad 26-29% bump to the upper quant. Ex : 9/32
     auto difquant_fl_more_tensors = [](int i_layer, int n_layers) -> bool {
-        return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8;
+        return i_layer <= n_layers/8 || i_layer == 2*n_layers/8 || i_layer > 7*n_layers/8;
     };
     // difquant_three_eights_tensors has a broad 37.5% bump to the upper quant. Ex : 12/32
     auto difquant_three_eights_tensors = [](int i_layer, int n_layers) -> bool {
-        return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer > 2*n_layers/8 && i_layer < 3*n_layers/8);
+        return i_layer <= n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8) || i_layer > 7*n_layers/8;
     };
     // original formula use_more_bits :
     // return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
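To make the fl_more change concrete, here is a minimal standalone sketch (not part of the commit) comparing the layers the old and new difquant_fl_more_tensors predicates select for a 32-layer model. Both keep the 9/32 count cited in the comment; the new form trades tail layer 28 for layer 8, right at the 2/8 mark.

// Standalone check (illustrative): old vs new difquant_fl_more_tensors
// for n_layers = 32.
#include <cstdio>

int main() {
    const int n_layers = 32;
    auto old_fl_more = [](int i, int n) { return i <= n/8 || i >= 7*n/8; };
    auto new_fl_more = [](int i, int n) { return i <= n/8 || i == 2*n/8 || i > 7*n/8; };

    int n_old = 0, n_new = 0;
    for (int i = 0; i < n_layers; ++i) {
        bool o = old_fl_more(i, n_layers);
        bool w = new_fl_more(i, n_layers);
        n_old += o; n_new += w;
        if (o != w) printf("layer %2d: old=%d new=%d\n", i, o, w);  // prints layers 8 and 28
    }
    printf("bumped: old %d/32, new %d/32\n", n_old, n_new);         // 9/32 in both cases
    return 0;
}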
@@ -16394,15 +16394,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     // In the case of a 40 layers model, layers 6-9 and layers 15-20 are always skipped.
     // difquant_half_tensors replaces it and keeps the broad 50% bump to the upper quant. Ex : 16/32
     auto difquant_half_tensors = [](int i_layer, int n_layers) -> bool {
-        return i_layer <= n_layers/8 || i_layer > 6*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
+        return i_layer <= n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8) || i_layer > 6*n_layers/8;
     };
     // difquant_five_eights_tensors has a broad 62.5% bump to the upper quant. Ex : 20/32
     auto difquant_five_eights_tensors = [](int i_layer, int n_layers) -> bool {
-        return i_layer <= n_layers/8 || i_layer > 5*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
+        return i_layer <= n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8) || i_layer > 5*n_layers/8;
     };
     // difquant_six_eights_tensors has a broad 75% bump to the upper quant. Ex : 24/32
     auto difquant_six_eights_tensors = [](int i_layer, int n_layers) -> bool {
-        return i_layer <= n_layers/8 || i_layer > 4*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
+        return i_layer <= n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8) || i_layer > 4*n_layers/8;
     };
     // difquant_all_tensors has a broad 100% bump to the upper quant. Ex : 32/32. This, for easy mass edit purpose during tests.
     auto difquant_all_tensors = [](int i_layer, int n_layers) -> bool {
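The "Ex : N/32" figures in the comments can be verified mechanically. A small illustrative sketch (mirroring the post-change predicates, not code from the commit) counts how many of 32 layers each selector bumps:

#include <cstdio>
#include <functional>

int main() {
    const int n = 32;
    struct Sel { const char * name; std::function<bool(int,int)> f; } sel[] = {
        { "half",        [](int i, int nl) { return i <= nl/8 || (i >= 2*nl/8 && i < 3*nl/8) || i > 6*nl/8; } },
        { "five_eights", [](int i, int nl) { return i <= nl/8 || (i >= 2*nl/8 && i < 3*nl/8) || i > 5*nl/8; } },
        { "six_eights",  [](int i, int nl) { return i <= nl/8 || (i >= 2*nl/8 && i < 3*nl/8) || i > 4*nl/8; } },
    };
    for (auto & s : sel) {
        int count = 0;
        for (int i = 0; i < n; ++i) count += s.f(i, n);
        printf("%-11s -> %2d/%d\n", s.name, count, n);  // 16/32, 20/32, 24/32
    }
    return 0;
}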
@@ -16532,20 +16532,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
             new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
         else if (qs.model.hparams.n_expert >= 8)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
+            new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
         else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
             new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         else if (qs.model.hparams.n_expert >= 4)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
             new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_expert >= 2)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_gqa() >= 4)
             new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_gqa() >= 2)
             new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-        else new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        else new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
         if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
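The ladder above keys attn_v's quant type on expert count and GQA factor, with the commit relaxing every five_eights fallback to half. A simplified standalone rendering of the post-change logic (paraphrased from the hunk, not a drop-in replacement; the bool parameters stand in for the difquant predicates applied to the current layer):

#include <cstdio>

enum qtype { Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, Q8_0 };
static const char * qname[] = { "Q2_K", "Q3_K", "Q4_K", "Q5_K", "Q6_K", "Q8_0" };

// Sketch of the post-change attn_v ladder for this ftype.
static qtype attn_v_type(int n_expert, int n_gqa, bool three_eights, bool half) {
    if ((n_expert >= 8 && n_gqa >= 2) || n_gqa >= 12) return three_eights ? Q8_0 : Q6_K;
    if (n_expert >= 8)                                return half         ? Q8_0 : Q6_K;
    if ((n_expert >= 4 && n_gqa >= 2) || n_gqa >= 8)  return three_eights ? Q6_K : Q5_K;
    if (n_expert >= 4)                                return half         ? Q6_K : Q5_K;
    if ((n_expert >= 2 && n_gqa >= 2) || n_gqa >= 7)  return three_eights ? Q5_K : Q4_K;
    if (n_expert >= 2)                                return half         ? Q5_K : Q4_K;
    if (n_gqa >= 4)                                   return three_eights ? Q5_K : Q4_K;
    if (n_gqa >= 2)                                   return three_eights ? Q4_K : Q3_K;
    return half ? Q4_K : Q3_K;
}

int main() {
    // Hypothetical dense model with GQA = 4, on a layer the predicate selects:
    printf("%s\n", qname[attn_v_type(/*n_expert=*/0, /*n_gqa=*/4, true, false)]);  // Q5_K
    return 0;
}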
@@ -16700,20 +16700,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
             new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
         else if (qs.model.hparams.n_expert >= 8)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
+            new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
         else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
             new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         else if (qs.model.hparams.n_expert >= 4)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
             new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_expert >= 2)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_gqa() >= 4)
             new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_gqa() >= 2)
             new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-        else new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+        else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
         if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
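All of these branches pivot on n_gqa(), which in llama.cpp is the grouped-query-attention ratio n_head / n_head_kv. For orientation, a tiny sketch with typical published head counts (assumed values for illustration, not read from this diff):

#include <cstdio>

int main() {
    // Assumed configs; n_gqa() is essentially n_head / n_head_kv.
    struct { const char * model; int n_head, n_head_kv; } cfg[] = {
        { "Llama 2 7B (MHA)", 32, 32 },
        { "Mistral 7B",       32,  8 },
        { "Llama 2 70B",      64,  8 },
    };
    for (auto & c : cfg)
        printf("%-17s n_gqa = %d\n", c.model, c.n_head / c.n_head_kv);
    return 0;
}

A GQA = 1 model falls through to the final else, which this commit demotes from five_eights to half; GQA = 4 and GQA = 8 models keep their three_eights branches unchanged.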
@@ -16916,14 +16916,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
             new_type = difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_expert >= 8)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = difquant_half_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
             new_type = difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         else if (qs.model.hparams.n_expert >= 2)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            new_type = difquant_half_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         else if (qs.model.hparams.n_gqa() >= 2)
             new_type = difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-        else new_type = difquant_five_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+        else new_type = difquant_half_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
         if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
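Comparing the three attention ladders so far, the quant ceilings and floors differ per tensor, reflecting how sensitive each projection is to quantization. A summary sketch (values read off the hunks above, illustrative only):

#include <cstdio>

int main() {
    // Best/worst types reached by the strongest/weakest branches of
    // each ladder in this commit.
    struct { const char * tensor; const char * best; const char * worst; } t[] = {
        { "attn_v", "Q8_0", "Q3_K" },
        { "attn_k", "Q8_0", "Q2_K" },
        { "attn_q", "Q5_K", "Q2_K" },
    };
    for (auto & e : t)
        printf("%-6s : up to %s, down to %s\n", e.tensor, e.best, e.worst);
    return 0;
}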
@@ -17036,7 +17036,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
         if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
             new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-        else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+        else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
         if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
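This Q2_K branch contributes directly to the commit title's "shrink": a GQA < 2, non-MoE model now bumps only half of these layers to Q3_K instead of five eighths. A back-of-the-envelope sketch of the saving, assuming the commonly cited k-quant rates of 2.625 bpw for Q2_K and 3.4375 bpw for Q3_K and a made-up 7B-class tensor shape:

#include <cstdio>

int main() {
    const double bpw_q2k = 2.625, bpw_q3k = 3.4375;  // standard k-quant block rates
    const long long elems = 11008LL * 4096;          // hypothetical 7B-class ffn tensor
    const int bumped_before = 20;                    // five_eights of 32 layers
    const int bumped_after  = 16;                    // half of 32 layers
    double saved_bits = (bumped_before - bumped_after) * (double) elems * (bpw_q3k - bpw_q2k);
    printf("saved roughly %.1f MiB on this tensor family\n", saved_bits / 8 / 1024 / 1024);
    return 0;
}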
@@ -17174,18 +17174,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
             new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         else if (qs.model.hparams.n_expert >= 8)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            new_type = difquant_half_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
             new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_expert >= 4)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = difquant_half_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
             new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         else if (qs.model.hparams.n_expert >= 2)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            new_type = difquant_half_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         else if (qs.model.hparams.n_gqa() >= 2)
             new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-        else new_type = difquant_five_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+        else new_type = difquant_half_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
         if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
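To see the per-layer effect, here is an illustrative trace for attn_output on an assumed dense 32-layer model with GQA >= 2: the three_eights branch fires, so the predicate alone decides which layers get Q3_K and which stay at Q2_K.

#include <cstdio>

int main() {
    const int n_layers = 32;
    // Post-change difquant_three_eights_tensors predicate.
    auto three_eights = [](int i, int n) {
        return i <= n/8 || (i >= 2*n/8 && i < 3*n/8) || i > 7*n/8;
    };
    for (int i = 0; i < n_layers; ++i)
        printf("layer %2d: attn_output -> %s\n", i,
               three_eights(i, n_layers) ? "Q3_K" : "Q2_K");
    return 0;
}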
@@ -17309,20 +17309,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
             new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
         else if (qs.model.hparams.n_expert >= 8)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
+            new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q8_0 : GGML_TYPE_Q6_K;
         else if ((qs.model.hparams.n_expert >= 4 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 8)
             new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         else if (qs.model.hparams.n_expert >= 4)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
+            new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         else if ((qs.model.hparams.n_expert >= 2 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 7)
             new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_expert >= 2)
-            new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_gqa() >= 4)
             new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_gqa() >= 2)
             new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-        else new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        else new_type = difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
         if ((qs.model.hparams.n_expert >= 8 && qs.model.hparams.n_gqa() >= 2) || qs.model.hparams.n_gqa() >= 12)
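Across all eight hunks the edit is uniform: every fallback branch (few or no experts, or GQA < 2) swaps difquant_five_eights_tensors for difquant_half_tensors, while the stronger branches keep difquant_three_eights_tensors. A final illustrative sketch of what that does to the upper-quant share on the affected tensors:

#include <cstdio>

int main() {
    const int n = 32;
    auto five_eights = [](int i, int nl) { return i <= nl/8 || (i >= 2*nl/8 && i < 3*nl/8) || i > 5*nl/8; };
    auto half        = [](int i, int nl) { return i <= nl/8 || (i >= 2*nl/8 && i < 3*nl/8) || i > 6*nl/8; };
    int before = 0, after = 0;
    for (int i = 0; i < n; ++i) { before += five_eights(i, n); after += half(i, n); }
    printf("upper-quant share: %d/32 -> %d/32\n", before, after);  // 20/32 -> 16/32
    return 0;
}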
