@@ -17839,7 +17839,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
17839
17839
// }
17840
17840
// else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
17841
17841
// if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
17842
- // new_type = difquant_five_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
17842
+ // new_type = difquant_six_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
17843
17843
// else new_type = GGML_TYPE_Q4_K;
17844
17844
// }
17845
17845
// else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
@@ -18040,7 +18040,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18040
18040
}
18041
18041
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18042
18042
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
18043
- // new_type = difquant_five_eights_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
18043
+ // new_type = difquant_six_eights_tensors (qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
18044
18044
else new_type = GGML_TYPE_IQ3_S;
18045
18045
}
18046
18046
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
@@ -18187,7 +18187,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18187
18187
// }
18188
18188
// else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18189
18189
// if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18190
- // new_type = difquant_five_eights_tensors (qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
18190
+ // new_type = difquant_six_eights_tensors (qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
18191
18191
// else new_type = GGML_TYPE_IQ3_XXS;
18192
18192
// }
18193
18193
// else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
@@ -18328,7 +18328,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18328
18328
}
18329
18329
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18330
18330
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18331
- new_type = (difquant_five_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18331
+ new_type = (difquant_six_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18332
18332
else new_type = GGML_TYPE_IQ3_S;
18333
18333
}
18334
18334
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
@@ -18493,7 +18493,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18493
18493
// }
18494
18494
// else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18495
18495
// if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18496
- // new_type = difquant_five_eights_tensors (qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18496
+ // new_type = difquant_six_eights_tensors (qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18497
18497
// else new_type = GGML_TYPE_IQ3_S;
18498
18498
// }
18499
18499
// else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
@@ -18656,7 +18656,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18656
18656
}
18657
18657
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18658
18658
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18659
- new_type = difquant_five_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
18659
+ new_type = difquant_six_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
18660
18660
else new_type = GGML_TYPE_IQ4_XS;
18661
18661
}
18662
18662
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
@@ -18773,8 +18773,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18773
18773
}
18774
18774
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18775
18775
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18776
- new_type = (difquant_five_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18777
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18776
+ new_type = (difquant_six_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18777
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18778
18778
}
18779
18779
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
18780
18780
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -18883,8 +18883,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18883
18883
}
18884
18884
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18885
18885
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18886
- new_type = (difquant_five_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18887
- else new_type = (difquant_three_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18886
+ new_type = (difquant_six_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18887
+ else new_type = (difquant_half_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18888
18888
}
18889
18889
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
18890
18890
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
0 commit comments