@@ -18618,7 +18618,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18618
18618
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS)
18619
18619
new_type = GGML_TYPE_IQ3_XXS;
18620
18620
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS)
18621
- new_type = GGML_TYPE_Q3_K ;
18621
+ new_type = GGML_TYPE_IQ3_XXS ;
18622
18622
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S)
18623
18623
new_type = GGML_TYPE_IQ3_S;
18624
18624
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_ML || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
@@ -18833,7 +18833,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18833
18833
}
18834
18834
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18835
18835
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
18836
- new_type = difquant_six_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
18836
+ new_type = difquant_five_eights_tensors (qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
18837
18837
else new_type = GGML_TYPE_IQ4_XS;
18838
18838
}
18839
18839
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
@@ -18971,7 +18971,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
18971
18971
}
18972
18972
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
18973
18973
if (qs.model.hparams.n_vocab >= 151600 && qs.model.hparams.n_vocab <=151700)
18974
- new_type = (difquant_five_eights_tensors (i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
18974
+ new_type = (difquant_first_last_tensors (i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS;
18975
18975
else if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
18976
18976
new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
18977
18977
else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
@@ -19104,7 +19104,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
19104
19104
}
19105
19105
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
19106
19106
if (qs.model.hparams.n_vocab >= 151600 && qs.model.hparams.n_vocab <=151700)
19107
- new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
19107
+ new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS;
19108
19108
else if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
19109
19109
new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
19110
19110
}
0 commit comments