@@ -5311,6 +5311,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_XXL: return "IQ3_S mix - 3.95 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: return "IQ3_S mix - 4.05 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_UXL: return "IQ3_S mix - 4.15 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XXSR: return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XSR: return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_MR: return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_LR: return "IQ4_XS mix - 4.xx bpw";
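
Note: this case only compiles if the LLAMA_FTYPE_MOSTLY_IQ4_XXSR enumerator is also declared, presumably in llama.h alongside the existing IQ4 mixes (that file is not part of this excerpt). A minimal sketch of the assumed declaration follows; the comment style mirrors the neighbouring entries, and no numeric value is assigned because the fork's actual value is not shown here:

    // llama.h (sketch, not in this diff): declaration assumed to accompany this commit
    enum llama_ftype {
        // ...
        LLAMA_FTYPE_MOSTLY_IQ4_XXSR, // except 1d tensors (new)
        LLAMA_FTYPE_MOSTLY_IQ4_XSR,  // except 1d tensors
        // ...
    };
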
@@ -18451,7 +18452,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
         else new_type = GGML_TYPE_Q5_K;
     }
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XXSR || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
         if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
         else new_type = GGML_TYPE_Q5_K;
     }
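
The hunk above folds the new IQ4_XXSR file type into the existing IQ4_XSR branch for this tensor: models with grouped-query attention (n_gqa() >= 2) or a mixture-of-experts layout (n_expert >= 2) get GGML_TYPE_Q6_K, everything else GGML_TYPE_Q5_K. As a standalone sketch of that rule (the helper name and free-function form are illustrative, not the fork's API):

    // Sketch of the selection rule applied above: GQA and MoE models pack more
    // heads' worth of information into each attention tensor, so they are given
    // one quality tier more than dense single-group models.
    static ggml_type pick_iq4sr_attn_type(uint32_t n_gqa, uint32_t n_expert) {
        return (n_gqa >= 2 || n_expert >= 2) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
    }
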
@@ -18699,10 +18700,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
         else new_type = GGML_TYPE_IQ4_XS;
     }
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XXSR || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
         if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-            new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K :
-            difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+            new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
         else new_type = GGML_TYPE_IQ4_XS;
     }
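
Besides extending the condition to IQ4_XXSR, this hunk fixes a no-op: every path of the removed ternary chain evaluated to GGML_TYPE_Q5_K, so both the qs.i_attention_wk < qs.n_attention_wk/8 test and the difquant_first_last_tensors() call had no effect on the result. The replacement makes the first/last attn_k tensors actually step up to GGML_TYPE_Q6_K. The predicate's definition is elsewhere in the fork; a guess at its shape, for orientation only:

    // Hypothetical sketch of difquant_first_last_tensors (the real definition
    // lives elsewhere in this fork and its slice width may differ): report
    // whether tensor i falls in the first or last eighth of the n tensors,
    // the regions usually most sensitive to quantization error.
    static bool difquant_first_last_tensors(int i, int n) {
        return i < n/8 || i >= n - n/8;
    }
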
@@ -18877,6 +18877,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         else new_type = GGML_TYPE_Q3_K;
     }
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XXSR) {
+        if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+            new_type = difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        else new_type = GGML_TYPE_Q3_K;
+    }
     // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_MR) {
     //     if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
     //         new_type = GGML_TYPE_IQ3_S;
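
The new attn_q branch for IQ4_XXSR mirrors the branch directly above it: query weights tolerate coarser quantization than attn_k, so the baseline is GGML_TYPE_Q3_K, bumped to IQ4_XS/IQ3_S only for GQA or MoE models with a large vocabulary. The n_vocab >= 127999 test reads like a heuristic for 128k-vocabulary models (Llama-3-class); a sketch of the combined gate as a predicate (the helper is illustrative, not fork API):

    // Sketch: the guard used by the new branch, factored out for readability.
    static bool is_large_vocab_gqa_or_moe(const llama_hparams & hp) {
        return hp.n_vocab >= 127999 && (hp.n_gqa() >= 2 || hp.n_expert >= 2);
    }
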
@@ -19936,10 +19941,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_XL: default_type = GGML_TYPE_IQ1_M; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_XXSR:
+        case LLAMA_FTYPE_MOSTLY_IQ4_XSR:
+        case LLAMA_FTYPE_MOSTLY_IQ4_MR:
+        case LLAMA_FTYPE_MOSTLY_IQ4_LR:
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_XSR: default_type = GGML_TYPE_IQ4_XS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_MR: default_type = GGML_TYPE_IQ4_XS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_LR: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_ML: default_type = GGML_TYPE_IQ3_S; break;
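
This final hunk is behavior-preserving for the three pre-existing R types: the labels added above the IQ4_XS case carry no body, so they fall through in C++ and share its default_type = GGML_TYPE_IQ4_XS assignment, replacing three duplicated case lines with one. In isolation the pattern is:

    // Case fallthrough: all five labels map to the same default type.
    switch (ftype) {
        case LLAMA_FTYPE_MOSTLY_IQ4_XXSR:
        case LLAMA_FTYPE_MOSTLY_IQ4_XSR:
        case LLAMA_FTYPE_MOSTLY_IQ4_MR:
        case LLAMA_FTYPE_MOSTLY_IQ4_LR:
        case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
        // ...
    }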