Skip to content

Commit 6398663

Browse files
authored
Apply the GQA2/Expert2 conditionality to the IQ3 quants
Consistent with the proposed modifications to the IQ2 quant strategies; the same conditionality makes even more sense for the IQ3 quant strategies.
1 parent b77cdd8 commit 6398663

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

src/llama.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15371,15 +15371,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1537115371
}
1537215372
} else if (name.find("attn_v.weight") != std::string::npos) {
1537315373
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
15374-
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
15374+
new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
1537515375
}
15376-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
15376+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
1537715377
new_type = GGML_TYPE_Q4_K;
1537815378
}
1537915379
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
15380-
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
15380+
new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
1538115381
}
15382-
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
15382+
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
1538315383
new_type = GGML_TYPE_Q4_K;
1538415384
}
1538515385
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {

0 commit comments

Comments
 (0)