Commit 2f652a5

Update llama-quant.cpp

1 parent 54eb4da

src/llama-quant.cpp

Lines changed: 27 additions & 8 deletions
@@ -189,8 +189,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         bool is_one_bit = (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S);
         if (name.find("attn_v.weight") != std::string::npos) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
-            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            new_type = GGML_TYPE_Q4_K;
+            // if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
+            // else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
         else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
@@ -271,15 +272,29 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
             // }
         }
+        else if (name.find("attn_k.weight") != std::string::npos) {
+            // Leave as 4bit
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (name.find("attn_q.weight") != std::string::npos) {
+            // Leave as 4bit
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (name.find("attn_v.weight") != std::string::npos) {
+            // Leave as 4bit
+            new_type = GGML_TYPE_Q4_K;
+        }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
-            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            new_type = GGML_TYPE_Q4_K;
+            // new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            new_type = GGML_TYPE_Q4_K;
+            // new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q4_K;
@@ -316,17 +331,21 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q8_0;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            new_type = GGML_TYPE_IQ3_XXS;
+            // new_type = GGML_TYPE_IQ3_XXS;
+            new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ2_S;
+            // new_type = GGML_TYPE_IQ2_S;
+            new_type = GGML_TYPE_Q4_K;
        }
     } else if (name.find("attn_q.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            new_type = GGML_TYPE_IQ3_XXS;
+            // new_type = GGML_TYPE_IQ3_XXS;
+            new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ2_S;
+            // new_type = GGML_TYPE_IQ2_S;
+            new_type = GGML_TYPE_Q4_K;
         }
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         // First 3 Layers
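Taken together, the hunks stop the low-bit ftypes (IQ1/IQ2/IQ3 and Q2_K variants) from downgrading the attention projections: attn_q.weight, attn_k.weight and attn_v.weight are pinned to GGML_TYPE_Q4_K, and the old GQA/expert-count heuristics are left behind as comments. A minimal standalone sketch of that selection pattern follows; pick_attn_type and the trimmed enum are illustrative stand-ins, not the real llama.cpp quantize_state_impl machinery, and the tensor names are only examples of the usual blk.N.* naming.

#include <iostream>
#include <string>

// Trimmed stand-in for ggml's type enum; only the values used in this sketch.
enum ggml_type { GGML_TYPE_Q2_K, GGML_TYPE_Q4_K };

// Sketch of the post-patch behaviour: any attention projection
// (attn_q / attn_k / attn_v) stays at 4-bit Q4_K instead of falling
// back to the lower-bit types the GQA / expert-count heuristics chose.
static ggml_type pick_attn_type(const std::string & name, ggml_type ftype_default) {
    if (name.find("attn_v.weight") != std::string::npos ||
        name.find("attn_k.weight") != std::string::npos ||
        name.find("attn_q.weight") != std::string::npos) {
        return GGML_TYPE_Q4_K; // leave as 4bit
    }
    return ftype_default;      // other tensors keep the ftype's default choice
}

int main() {
    // With a 2-bit base ftype, attention tensors are still promoted to Q4_K.
    std::cout << (pick_attn_type("blk.0.attn_v.weight",  GGML_TYPE_Q2_K) == GGML_TYPE_Q4_K) << "\n"; // 1
    std::cout << (pick_attn_type("blk.0.ffn_down.weight", GGML_TYPE_Q2_K) == GGML_TYPE_Q4_K) << "\n"; // 0
}

Because the previous expressions are kept as comments next to each override, restoring the upstream heuristics is a matter of swapping those lines back.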
