@@ -189,8 +189,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
189
189
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
190
190
bool is_one_bit = (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S);
191
191
if (name.find (" attn_v.weight" ) != std::string::npos) {
192
- if (qs.model .hparams .n_gqa () >= 4 || qs.model .hparams .n_expert >= 4 ) new_type = GGML_TYPE_Q4_K;
193
- else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
192
+ new_type = GGML_TYPE_Q4_K;
193
+ // if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
194
+ // else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
194
195
++qs.i_attention_wv ;
195
196
}
196
197
else if (qs.model .hparams .n_expert == 8 && name.find (" attn_k.weight" ) != std::string::npos) {
@@ -271,15 +272,29 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
271
272
// else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
272
273
// }
273
274
}
275
+ else if (name.find (" attn_k.weight" ) != std::string::npos) {
276
+ // Leave as 4bit
277
+ new_type = GGML_TYPE_Q4_K;
278
+ }
279
+ else if (name.find (" attn_q.weight" ) != std::string::npos) {
280
+ // Leave as 4bit
281
+ new_type = GGML_TYPE_Q4_K;
282
+ }
283
+ else if (name.find (" attn_v.weight" ) != std::string::npos) {
284
+ // Leave as 4bit
285
+ new_type = GGML_TYPE_Q4_K;
286
+ }
274
287
} else if (name.find (" attn_v.weight" ) != std::string::npos) {
275
288
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
276
- new_type = qs.model .hparams .n_gqa () >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
289
+ new_type = GGML_TYPE_Q4_K;
290
+ // new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
277
291
}
278
292
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model .hparams .n_gqa () >= 4 ) {
279
293
new_type = GGML_TYPE_Q4_K;
280
294
}
281
295
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
282
- new_type = qs.model .hparams .n_gqa () >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
296
+ new_type = GGML_TYPE_Q4_K;
297
+ // new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
283
298
}
284
299
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model .hparams .n_gqa () >= 4 ) {
285
300
new_type = GGML_TYPE_Q4_K;
@@ -316,17 +331,21 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
316
331
new_type = GGML_TYPE_Q8_0;
317
332
}
318
333
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
319
- new_type = GGML_TYPE_IQ3_XXS;
334
+ // new_type = GGML_TYPE_IQ3_XXS;
335
+ new_type = GGML_TYPE_Q4_K;
320
336
}
321
337
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
322
- new_type = GGML_TYPE_IQ2_S;
338
+ // new_type = GGML_TYPE_IQ2_S;
339
+ new_type = GGML_TYPE_Q4_K;
323
340
}
324
341
} else if (name.find (" attn_q.weight" ) != std::string::npos) {
325
342
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
326
- new_type = GGML_TYPE_IQ3_XXS;
343
+ // new_type = GGML_TYPE_IQ3_XXS;
344
+ new_type = GGML_TYPE_Q4_K;
327
345
}
328
346
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
329
- new_type = GGML_TYPE_IQ2_S;
347
+ // new_type = GGML_TYPE_IQ2_S;
348
+ new_type = GGML_TYPE_Q4_K;
330
349
}
331
350
} else if (name.find (" ffn_down.weight" ) != std::string::npos) {
332
351
// First 3 Layers
0 commit comments