
Commit ba1b854 (parent 79fa98c)

New FTYPE IQ4_XXSR, and beef up attn_k for IQ4_XSR

File tree

4 files changed (+22, -13 lines)

examples/quantize/quantize.cpp (1 addition, 0 deletions)

@@ -50,6 +50,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q3_K_XL",  LLAMA_FTYPE_MOSTLY_Q3_K_XL,  " 4.03G, +0.5562 ppl @ Llama-3-8B", },
     { "IQ4_NL",   LLAMA_FTYPE_MOSTLY_IQ4_NL,   " 4.50 bpw non-linear quantization", },
     { "IQ4_XS",   LLAMA_FTYPE_MOSTLY_IQ4_XS,   " 4.25 bpw non-linear quantization", },
+    { "IQ4_XXSR", LLAMA_FTYPE_MOSTLY_IQ4_XXSR, " 4.xx bpw non-linear quantization", },
     { "IQ4_XSR",  LLAMA_FTYPE_MOSTLY_IQ4_XSR,  " 4.xx bpw non-linear quantization", },
     { "IQ4_MR",   LLAMA_FTYPE_MOSTLY_IQ4_MR,   " 4.xx bpw non-linear quantization", },
     { "IQ4_LR",   LLAMA_FTYPE_MOSTLY_IQ4_LR,   " 4.xx bpw non-linear quantization", },

gguf-py/gguf/constants.py (4 additions, 3 deletions)

@@ -1460,9 +1460,10 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ3_ML   = 45  # except 1d tensors
     MOSTLY_IQ3_XXXL = 46  # except 1d tensors
     MOSTLY_IQ3_UXL  = 47  # except 1d tensors
-    MOSTLY_IQ4_XSR  = 48  # except 1d tensors
-    MOSTLY_IQ4_MR   = 49  # except 1d tensors
-    MOSTLY_IQ4_LR   = 50  # except 1d tensors
+    MOSTLY_IQ4_XXSR = 48  # except 1d tensors
+    MOSTLY_IQ4_XSR  = 49  # except 1d tensors
+    MOSTLY_IQ4_MR   = 50  # except 1d tensors
+    MOSTLY_IQ4_LR   = 51  # except 1d tensors
     MOSTLY_CQS      = 99  # except 1d tensors

     GUESSED         = 1024  # not specified in the model file

include/llama.h (4 additions, 3 deletions)

@@ -185,9 +185,10 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ3_ML   = 45, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXXL = 46, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_UXL  = 47, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ4_XSR  = 48, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ4_MR   = 49, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ4_LR   = 50, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_XXSR = 48, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_XSR  = 49, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_MR   = 50, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_LR   = 51, // except 1d tensors
         LLAMA_FTYPE_CQS             = 99, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
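Because three existing IQ4 values shift up by one, the C enum above and the Python LlamaFileType mirror in gguf-py must move in lockstep, and GGUF files quantized before this commit will report their ftype under the old numbering. A compile-time guard of this form could catch future drift on the C++ side (an illustrative sketch, not part of the commit):

    #include "llama.h"

    // pin the renumbered tail of the IQ4 block; compilation fails if the
    // header values drift away from what gguf-py/gguf/constants.py expects
    static_assert(LLAMA_FTYPE_MOSTLY_IQ4_XXSR == 48, "IQ4_XXSR renumbered");
    static_assert(LLAMA_FTYPE_MOSTLY_IQ4_XSR  == 49, "IQ4_XSR renumbered");
    static_assert(LLAMA_FTYPE_MOSTLY_IQ4_MR   == 50, "IQ4_MR renumbered");
    static_assert(LLAMA_FTYPE_MOSTLY_IQ4_LR   == 51, "IQ4_LR renumbered");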

src/llama.cpp (13 additions, 7 deletions)

@@ -5311,6 +5311,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  return "IQ3_S mix - 3.95 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: return "IQ3_S mix - 4.05 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_UXL:  return "IQ3_S mix - 4.15 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XXSR: return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XSR:  return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_MR:   return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_LR:   return "IQ4_XS mix - 4.xx bpw";
@@ -18451,7 +18452,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_Q5_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XXSR || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
             else new_type = GGML_TYPE_Q5_K;
         }
@@ -18699,10 +18700,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = GGML_TYPE_IQ4_XS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XXSR || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K :
-                           difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+                new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
             else new_type = GGML_TYPE_IQ4_XS;
         }
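The removed ternary was effectively dead code: all three of its arms produced GGML_TYPE_Q5_K. Its replacement is the "beef up attn_k" from the commit title: on GQA or MoE models, the attn_k tensors flagged by difquant_first_last_tensors are promoted to GGML_TYPE_Q6_K while the rest stay at Q5_K. The helper itself is defined elsewhere in this fork; a hypothetical sketch of its contract, assuming it flags a band of tensors at each end of the layer stack:

    // Hypothetical sketch only, NOT the fork's actual implementation:
    // returns true for tensors near the start and end of the stack, which
    // tend to be the most quantization-sensitive and so get the larger type.
    static bool difquant_first_last_tensors(int i_tensor, int n_tensors) {
        return i_tensor < n_tensors/8 || i_tensor >= 7*n_tensors/8;
    }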
@@ -18877,6 +18877,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_Q3_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XXSR) {
+            if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
+                new_type = difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = GGML_TYPE_Q3_K;
+        }
         // else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_MR) {
         //     if (qs.model.hparams.n_vocab >= 127999 && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2))
         //         new_type = GGML_TYPE_IQ3_S;
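The new IQ4_XXSR branch for attn_q mirrors its neighbors: on large-vocabulary GQA or MoE models (the n_vocab >= 127999 check catches, for example, the 128K-token Llama-3 vocabulary), the first and last attn_q tensors get IQ4_XS and the rest IQ3_S; all other models fall back to Q3_K for attn_q.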
@@ -19936,10 +19941,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ1_M:    default_type = GGML_TYPE_IQ1_M;  break;
         case LLAMA_FTYPE_MOSTLY_IQ1_XL:   default_type = GGML_TYPE_IQ1_M;  break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL:   default_type = GGML_TYPE_IQ4_NL; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_XXSR:
+        case LLAMA_FTYPE_MOSTLY_IQ4_XSR:
+        case LLAMA_FTYPE_MOSTLY_IQ4_MR:
+        case LLAMA_FTYPE_MOSTLY_IQ4_LR:
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:   default_type = GGML_TYPE_IQ4_XS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_XSR:  default_type = GGML_TYPE_IQ4_XS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_MR:   default_type = GGML_TYPE_IQ4_XS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ4_LR:   default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_ML:   default_type = GGML_TYPE_IQ3_S;  break;
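The final hunk folds the three duplicated per-type lines into stacked case labels that fall through to the IQ4_XS case. Behavior is unchanged, since all four R variants already defaulted to GGML_TYPE_IQ4_XS, and a future IQ4 variant now costs only one more label.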
