Commit c7ecd4e

Legacy quants conversion schemes in convert_hf_to_gguf.py (#449)
* Legacy quants conversion schemes in convert_hf_to_gguf.py

  This is notably meant to make smaller conversions from which to generate an iMatrix file.
  `Q4_0` and `Q4_1` conversions here use q5_0 for the embeddings, output, attn_k and attn_v tensors.
  `Q5_0` and `Q5_1` conversions here use q8_0 for the embeddings, output, attn_k and attn_v tensors.
  Adapted from the following llama.cpp mainline PR: ggml-org/llama.cpp#9022 (original author @chentyjpm).
  Also adds 2 forgotten mentions of FTYPE IQ3_KL in the llama.cpp file.

* Forgotten IQ5_KS case mention
1 parent a2c42f9 commit c7ecd4e
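
In short, the scheme added here bumps a handful of sensitive tensors to a higher-precision legacy quant while the rest of the model uses the requested type. The snippet below is a minimal standalone sketch of that rule, not the converter's actual code path: it uses plain strings where convert_hf_to_gguf.py uses the gguf.MODEL_TENSOR and gguf.GGMLQuantizationType enums shown in the diff.

    # Minimal standalone sketch of the per-tensor override scheme in this commit.
    # Tensor categories and type names are plain strings here, for illustration only.
    SENSITIVE = {"token_embd", "output", "attn_k", "attn_v"}

    def pick_tensor_type(outtype: str, tensor_category: str) -> str:
        """Quant type used for one tensor, given the requested --outtype."""
        if tensor_category in SENSITIVE:
            if outtype in ("q4_0", "q4_1"):
                return "q5_0"   # Q4_0 / Q4_1 conversions keep these tensors in q5_0
            if outtype in ("q5_0", "q5_1"):
                return "q8_0"   # Q5_0 / Q5_1 conversions keep these tensors in q8_0
        return outtype          # every other tensor uses the requested legacy quant

    if __name__ == "__main__":
        for cat in ("ffn_down", "token_embd", "attn_k"):
            print(cat, "->", pick_tensor_type("q4_0", cat))

As the log banner added below warns, the result is not equivalent to running `llama-quantize`; it is only meant as a small but reliable conversion from which to compute an iMatrix.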

File tree
3 files changed, +50 −6 lines

convert_hf_to_gguf.py (file mode changed from 100755 to 100644)
Lines changed: 45 additions & 3 deletions
@@ -306,6 +306,27 @@ def prepare_tensors(self):
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32

+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                        gguf.MODEL_TENSOR.ATTN_V,
+                        gguf.MODEL_TENSOR.ATTN_K,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_Q4_0,
+                        gguf.LlamaFileType.MOSTLY_Q4_1,
+                    ):
+                        data_qtype = gguf.GGMLQuantizationType.Q5_0
+                    elif self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_Q5_0,
+                        gguf.LlamaFileType.MOSTLY_Q5_1,
+                        # gguf.LlamaFileType.MOSTLY_Q6_0,
+                    ):
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+
                 # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
                     if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -314,6 +335,16 @@ def prepare_tensors(self):
                         data_qtype = gguf.GGMLQuantizationType.F16
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                         data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q4_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1:
+                        data_qtype = gguf.GGMLQuantizationType.Q4_1
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q5_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
+                        data_qtype = gguf.GGMLQuantizationType.Q5_1
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0: // To be implemented?
+                    #     data_qtype = gguf.GGMLQuantizationType.Q6_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
                     else:
@@ -387,6 +418,13 @@ def prepare_metadata(self, vocab_only: bool):

         logger.info("Set model quantization version")
         self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+
+        logger.info("****************************************************************************************")
+        logger.info("** quantizing to `Q4_0`,`Q4_1`,`Q5_0`, or `Q5_1`is not equiv to using `llama-quantize`")
+        logger.info("** `Q4_0`,`Q4_1` are here using embeddings, output, attn_k and attn_v in q5_0")
+        logger.info("** `Q5_0`,`Q5_1` are here using embeddings, output, attn_k and attn_v in q8_0")
+        logger.info("** This, in order to generate a small but reliable conversion to create an iMatrix file.")
+        logger.info("****************************************************************************************")

     def write(self):
         self.prepare_tensors()
@@ -3375,7 +3413,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if match and int(match.group(1)) >= block_count:
             return []

-
         # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
@@ -4076,8 +4113,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -4163,6 +4200,11 @@ def main() -> None:
         "f32": gguf.LlamaFileType.ALL_F32,
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0,
+        "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
+        "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
+        "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
+        # "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
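
To exercise the new choices end to end, the converter is invoked exactly as before, just with one of the added outtypes. Below is a hedged driver sketch in Python; the model directory and output path are placeholders, and it assumes the script's usual positional model argument and the --outfile flag whose help string appears in the context above.

    # Hedged sketch: drive convert_hf_to_gguf.py with one of the new outtypes.
    # Paths are placeholders; point them at an actual Hugging Face model directory.
    import subprocess

    model_dir = "models/Some-HF-Model"          # placeholder input directory
    outfile = "models/Some-HF-Model-Q5_0.gguf"  # placeholder output path

    subprocess.run(
        [
            "python", "convert_hf_to_gguf.py",
            model_dir,
            "--outtype", "q5_0",                # or q4_0, q4_1, q5_1
            "--outfile", outfile,
        ],
        check=True,
    )
    # The small GGUF produced this way is intended as input for iMatrix generation,
    # not as a replacement for a proper llama-quantize run.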

ggml/src/ggml-cuda/mmvq.cu
Lines changed: 1 addition & 0 deletions
@@ -652,6 +652,7 @@ bool ggml_cuda_mmvq_type_supported(ggml_type src0_type) {
         case GGML_TYPE_IQ4_KSS:
         case GGML_TYPE_IQ2_KS:
         case GGML_TYPE_IQ5_K:
+        case GGML_TYPE_IQ5_KS:
         case GGML_TYPE_IQ6_K:
         case GGML_TYPE_IQ3_S:
             return true;

src/llama.cpp
Lines changed: 4 additions & 3 deletions
@@ -18803,7 +18803,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4) {
         new_type = !qs.has_output ? GGML_TYPE_IQ4_K_R4 : GGML_TYPE_Q5_K_R4;
     }
-    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S_R4 ||
+    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_KL ||
+             ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S_R4 ||
              ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS_R4) && !qs.has_output) {
         new_type = GGML_TYPE_IQ5_K;
     }
@@ -19165,8 +19166,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_K ||
             ftype == LLAMA_FTYPE_MOSTLY_IQ4_KSS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_KS_R4 ||
             ftype == LLAMA_FTYPE_MOSTLY_IQ5_KS || ftype == LLAMA_FTYPE_MOSTLY_IQ5_KS_R4 ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_R4 ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS_R8 ||
+            ftype == LLAMA_FTYPE_MOSTLY_IQ2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_KL ||
+            ftype == LLAMA_FTYPE_MOSTLY_Q4_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS_R8 ||
             ftype == LLAMA_FTYPE_MOSTLY_Q3_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_KT ||
             ftype == LLAMA_FTYPE_MOSTLY_Q2_K_R4|| ftype == LLAMA_FTYPE_MOSTLY_IQ4_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_K_R4 ||
             ftype == LLAMA_FTYPE_MOSTLY_IQ2_K_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4 || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S_R4) {
