Commit 0049c2f

Legacy quants conversion schemes in convert_hf_to_gguf.py (LostRuins#449)
* Legacy quants conversion schemes in convert_hf_to_gguf.py

  Notably, this allows making smaller conversions from which to generate an iMatrix file.
  `Q4_0` and `Q4_1` conversions here store the embeddings, output, attn_k and attn_v tensors in q5_0.
  `Q5_0` and `Q5_1` conversions here store those same tensors in q8_0.
  Adapted from the following llama.cpp mainline PR: ggml-org#9022 (original author @chentyjpm).
  Also, 2 forgotten mentions of FTYPE IQ3_KL in the llama.cpp file.

* Forgotten IQ5_KS case mention
1 parent 78dd772 · commit 0049c2f
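Usage note (not part of the commit; file names below are placeholders): the intended workflow is to convert with one of the new legacy outtypes to get a small but still usable GGUF, e.g. `python convert_hf_to_gguf.py ./my-model --outtype q5_0 --outfile my-model-q5_0.gguf`, then compute an importance matrix on that file with llama.cpp's imatrix tool, e.g. `llama-imatrix -m my-model-q5_0.gguf -f calibration.txt -o imatrix.dat`, before running the final low-bit quantization with `llama-quantize --imatrix imatrix.dat ...`.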

File tree: 1 file changed, +37, -2 lines


convert_hf_to_gguf.py

File mode changed from 100755 to 100644
Lines changed: 37 additions & 2 deletions
@@ -321,9 +321,22 @@ def prepare_tensors(self):
                     for key in (
                         gguf.MODEL_TENSOR.TOKEN_EMBD,
                         gguf.MODEL_TENSOR.OUTPUT,
+                        gguf.MODEL_TENSOR.ATTN_V,
+                        gguf.MODEL_TENSOR.ATTN_K,
                     )
                 ):
                     if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_Q4_0,
+                        gguf.LlamaFileType.MOSTLY_Q4_1,
+                    ):
+                        data_qtype = gguf.GGMLQuantizationType.Q5_0
+                    elif self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_Q5_0,
+                        gguf.LlamaFileType.MOSTLY_Q5_1,
+                        # gguf.LlamaFileType.MOSTLY_Q6_0,
+                    ):
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype in (
                         gguf.LlamaFileType.MOSTLY_TQ1_0,
                         gguf.LlamaFileType.MOSTLY_TQ2_0,
                     ):
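To summarize what this hunk does, here is a minimal standalone sketch of the per-tensor override it introduces (illustrative only: `pick_qtype` and the string-based tensor kinds are not names from the commit, which works on `gguf.MODEL_TENSOR` and `gguf.LlamaFileType` enums inside `prepare_tensors`):

    # Sketch of the override: "sensitive" tensor kinds are stored at higher
    # precision than the legacy outtype requested on the command line.
    SENSITIVE_KINDS = {"token_embd", "output", "attn_k", "attn_v"}

    def pick_qtype(tensor_kind: str, outtype: str) -> str:
        """Return the quantization type actually stored for one tensor."""
        if tensor_kind in SENSITIVE_KINDS:
            if outtype in ("q4_0", "q4_1"):
                return "q5_0"   # sensitive tensors bumped to q5_0
            if outtype in ("q5_0", "q5_1"):
                return "q8_0"   # sensitive tensors bumped to q8_0
        return outtype          # everything else uses the requested legacy quant

    # A q4_0 conversion therefore still keeps attn_v in q5_0:
    assert pick_qtype("attn_v", "q4_0") == "q5_0"
    assert pick_qtype("ffn_down", "q4_0") == "q4_0"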
@@ -338,6 +351,16 @@ def prepare_tensors(self):
                         data_qtype = gguf.GGMLQuantizationType.F16
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                         data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q4_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1:
+                        data_qtype = gguf.GGMLQuantizationType.Q4_1
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q5_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
+                        data_qtype = gguf.GGMLQuantizationType.Q5_1
+                    # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0:  # To be implemented?
+                    #     data_qtype = gguf.GGMLQuantizationType.Q6_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
@@ -518,6 +541,13 @@ def prepare_metadata(self, vocab_only: bool):
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)
+
+        logger.info("****************************************************************************************")
+        logger.info("** quantizing to `Q4_0`,`Q4_1`,`Q5_0`, or `Q5_1` is not equiv to using `llama-quantize`")
+        logger.info("** `Q4_0`,`Q4_1` are here using embeddings, output, attn_k and attn_v in q5_0")
+        logger.info("** `Q5_0`,`Q5_1` are here using embeddings, output, attn_k and attn_v in q8_0")
+        logger.info("** This, in order to generate a small but reliable conversion to create an iMatrix file.")
+        logger.info("****************************************************************************************")
 
         if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
             self.gguf_writer.add_context_length(n_ctx)
@@ -6343,8 +6373,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, q4_0, q4_1, q5_0, q5_1 for a smaller conversion to then create an iMatrix file for example, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -6473,6 +6503,11 @@ def main() -> None:
         "f32": gguf.LlamaFileType.ALL_F32,
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0,
+        "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
+        "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
+        "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
+        # "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
         "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
