Commit 0d51bc0

Legacy quants conversion schemes in convert_hf_to_gguf.py (LostRuins#449)
* Legacy quants conversion schemes in convert_hf_to_gguf.py

  Notably, this allows making smaller conversions from which to generate an iMatrix file.
  `Q4_0` and `Q4_1` here use q5_0 for the embeddings, output, attn_k and attn_v tensors.
  `Q5_0` and `Q5_1` here use q8_0 for the embeddings, output, attn_k and attn_v tensors.

  Adapted from the llama.cpp mainline PR ggml-org#9022, original author @chentyjpm.

  Also fixes two forgotten mentions of FTYPE IQ3_KL in the llama.cpp file.

* forgotten IQ5_KS case mention
1 parent fd61181 commit 0d51bc0
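
To make the scheme above concrete, here is a minimal standalone sketch of the per-tensor override it describes. This is illustrative only, not the actual convert_hf_to_gguf.py code; the helper name pick_override_qtype and the Tensor enum are made up for the example.

from enum import Enum, auto


class Tensor(Enum):
    TOKEN_EMBD = auto()
    OUTPUT = auto()
    ATTN_K = auto()
    ATTN_V = auto()
    OTHER = auto()


# Tensors the commit treats as "sensitive" and keeps at a higher-precision quant.
SENSITIVE = {Tensor.TOKEN_EMBD, Tensor.OUTPUT, Tensor.ATTN_K, Tensor.ATTN_V}


def pick_override_qtype(outtype: str, tensor: Tensor) -> str:
    """Return the quant actually used for one tensor under the new legacy-quant outtypes."""
    if tensor in SENSITIVE:
        if outtype in ("q4_0", "q4_1"):
            return "q5_0"  # bumped one step up, as in the commit
        if outtype in ("q5_0", "q5_1"):
            return "q8_0"  # bumped to q8_0
    return outtype         # all other tensors keep the requested legacy quant


assert pick_override_qtype("q4_0", Tensor.ATTN_K) == "q5_0"
assert pick_override_qtype("q5_1", Tensor.TOKEN_EMBD) == "q8_0"
assert pick_override_qtype("q4_1", Tensor.OTHER) == "q4_1"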

File tree: 1 file changed, +39 -2 lines changed

convert_hf_to_gguf.py

File mode changed from 100755 to 100644
Lines changed: 39 additions & 2 deletions
@@ -321,9 +321,22 @@ def prepare_tensors(self):
                 for key in (
                     gguf.MODEL_TENSOR.TOKEN_EMBD,
                     gguf.MODEL_TENSOR.OUTPUT,
+                    gguf.MODEL_TENSOR.ATTN_V,
+                    gguf.MODEL_TENSOR.ATTN_K,
                 )
             ):
                 if self.ftype in (
+                    gguf.LlamaFileType.MOSTLY_Q4_0,
+                    gguf.LlamaFileType.MOSTLY_Q4_1,
+                ):
+                    data_qtype = gguf.GGMLQuantizationType.Q5_0
+                elif self.ftype in (
+                    gguf.LlamaFileType.MOSTLY_Q5_0,
+                    gguf.LlamaFileType.MOSTLY_Q5_1,
+                    # gguf.LlamaFileType.MOSTLY_Q6_0,
+                ):
+                    data_qtype = gguf.GGMLQuantizationType.Q8_0
+                elif self.ftype in (
                     gguf.LlamaFileType.MOSTLY_TQ1_0,
                     gguf.LlamaFileType.MOSTLY_TQ2_0,
                 ):
@@ -338,6 +351,16 @@ def prepare_tensors(self):
                     data_qtype = gguf.GGMLQuantizationType.F16
                 elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                     data_qtype = gguf.GGMLQuantizationType.BF16
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0:
+                    data_qtype = gguf.GGMLQuantizationType.Q4_0
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1:
+                    data_qtype = gguf.GGMLQuantizationType.Q4_1
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0:
+                    data_qtype = gguf.GGMLQuantizationType.Q5_0
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
+                    data_qtype = gguf.GGMLQuantizationType.Q5_1
+                # elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0:  # To be implemented?
+                #     data_qtype = gguf.GGMLQuantizationType.Q6_0
                 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                     data_qtype = gguf.GGMLQuantizationType.Q8_0
                 elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
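
The hunks above only pick a data_qtype; as a side note, here is a rough sketch of how a chosen type is then applied to a tensor, mirroring the F16 fallback the converter performs. It assumes a recent gguf-py where gguf.quants.quantize and gguf.QuantError are available, and is not a copy of the script.

import numpy as np
import gguf

# Fake tensor data; Q5_0 packs blocks of 32 values, so the row length must divide by 32.
data = np.random.rand(64, 256).astype(np.float32)
data_qtype = gguf.GGMLQuantizationType.Q5_0

try:
    packed = gguf.quants.quantize(data, data_qtype)
except gguf.QuantError:
    # On failure the converter warns and falls back to F16 (assumed behavior, hedged above).
    packed = gguf.quants.quantize(data, gguf.GGMLQuantizationType.F16)

print(packed.dtype, packed.shape)  # packed payload of the quantized tensor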
@@ -518,6 +541,15 @@ def prepare_metadata(self, vocab_only: bool):
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)
+        logger.info("Set model quantization version")
+        self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+
+        logger.info("****************************************************************************************")
+        logger.info("** quantizing to `Q4_0`, `Q4_1`, `Q5_0` or `Q5_1` is not equivalent to using `llama-quantize`")
+        logger.info("** `Q4_0` and `Q4_1` here use q5_0 for the embeddings, output, attn_k and attn_v tensors")
+        logger.info("** `Q5_0` and `Q5_1` here use q8_0 for the embeddings, output, attn_k and attn_v tensors")
+        logger.info("** This is done to generate a small but reliable conversion from which to create an iMatrix file.")
+        logger.info("****************************************************************************************")
 
         if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
             self.gguf_writer.add_context_length(n_ctx)
@@ -6343,8 +6375,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, q4_0, q4_1, q5_0 or q5_1 for a smaller conversion (for example to then create an iMatrix file), and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -6473,6 +6505,11 @@ def main() -> None:
         "f32": gguf.LlamaFileType.ALL_F32,
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0,
+        "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
+        "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
+        "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
+        # "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
         "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
