
Commit 7b36e50

Legacy quants conversion schemes in convert_hf_to_gguf.py (LostRuins#449)
* Legacy quants conversion schemes in convert_hf_to_gguf.py

  Notably, this allows smaller conversions from which to generate an iMatrix file.
  `Q4_0` and `Q4_1` conversions here store the embeddings, output, attn_k and attn_v tensors in q5_0.
  `Q5_0` and `Q5_1` conversions here store the embeddings, output, attn_k and attn_v tensors in q8_0.

  Adapted from the llama.cpp mainline PR ggml-org#9022 (original author @chentyjpm).

  Also addresses two forgotten mentions of FTYPE IQ3_KL in the llama.cpp file.

* Add the forgotten IQ5_KS case mention
1 parent fd61181 · commit 7b36e50
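
As a quick orientation before the diff, the scheme the message describes amounts to a lookup: most tensors get the requested legacy type, while token embeddings, output, attn_k and attn_v get a higher-precision format. The sketch below is an illustration only, not code from this commit; it uses the enum names that appear in the diff below and assumes this fork's gguf-py (mainline has no MOSTLY_Q6_0 / Q6_0).

```python
import gguf

# Tensor kinds that receive the higher-precision override.
OVERRIDDEN_TENSORS = {
    gguf.MODEL_TENSOR.TOKEN_EMBD,
    gguf.MODEL_TENSOR.OUTPUT,
    gguf.MODEL_TENSOR.ATTN_V,
    gguf.MODEL_TENSOR.ATTN_K,
}

# Default per-file-type quantization for everything else.
DEFAULT_QTYPE = {
    gguf.LlamaFileType.MOSTLY_Q4_0: gguf.GGMLQuantizationType.Q4_0,
    gguf.LlamaFileType.MOSTLY_Q4_1: gguf.GGMLQuantizationType.Q4_1,
    gguf.LlamaFileType.MOSTLY_Q5_0: gguf.GGMLQuantizationType.Q5_0,
    gguf.LlamaFileType.MOSTLY_Q5_1: gguf.GGMLQuantizationType.Q5_1,
    gguf.LlamaFileType.MOSTLY_Q6_0: gguf.GGMLQuantizationType.Q6_0,
}

def pick_qtype(ftype, tensor_kind):
    """Simplified view of the legacy-quant scheme: sensitive tensors are kept
    in a higher-precision format than the rest of the model."""
    if tensor_kind in OVERRIDDEN_TENSORS:
        if ftype in (gguf.LlamaFileType.MOSTLY_Q4_0, gguf.LlamaFileType.MOSTLY_Q4_1):
            return gguf.GGMLQuantizationType.Q5_0
        if ftype in (gguf.LlamaFileType.MOSTLY_Q5_0, gguf.LlamaFileType.MOSTLY_Q5_1,
                     gguf.LlamaFileType.MOSTLY_Q6_0):
            return gguf.GGMLQuantizationType.Q8_0
    return DEFAULT_QTYPE[ftype]

# e.g. a q4_0 conversion keeps attn_v in Q5_0:
print(pick_qtype(gguf.LlamaFileType.MOSTLY_Q4_0, gguf.MODEL_TENSOR.ATTN_V))
```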


convert_hf_to_gguf.py (file mode changed 100755 → 100644)

Lines changed: 40 additions & 7 deletions
@@ -321,15 +321,29 @@ def prepare_tensors(self):
                     for key in (
                         gguf.MODEL_TENSOR.TOKEN_EMBD,
                         gguf.MODEL_TENSOR.OUTPUT,
+                        gguf.MODEL_TENSOR.ATTN_V,
+                        gguf.MODEL_TENSOR.ATTN_K,
                     )
                 ):
                     if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_Q4_0,
+                        gguf.LlamaFileType.MOSTLY_Q4_1,
+                    ):
+                        data_qtype = gguf.GGMLQuantizationType.Q5_0
+                    elif self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_Q5_0,
+                        gguf.LlamaFileType.MOSTLY_Q5_1,
+                        gguf.LlamaFileType.MOSTLY_Q6_0,
+                    ):
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype in (
                         gguf.LlamaFileType.MOSTLY_TQ1_0,
                         gguf.LlamaFileType.MOSTLY_TQ2_0,
                     ):
                         # TODO: use Q4_K and Q6_K
                         data_qtype = gguf.GGMLQuantizationType.F16
 
+
                 # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
                     if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -338,6 +352,16 @@ def prepare_tensors(self):
                         data_qtype = gguf.GGMLQuantizationType.F16
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                         data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q4_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_1:
+                        data_qtype = gguf.GGMLQuantizationType.Q4_1
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q5_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q5_1:
+                        data_qtype = gguf.GGMLQuantizationType.Q5_1
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q6_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q6_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
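
Once one of these branches has set data_qtype to a concrete enum value, prepare_tensors() hands the tensor data to gguf-py's numpy quantizer. The snippet below is a standalone sketch of that step, under the assumption that gguf.quants.quantize() supports the chosen type (it does for the legacy formats in recent gguf-py; Q6_0 exists only in this fork):

```python
import numpy as np
import gguf

# Stand-in tensor: the last dimension must be a multiple of the block size
# (32 for the legacy quants), hence 64 here.
data = np.random.rand(4, 64).astype(np.float32)
data_qtype = gguf.GGMLQuantizationType.Q5_0

packed = gguf.quants.quantize(data, data_qtype)   # packed blocks as raw uint8 bytes
print(packed.shape, packed.dtype)
```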
@@ -518,6 +542,15 @@ def prepare_metadata(self, vocab_only: bool):
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)
+        logger.info("Set model quantization version")
+        # self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
+
+        logger.info("****************************************************************************************")
+        logger.info("** converting to `Q4_0`,`Q4_1`,`Q5_0`,`Q5_1` or `Q6_0` is not equiv to `llama-quantize`")
+        logger.info("** `Q4_0`,`Q4_1` are here using embeddings, output, attn_k and attn_v in q5_0")
+        logger.info("** `Q5_0`,`Q5_1` are here using embeddings, output, attn_k and attn_v in q8_0")
+        logger.info("** This, in order to generate a small but reliable conversion to create an iMatrix file.")
+        logger.info("****************************************************************************************")
 
         if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
             self.gguf_writer.add_context_length(n_ctx)
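
Note that the call which would actually record the quantization version stays commented out in this hunk; only the log lines are added. For reference, a hedged sketch of what enabling it would look like with gguf-py's writer API, using a placeholder file name and architecture rather than anything from this commit:

```python
import gguf

# GGUFWriter.add_quantization_version() records the general.quantization_version
# key; GGML_QUANT_VERSION is the constant exported by gguf.constants.
writer = gguf.GGUFWriter("example.gguf", "llama")  # placeholder path and arch
writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
```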
@@ -5110,11 +5143,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
 
-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = DeepseekModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head)
-
         # process the experts separately
         if name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
@@ -6343,8 +6371,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "q6_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, q4_0, q4_1, q5_0, q5_1, q6_0 for a smaller conversion to then create an iMatrix file for example, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -6473,6 +6501,11 @@ def main() -> None:
         "f32": gguf.LlamaFileType.ALL_F32,
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0,
+        "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
+        "q5_0": gguf.LlamaFileType.MOSTLY_Q5_0,
+        "q5_1": gguf.LlamaFileType.MOSTLY_Q5_1,
+        "q6_0": gguf.LlamaFileType.MOSTLY_Q6_0,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
         "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
