ggml-org
diff --git a/.gitignore b/.gitignore
Lines changed: 6 additions & 5 deletions
diff --git a/__init__.py b/__init__.py
diff --git a/convert-hf-to-gguf.py b/convert_hf_to_gguf.py (renamed)
diff --git a/convert-hf-to-gguf-update.py b/convert_hf_to_gguf_update.py (renamed)
Lines changed: 128 additions & 30 deletions
diff --git a/convert-llama-ggml-to-gguf.py b/convert_llama_ggml_to_gguf.py (renamed)
diff --git a/convert_lora_to_ggml.py b/convert_lora_to_ggml.py
Lines changed: 149 additions & 0 deletions
@@ -98,13 +98,14 @@ examples/server/*.mjs.hpp
 
 # Python
 
-__pycache__
-.venv
-/Pipfile
-dist
-poetry.lock
+/.venv
+/__pycache__/
+*/poetry.lock
 poetry.toml
 
+# Nix
+/result
+
 # Test binaries
 /tests/test-backend-ops
 /tests/test-double-float
 
@@ -49,7 +49,7 @@ class TOKENIZER_TYPE(IntEnum):
 
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 #       will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+chktxt = "\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL"
 
 if len(sys.argv) == 2:
     token = sys.argv[1]
@@ -63,29 +63,121 @@ class TOKENIZER_TYPE(IntEnum):
 
 # TODO: add models here, base models preferred
 models = [
-    {"name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    {"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-    {"name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
-    {"name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
-    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-    {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
-    {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
-    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
-    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
-    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-    {"name": "stablelm2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
-    {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
-    {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
-    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
-    {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
-    {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
-    {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
-    {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
-    {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
-    {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
-    {"name": "poro-chat",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
-    {"name": "jina-v2-code",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
-    {"name": "viking",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
+    {
+        "name": "llama-spm",
+        "tokt": TOKENIZER_TYPE.SPM,
+        "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf",
+    },
+    {
+        "name": "llama-bpe",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B",
+    },
+    {
+        "name": "phi-3",
+        "tokt": TOKENIZER_TYPE.SPM,
+        "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct",
+    },
+    {
+        "name": "deepseek-llm",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base",
+    },
+    {
+        "name": "deepseek-coder",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base",
+    },
+    {
+        "name": "falcon",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/tiiuae/falcon-7b",
+    },
+    {
+        "name": "bert-bge",
+        "tokt": TOKENIZER_TYPE.WPM,
+        "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5",
+    },
+    {
+        "name": "mpt",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/mosaicml/mpt-7b",
+    },
+    {
+        "name": "starcoder",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/bigcode/starcoder2-3b",
+    },
+    {
+        "name": "gpt-2",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/openai-community/gpt2",
+    },
+    {
+        "name": "stablelm2",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b",
+    },
+    {
+        "name": "refact",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base",
+    },
+    {
+        "name": "command-r",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01",
+    },
+    {
+        "name": "qwen2",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/Qwen/Qwen1.5-7B",
+    },
+    {
+        "name": "olmo",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf",
+    },
+    {
+        "name": "dbrx",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/databricks/dbrx-base",
+    },
+    {
+        "name": "jina-v2-en",
+        "tokt": TOKENIZER_TYPE.WPM,
+        "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en",
+    },  # WPM!
+    {
+        "name": "jina-v2-es",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es",
+    },
+    {
+        "name": "jina-v2-de",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de",
+    },
+    {
+        "name": "smaug-bpe",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct",
+    },
+    {
+        "name": "poro-chat",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat",
+    },
+    {
+        "name": "jina-v2-code",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code",
+    },
+    {
+        "name": "viking",
+        "tokt": TOKENIZER_TYPE.BPE,
+        "repo": "https://huggingface.co/LumiOpen/Viking-7B",
+    },  # Also used for Viking 13B and 33B
 ]
 
 
@@ -94,7 +186,7 @@ def download_file_with_auth(url, token, save_path):
     response = sess.get(url, headers=headers)
     response.raise_for_status()
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as f:
+    with open(save_path, "wb") as f:
         f.write(response.content)
     logger.info(f"File {save_path} downloaded successfully")
 
@@ -144,7 +236,9 @@ def download_model(model):
     try:
         tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
     except OSError as e:
-        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
+        logger.error(
+            f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}"
+        )
         continue  # Skip to the next model if the tokenizer can't be loaded
 
     chktok = tokenizer.encode(chktxt)
@@ -164,13 +258,15 @@ def download_model(model):
         pre_tokenizer = cfg["pre_tokenizer"]
         logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
         if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
+            logger.info(
+                "ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4)
+            )
 
     logger.info("")
 
-    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
+    src_ifs += f'        if chkhsh == "{chkhsh}":\n'
     src_ifs += f"            # ref: {model['repo']}\n"
-    src_ifs += f"            res = \"{name}\"\n"
+    src_ifs += f'            res = "{name}"\n'
 
 src_func = f"""
     def get_vocab_base_pre(self, tokenizer) -> str:
@@ -326,6 +422,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
 for model in models:
     name = model["name"]
 
-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
+    # print the command with the renamed script (convert_hf_to_gguf.py); noqa stays on the `print(` line
+    print(  # noqa: NP100
+        f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only"
+    )
 
 logger.info("\n")
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import Any, BinaryIO, Sequence
+
+import numpy as np
+import torch
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
+
+
+def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
+    """Write the ggml LoRA file header to ``fout``.
+
+    The header consists of the magic bytes ``b"ggla"`` reversed, followed by
+    three ints packed with the native ``"i"`` struct format: the file
+    version (1), ``params["r"]`` and ``params["lora_alpha"]``.
+
+    Args:
+        fout: binary stream the header is written to.
+        params: adapter configuration; must contain ``"r"`` and ``"lora_alpha"``.
+
+    Raises:
+        ValueError: if ``lora_alpha`` cannot be converted to int losslessly.
+    """
+    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
+    # but some models ship a float value instead
+    # let's convert to int, but fail if lossless conversion is not possible
+    # (explicit raise instead of assert so the check survives `python -O`,
+    # and done before any write so a bad value leaves no partial header)
+    lora_alpha = params["lora_alpha"]
+    if int(lora_alpha) != lora_alpha:
+        raise ValueError("cannot convert float to int losslessly")
+
+    fout.write(b"ggla"[::-1])  # magic (ggml lora)
+    fout.write(struct.pack("i", 1))  # file version
+    fout.write(struct.pack("i", params["r"]))
+    fout.write(struct.pack("i", int(lora_alpha)))
+
+
+def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
+    """Write the header of one tensor to ``fout`` and pad to a 32-byte boundary.
+
+    Layout: three ints (number of dimensions, byte length of the UTF-8
+    encoded name, type code looked up in ``NUMPY_TYPE_TO_FTYPE`` by the
+    dtype's name), then the dimensions in reverse order, then the name bytes.
+    """
+    name_bytes = name.encode("utf-8")
+    n_dims = len(shape)
+    ftype = NUMPY_TYPE_TO_FTYPE[data_type.name]
+
+    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype))
+    # dimensions are stored last-to-first
+    fout.write(struct.pack("i" * n_dims, *shape[::-1]))
+    fout.write(name_bytes)
+
+    # advance the file position to the next multiple of 32
+    pos = fout.tell()
+    fout.seek(pos + (-pos) % 32)
+
+
+if __name__ == '__main__':
+    # Script entry point: read a HuggingFace PEFT LoRA adapter from the directory
+    # given as argv[1] and write "ggml-adapter-model.bin" into that same directory.
+    if len(sys.argv) < 2:
+        print(f"Usage: python {sys.argv[0]} <path> [arch]")
+        print(
+            "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
+        )
+        print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
+        sys.exit(1)
+
+    input_json = os.path.join(sys.argv[1], "adapter_config.json")
+    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
+    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
+
+    # prefer the torch checkpoint; fall back to the safetensors file when it is absent
+    if os.path.exists(input_model):
+        # NOTE(review): depending on the torch version and its `weights_only` default,
+        # torch.load may unpickle arbitrary objects - only run this on trusted adapters
+        model = torch.load(input_model, map_location="cpu")
+    else:
+        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
+        # lazy import load_file only if lora is in safetensors format.
+        from safetensors.torch import load_file
+        model = load_file(input_model, device="cpu")
+
+    # the arch argument is only honoured when exactly one extra argument is given
+    # NOTE(review): with more than two arguments this silently falls back to "llama"
+    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
+
+    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
+        print(f"Error: unsupported architecture {arch_name}")
+        sys.exit(1)
+
+    # reverse lookup: find the MODEL_ARCH_NAMES key whose value is arch_name
+    arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
+    name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
+
+    with open(input_json, "r") as f:
+        params = json.load(f)
+
+    # reject adapter configurations this converter does not handle
+    if params["peft_type"] != "LORA":
+        print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
+        sys.exit(1)
+
+    if params["fan_in_fan_out"] is True:
+        print("Error: param fan_in_fan_out is not supported")
+        sys.exit(1)
+
+    if params["bias"] is not None and params["bias"] != "none":
+        print("Error: param bias is not supported")
+        sys.exit(1)
+
+    # TODO: these seem to be layers that have been trained but without lora.
+    # doesn't seem widely used but eventually should be supported
+    if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
+        print("Error: param modules_to_save is not supported")
+        sys.exit(1)
+
+    with open(output_path, "wb") as fout:
+        # NOTE(review): mode "wb" already truncates the file, so this call looks redundant
+        fout.truncate()
+
+        write_file_header(fout, params)
+        for k, v in model.items():
+            # keep the original key for error messages; k is rewritten below
+            orig_k = k
+            if k.endswith(".default.weight"):
+                k = k.replace(".default.weight", ".weight")
+            # these two tensors are skipped and not written to the output
+            if k in ["llama_proj.weight", "llama_proj.bias"]:
+                continue
+            # lora_A tensors keep float16/float32 as-is (anything else is cast with
+            # .float()) and are transposed; all other tensors are cast with .float()
+            if k.endswith("lora_A.weight"):
+                if v.dtype != torch.float16 and v.dtype != torch.float32:
+                    v = v.float()
+                v = v.T
+            else:
+                v = v.float()
+
+            t = v.detach().numpy()
+
+            prefix = "base_model.model."
+            if k.startswith(prefix):
+                k = k[len(prefix) :]
+
+            # both suffixes have the same length, so the length of the first one
+            # is used to split off whichever suffix matched
+            lora_suffixes = (".lora_A.weight", ".lora_B.weight")
+            if k.endswith(lora_suffixes):
+                suffix = k[-len(lora_suffixes[0]):]
+                k = k[: -len(lora_suffixes[0])]
+            else:
+                print(f"Error: unrecognized tensor name {orig_k}")
+                sys.exit(1)
+
+            # translate the remaining module name through the arch tensor name map
+            tname = name_map.get_name(k)
+            if tname is None:
+                print(f"Error: could not map tensor name {orig_k}")
+                print(" Note: the arch parameter must be specified if the model is not llama")
+                sys.exit(1)
+
+            if suffix == ".lora_A.weight":
+                tname += ".weight.loraA"
+            elif suffix == ".lora_B.weight":
+                tname += ".weight.loraB"
+            else:
+                # not reachable: suffix was checked against lora_suffixes above
+                assert False
+
+            print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
+            # header first, then the raw tensor data
+            write_tensor_header(fout, tname, t.shape, t.dtype)
+            t.tofile(fout)
+
+    print(f"Converted {input_json} and {input_model} to {output_path}")
+