convert-hf-to-gguf.py: add metadata override

mofosyne · mofosyne · commit c704442d7af6 · 2024-05-24T04:27:11.000+10:00
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
@@ -13,6 +13,7 @@
 from pathlib import Path
 from hashlib import sha256
-from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
+from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Optional, Sequence, TypeVar, cast
+from dataclasses import dataclass
 
 import math
 import numpy as np
@@ -30,6 +31,42 @@
 logger = logging.getLogger("hf-to-gguf")
 
 
+@dataclass
+class Metadata:
+    """Optional overrides for the GGUF general.* keys, loaded from a JSON file."""
+    name: Optional[str] = None
+    author: Optional[str] = None
+    version: Optional[str] = None
+    url: Optional[str] = None
+    description: Optional[str] = None
+    licence: Optional[str] = None  # British spelling; the JSON key is "general.license"
+    source_url: Optional[str] = None
+    source_hf_repo: Optional[str] = None
+
+    @staticmethod
+    def load(metadata_path: Optional[Path]) -> Metadata:
+        """Return the Metadata described by the JSON file at metadata_path.
+
+        An all-None Metadata is returned when the path is None or does not exist.
+        """
+        if metadata_path is None or not metadata_path.exists():
+            return Metadata()
+
+        with open(metadata_path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+        # Keys are based on the LLM_KV_NAMES mapping in llama.cpp; absent keys stay None.
+        return Metadata(
+            name=data.get("general.name"),
+            author=data.get("general.author"),
+            version=data.get("general.version"),
+            url=data.get("general.url"),
+            description=data.get("general.description"),
+            licence=data.get("general.license"),
+            source_url=data.get("general.source.url"),
+            source_hf_repo=data.get("general.source.huggingface.repository"),
+        )
+
+
 ###### MODEL DEFINITIONS ######
 
 class SentencePieceTokenTypes(IntEnum):
@@ -62,11 +99,12 @@ class Model:
     fname_out: Path
     fname_default: Path
     gguf_writer: gguf.GGUFWriter
+    metadata: Metadata
 
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, metadata: Metadata):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
         self.dir_model = dir_model
@@ -83,6 +121,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
+        self.metadata = metadata
         if self.ftype == gguf.LlamaFileType.GUESSED:
             # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
             _, first_tensor = next(self.get_tensors())
@@ -200,8 +239,34 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", "
             raise ValueError(f"Can not map tensor {name!r}")
         return new_name
 
+    def set_gguf_meta_model(self):
+        """Write the model name and any general.* metadata overrides to the GGUF writer."""
+        # Metadata about the model and its provenance.
+        # Name priority: metadata override, then model directory name, then "LLaMA".
+        name = "LLaMA"
+        if self.metadata is not None and self.metadata.name is not None:
+            name = self.metadata.name
+        elif self.dir_model is not None:
+            name = self.dir_model.name
+        self.gguf_writer.add_name(name)
+
+        if self.metadata is not None:
+            if self.metadata.author is not None:
+                self.gguf_writer.add_author(self.metadata.author)
+            if self.metadata.version is not None:
+                self.gguf_writer.add_version(self.metadata.version)
+            if self.metadata.url is not None:
+                self.gguf_writer.add_url(self.metadata.url)
+            if self.metadata.description is not None:
+                self.gguf_writer.add_description(self.metadata.description)
+            if self.metadata.licence is not None:
+                self.gguf_writer.add_licence(self.metadata.licence)
+            if self.metadata.source_url is not None:
+                self.gguf_writer.add_source_url(self.metadata.source_url)
+            if self.metadata.source_hf_repo is not None:
+                self.gguf_writer.add_source_hf_repo(self.metadata.source_hf_repo)
+
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_block_count(self.block_count)
 
         if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -2588,6 +2653,10 @@ def parse_args() -> argparse.Namespace:
         "--verbose", action="store_true",
         help="increase output verbosity",
     )
+    parser.add_argument(
+        "--metadata", type=Path,
+        help="Specify the path for a metadata file"
+    )
     parser.add_argument(
         "--get-outfile", action="store_true",
         help="get calculated default outfile name"
@@ -2607,6 +2676,7 @@ def main() -> None:
     else:
         logging.basicConfig(level=logging.INFO)
 
+    metadata = Metadata.load(args.metadata)
     dir_model = args.model
 
     if args.awq_path:
@@ -2642,12 +2712,15 @@ def main() -> None:
         encodingScheme = ftype_map[args.outtype]
         model_architecture = hparams["architectures"][0]
         model_class = Model.from_model_architecture(model_architecture)
-        model_instance = model_class(dir_model, encodingScheme, args.outfile, args.bigendian, args.use_temp_file, args.no_lazy)
+        model_instance = model_class(dir_model, encodingScheme, args.outfile, args.bigendian, args.use_temp_file, args.no_lazy, metadata)
 
         if args.get_outfile:
             print(f"{model_instance.fname_default}") # noqa: NP100
             return
 
+        logger.info("Set meta model")
+        model_instance.set_gguf_meta_model()
+
         logger.info("Set model parameters")
         model_instance.set_gguf_parameters()