convert-*.py: autogenerate general.uuid if missing #8565

Closed
25 changes: 25 additions & 0 deletions convert_hf_to_gguf.py
@@ -10,6 +10,8 @@
import os
import re
import sys
import uuid
import hashlib
from enum import IntEnum
from pathlib import Path
from hashlib import sha256
@@ -62,6 +64,7 @@ class Model:
    gguf_writer: gguf.GGUFWriter
    model_name: str | None
    metadata_override: Path | None
    generated_source_uuid: str | None
    dir_model_card: Path

    # subclasses should define this!
@@ -262,9 +265,17 @@ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: i
        return False

    def prepare_tensors(self):

        uuidv5_sha1 = hashlib.sha1()
        uuidv5_sha1.update(uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5').bytes)

        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

        for name, data_torch in self.get_tensors():

            uuidv5_data_buffer: np.ndarray = data_torch.numpy()
            uuidv5_sha1.update(uuidv5_data_buffer.data.tobytes('C'))

            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                continue
@@ -344,6 +355,9 @@ def prepare_tensors(self):

                self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)

        # Generate a UUID from the source tensor content; it is used later if the source model provides no UUID of its own
        self.generated_source_uuid = str(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5))

    def set_type(self):
        self.gguf_writer.add_type(gguf.GGUFType.MODEL)

@@ -382,6 +396,13 @@ def prepare_metadata(self, vocab_only: bool):
        # Process templated file name with the output ftype, useful with the "auto" ftype
        self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)

        if not vocab_only:
            if self.metadata.source_uuid is not None:
                logger.info(f"Source UUID present: {self.metadata.source_uuid}")
            elif self.generated_source_uuid is not None:
                logger.info(f"Source UUID missing. Using generated source uuid: {self.generated_source_uuid}")
                self.metadata.source_uuid = self.generated_source_uuid

        self.set_type()

        logger.info("Set meta model")
@@ -3484,6 +3505,10 @@ class LazyTorchTensor(gguf.LazyBase):
    _dtype_map: dict[torch.dtype, type] = {
        torch.float16: np.float16,
        torch.float32: np.float32,
        torch.float64: np.float64,

        # No direct mapping available. Cast upwards to avoid loss of precision
        torch.bfloat16: np.float32,
    }

    # used for safetensors slices
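
A note on the _dtype_map change above: NumPy has no native bfloat16 dtype, so the new torch.bfloat16 entry upcasts to np.float32, which is lossless because bfloat16 is a truncated float32 (same 8-bit exponent, shorter mantissa). This presumably lets the data_torch.numpy() call used in the UUID hashing handle bfloat16 and float64 source tensors. The snippet below illustrates the upcast in isolation, assuming PyTorch and NumPy are installed; the example values are arbitrary and not taken from the PR.

import numpy as np
import torch

x = torch.tensor([1.0, 2.5, -3.25], dtype=torch.bfloat16)

# Calling x.numpy() directly would fail, since NumPy has no bfloat16 scalar type;
# casting up to float32 first preserves every bfloat16 value exactly.
y = x.to(torch.float32).numpy()

print(y.dtype)                               # float32
print(np.array_equal(y, [1.0, 2.5, -3.25]))  # True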
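The UUID generation itself boils down to: seed a SHA-1 hash with a fixed namespace UUID, fold in the raw bytes of every tensor, then turn the first 16 bytes of the digest into a version-5 UUID. The standalone sketch below reproduces that derivation; the helper name source_uuid_from_tensors and the example arrays are illustrative and not part of the PR.

import hashlib
import uuid

import numpy as np

# Namespace UUID copied from the diff above
UUIDV5_NAMESPACE = uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5')

def source_uuid_from_tensors(tensors):
    sha1 = hashlib.sha1()
    sha1.update(UUIDV5_NAMESPACE.bytes)      # seed with the namespace, like RFC 4122 name-based UUIDs
    for t in tensors:
        sha1.update(t.tobytes())             # raw tensor bytes in C (row-major) order
    # SHA-1 yields 20 bytes; keep the first 16 and stamp the version-5 bits
    return str(uuid.UUID(bytes=sha1.digest()[:16], version=5))

# Identical tensor contents always produce the same UUID
a = np.arange(16, dtype=np.float32).reshape(4, 4)
print(source_uuid_from_tensors([a]))
print(source_uuid_from_tensors([a.copy()]))  # same UUID as above

Hashing by hand rather than calling uuid.uuid5() appears to be what lets the converter stream tensor data through the digest chunk by chunk instead of concatenating it into a single name argument.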