
Commit 3870164

convert-hf : allow unusual model part names
For example, loading `model-00001-of-00001.safetensors` now works.

* convert-hf : fix stacking MoE expert tensors

  `torch.stack` and `torch.cat` don't do the same thing: `stack` creates a new leading dimension, while `cat` concatenates along an existing one.

* convert-hf : fix Mamba conversion

  Tested to work even with a SentencePiece-based tokenizer.
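Since the stacking point is the crux of the MoE fix, here is a quick illustration (shapes arbitrary, not taken from any particular model):

```python
import torch

# Eight per-expert weight matrices of shape (ffn, hidden). The converter
# merges them into one 3D tensor with a leading expert dimension.
experts = [torch.randn(64, 32) for _ in range(8)]

print(torch.stack(experts, dim=0).shape)  # torch.Size([8, 64, 32]) -- new expert axis
print(torch.cat(experts, dim=0).shape)    # torch.Size([512, 32]) -- rows merged
```

With `cat` the expert boundary is lost, so the merged tensor had the wrong rank.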
1 parent 56f60f5 commit 3870164

File tree

1 file changed: convert-hf-to-gguf.py (30 additions, 41 deletions)
@@ -49,9 +49,8 @@ class Model(Protocol):
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
+    part_names: list[str]
     is_safetensors: bool
-    num_parts: int
-    part_names: Iterable[str]
     hparams: dict[str, Any]
     gguf_writer: gguf.GGUFWriter
     block_count: int
@@ -67,9 +66,10 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian:
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
-        self.is_safetensors = self._is_model_safetensors()
-        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
-        self.part_names = self._get_part_names()
+        self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+        self.is_safetensors = len(self.part_names) > 0
+        if not self.is_safetensors:
+            self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
         self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
@@ -109,7 +109,7 @@ def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suf
             sys.exit()
         if "{bid}" in name:
             assert bid is not None
-            name = name.format(bid)
+            name = name.format(bid=bid)
         return name + suffix

     def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
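For context on the `format(bid=bid)` fix: `{bid}` is a *named* replacement field, so `str.format` only fills it from a keyword argument; the old positional call raised `KeyError`. A minimal illustration (the template string is just an example in the gguf naming style):

```python
template = "blk.{bid}.attn_norm"

print(template.format(bid=12))  # 'blk.12.attn_norm'
template.format(12)             # KeyError: 'bid' -- positional args don't bind named fields
```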
@@ -228,13 +228,13 @@ def write_vocab(self):
         self.gguf_writer.close()

     @staticmethod
-    def count_model_parts(dir_model: Path, prefix: str) -> int:
-        num_parts = 0
+    def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+        part_names: list[str] = []
         for filename in os.listdir(dir_model):
-            if filename.endswith(prefix):
-                num_parts += 1
+            if filename.endswith(suffix):
+                part_names.append(filename)

-        return num_parts
+        return part_names

     @staticmethod
     def load_hparams(dir_model):
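For context on the rename: `count_model_parts` only counted matching files, and `_get_part_names` (removed in the next hunk) then guessed canonical filenames from that count. `get_model_part_names` returns the names that actually exist, which is what makes unusual part names work. A sketch of the difference (directory contents hypothetical):

```python
# Suppose the model directory holds a single, sharded-style part:
files = ["config.json", "model-00001-of-00001.safetensors"]

# Old behavior: count parts, then guess the canonical name.
num_parts = sum(f.endswith(".safetensors") for f in files)
guessed = ["model.safetensors"] if num_parts == 1 else [
    f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
]
# guessed == ['model.safetensors'] -> FileNotFoundError when opened

# New behavior: keep whatever filenames are really there.
part_names = [f for f in files if f.endswith(".safetensors")]
# part_names == ['model-00001-of-00001.safetensors']
```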
@@ -258,19 +258,6 @@ def from_model_architecture(cls, arch):
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

-    def _is_model_safetensors(self) -> bool:
-        return Model.count_model_parts(self.dir_model, ".safetensors") > 0
-
-    def _get_part_names(self) -> Iterable[str]:
-        if self.is_safetensors:
-            if self.num_parts == 1:  # there's only one .safetensors file
-                return ("model.safetensors",)
-            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
-
-        if self.num_parts == 1:  # there's only one .bin file
-            return ("pytorch_model.bin",)
-        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
-
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
@@ -446,7 +433,7 @@ def _set_vocab_sentencepiece(self):
             raise FileNotFoundError(f"File not found: {tokenizer_path}")

         tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(tokenizer_path)
+        tokenizer.LoadFromFile(str(tokenizer_path))

         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
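The `str(...)` wrapper is the whole fix here: `tokenizer_path` is a `pathlib.Path`, and `SentencePieceProcessor.LoadFromFile` goes through SWIG bindings that, at least in sentencepiece releases from this period, accept only a plain string. A minimal sketch (path hypothetical):

```python
from pathlib import Path
from sentencepiece import SentencePieceProcessor

tokenizer_path = Path("my-model") / "tokenizer.model"  # hypothetical path

sp = SentencePieceProcessor()
# sp.LoadFromFile(tokenizer_path)     # TypeError from the SWIG wrapper
sp.LoadFromFile(str(tokenizer_path))  # works
```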
@@ -1120,7 +1107,7 @@ def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_
             ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
             datas.append(norms[ename])
             del norms[ename]
-        data_torch = torch.cat(datas, dim=0)
+        data_torch = torch.stack(datas, dim=0)

         merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
         new_name = self.map_tensor_name(merged_name)
@@ -1204,7 +1191,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             assert bid is not None

             if self._experts is None:
-                self._experts = [{} for _ in range(n_experts)]
+                self._experts = [{} for _ in range(self.block_count)]

             self._experts[bid][name] = data_torch
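This one-line change fixes a subtle sizing bug: `self._experts` is indexed by the block id `bid`, so it needs one dict per *layer*, not one per expert; sizing it with `n_experts` only avoided an `IndexError` while the expert count happened to cover the layer count. Schematically (counts hypothetical):

```python
block_count, n_experts = 32, 8  # hypothetical hparams

experts = [{} for _ in range(n_experts)]    # old: only 8 slots
# experts[20][...]                          # IndexError for bid >= 8

experts = [{} for _ in range(block_count)]  # fixed: one dict per layer
experts[20]["model.layers.20.mlp.experts.0.w1.weight"] = ...  # placeholder for the Tensor
```

The same fix is applied to the other MoE architectures in the hunks below.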
@@ -1220,7 +1207,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                     datas.append(self._experts[bid][ename])
                     del self._experts[bid][ename]

-                data_torch = torch.cat(datas, dim=0)
+                data_torch = torch.stack(datas, dim=0)

                 merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
@@ -1267,7 +1254,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             assert bid is not None

             if self._experts is None:
-                self._experts = [{} for _ in range(n_experts)]
+                self._experts = [{} for _ in range(self.block_count)]

             self._experts[bid][name] = data_torch
@@ -1283,7 +1270,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                     datas.append(self._experts[bid][ename])
                     del self._experts[bid][ename]

-                data_torch = torch.cat(datas, dim=0)
+                data_torch = torch.stack(datas, dim=0)

                 merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
@@ -1484,7 +1471,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             assert bid is not None

             if self._experts is None:
-                self._experts = [{} for _ in range(n_experts)]
+                self._experts = [{} for _ in range(self.block_count)]

             self._experts[bid][name] = data_torch
@@ -1500,7 +1487,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                     datas.append(self._experts[bid][ename])
                     del self._experts[bid][ename]

-                data_torch = torch.cat(datas, dim=0)
+                data_torch = torch.stack(datas, dim=0)

                 merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
@@ -1604,7 +1591,7 @@ def set_vocab(self):
             sys.exit(1)

         tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(tokenizer_path)
+        tokenizer.LoadFromFile(str(tokenizer_path))

         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
@@ -1786,7 +1773,7 @@ def set_vocab(self):
         add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix

         tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(tokenizer_path)
+        tokenizer.LoadFromFile(str(tokenizer_path))
         tokenizer.serialized_model_proto

         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
@@ -2171,13 +2158,15 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
     def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
         del n_dims  # unused

-        return new_name in (self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
-            gguf.MODEL_TENSOR.SSM_CONV1D,
-            gguf.MODEL_TENSOR.SSM_X,
-            gguf.MODEL_TENSOR.SSM_DT,
-            gguf.MODEL_TENSOR.SSM_A,
-            gguf.MODEL_TENSOR.SSM_D,
-        ])
+        return bid is not None and new_name in (
+            self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
+                gguf.MODEL_TENSOR.SSM_CONV1D,
+                gguf.MODEL_TENSOR.SSM_X,
+                gguf.MODEL_TENSOR.SSM_DT,
+                gguf.MODEL_TENSOR.SSM_A,
+                gguf.MODEL_TENSOR.SSM_D,
+            ]
+        )


 @Model.register("CohereForCausalLM")
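The added `bid is not None` guard is what fixes Mamba conversion here: `format_tensor_name` asserts `bid is not None` whenever a tensor-name template contains `{bid}` (see the earlier hunk), and `extra_f32_tensors` is also called for tensors that have no block id. Because `and` short-circuits, the generator expression is never evaluated in that case. A tiny sketch of the pattern:

```python
def name_matches(bid: int) -> bool:
    assert bid is not None  # mirrors the assert inside format_tensor_name
    return bid % 2 == 0     # stand-in for the real membership test

def extra_f32(bid: int | None) -> bool:
    # `and` short-circuits: for bid=None the right-hand side never runs,
    # so its assert cannot fire.
    return bid is not None and name_matches(bid)

print(extra_f32(None))  # False, no AssertionError
print(extra_f32(4))     # True
```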
