Commit 108e53c
llama : add support for GPT2, Bloom and CodeShell tied word embeddings (#12456)
* Add support for GPT2, Bloom and CodeShell tied word embeddings
* Deduplicate tied word embeddings weights
* Workaround for incorrect weight map: it appears transformer.wte.weight is in the weight map even though the weights are not there; remove it if the output weights are encountered first.
* check++
* fatfingers--
1 parent a686171 commit 108e53c
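
For context, "tied" word embeddings means the output (LM head) projection reuses the token-embedding matrix instead of storing a second copy of it. A minimal PyTorch sketch of the idea follows; it is illustrative only, not code from this commit, and the sizes and names are made up:

# Illustrative sketch of tied word embeddings (not llama.cpp code):
# the LM head shares its weight with the token embedding, so only one
# (n_vocab, n_embd) tensor needs to be stored or converted.
import torch
import torch.nn as nn

n_vocab, n_embd = 1000, 64  # toy sizes

tok_embd = nn.Embedding(n_vocab, n_embd)
lm_head = nn.Linear(n_embd, n_vocab, bias=False)
lm_head.weight = tok_embd.weight  # tie: both modules point at one tensor

ids = torch.randint(0, n_vocab, (1, 8))
logits = lm_head(tok_embd(ids))  # shape (1, 8, n_vocab)
assert lm_head.weight.data_ptr() == tok_embd.weight.data_ptr()

This is why the converter no longer needs to write output.weight for these architectures, and why the loader can fall back to token_embd.weight when output.weight is absent.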

File tree

2 files changed, +34 -24 lines


convert_hf_to_gguf.py

Lines changed: 16 additions & 21 deletions
@@ -180,7 +180,8 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
             extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
             missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
             if len(extra) == 0 and len(missing_files) > 0:
-                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+                raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
+                                 f"Missing tensors: {missing}")
             else:
                 raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
                                  f"Missing tensors: {missing}\n"
@@ -1099,13 +1100,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

         tensors.append((self.map_tensor_name(name), data_torch))

-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
-
-            # TODO: tie them at runtime, don't duplicate in the model file
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors


@@ -2423,10 +2417,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

         tensors.append((new_name, data_torch))

-        # note: GPT2 output is tied to (same as) wte in original model
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors


@@ -2756,21 +2746,26 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)

+    _has_tok_embd = False
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused

-        new_name = self.map_tensor_name(name)
-
-        tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)

-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            assert self.tensor_names is not None
+        new_name = self.map_tensor_name(name)

-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                # copy tok_embd.weight to output.weight
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
+        # assuming token_embd.weight is seen before output.weight
+        if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
+            if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
+                logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
+                self.tensor_names.remove("transformer.wte.weight")
+        elif new_name == tok_embd_name:
+            self._has_tok_embd = True

-        return tensors
+        return [(new_name, data_torch)]


 @Model.register("InternLM2ForCausalLM")
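
After conversion, the effect of these changes can be spot-checked from Python: a model with tied embeddings should contain token_embd.weight but no output.weight. A small sketch, assuming the gguf Python package's GGUFReader and a hypothetical model path:

# Sketch: verify a converted GGUF omits the duplicated output tensor.
# Assumes the gguf Python package; the path below is hypothetical.
from gguf import GGUFReader

reader = GGUFReader("models/codeshell-7b.gguf")  # hypothetical file
names = {t.name for t in reader.tensors}

print("token_embd.weight" in names)  # expected: True
print("output.weight" in names)      # expected: False for tied-embedding models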

src/llama-model.cpp

Lines changed: 18 additions & 3 deletions
@@ -2020,7 +2020,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 // output
                 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }

                 for (int i = 0; i < n_layer; ++i) {
                     auto & layer = layers[i];
@@ -2381,7 +2386,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 // output
                 output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                 output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }

                 for (int i = 0; i < n_layer; ++i) {
                     auto & layer = layers[i];
@@ -2407,7 +2417,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_CODESHELL:
                 {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if tok embd is NULL, init from output
+                    if (tok_embd == NULL) {
+                        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }

                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
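
End to end, a GPT2, Bloom or CodeShell checkpoint with tied embeddings now converts without a duplicated output tensor and still loads, because the loader falls back to the token embedding. A hedged sketch of exercising that with the llama-cpp-python bindings (the model path is hypothetical):

# Sketch: load a converted tied-embedding model via llama-cpp-python.
# Assumes the llama_cpp bindings are installed; the path is hypothetical.
from llama_cpp import Llama

llm = Llama(model_path="models/gpt2.gguf", n_ctx=512)
out = llm("def hello():", max_tokens=16)
print(out["choices"][0]["text"])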
