
Commit ea9c8e1

llama : add support for Nomic Embed (#5468)
1 parent: c4e6dd5

File tree

4 files changed: +273 additions, -113 deletions


convert-hf-to-gguf.py

Lines changed: 78 additions & 39 deletions
@@ -10,7 +10,7 @@
 import sys
 from enum import IntEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast
 
 import numpy as np
 import torch
@@ -25,15 +25,6 @@
 from convert import HfVocab
 
 
-# check for any of the given keys in the dictionary and return the value of the first key found
-def get_key_opts(d, keys):
-    for k in keys:
-        if k in d:
-            return d[k]
-    print(f"Could not find any of {keys}")
-    sys.exit()
-
-
 ###### MODEL DEFINITIONS ######
 
 class SentencePieceTokenTypes(IntEnum):
@@ -58,6 +49,15 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian:
         self.hparams = Model.load_hparams(self.dir_model)
         self.model_arch = self._get_model_architecture()
         self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+
+    def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
+        key = next((k for k in keys if k in self.hparams), None)
+        if key is not None:
+            return self.hparams[key]
+        if optional:
+            return None
+        raise KeyError(f"could not find any of: {keys}")
 
     def set_vocab(self):
         self._set_vocab_gpt2()
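Note: the new `find_hparam` method replaces the old module-level `get_key_opts` helper and raises a `KeyError` instead of calling `sys.exit()`. A minimal standalone sketch of the same alias-lookup pattern, using made-up example configs rather than real model hyperparameters:

```python
from __future__ import annotations

from typing import Any, Sequence


def find_hparam(hparams: dict[str, Any], keys: Sequence[str], optional: bool = False) -> Any:
    # return the value of the first key that is present, in priority order
    key = next((k for k in keys if k in hparams), None)
    if key is not None:
        return hparams[key]
    if optional:
        return None
    raise KeyError(f"could not find any of: {keys}")


# HF configs name the same hyperparameter differently across architectures
bert_style = {"num_hidden_layers": 12, "hidden_size": 768}   # hypothetical config
gpt2_style = {"n_layer": 12, "n_embd": 768}                  # hypothetical config

assert find_hparam(bert_style, ["n_layers", "num_hidden_layers", "n_layer"]) == 12
assert find_hparam(gpt2_style, ["n_layers", "num_hidden_layers", "n_layer"]) == 12
assert find_hparam(gpt2_style, ["rms_norm_eps"], optional=True) is None
```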
@@ -79,28 +79,33 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_block_count(self.hparams.get(
-            "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
-        ))
-        if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
+        self.gguf_writer.add_block_count(self.block_count)
+
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
             self.gguf_writer.add_context_length(n_ctx)
-        if (n_embd := self.hparams.get("hidden_size")) is not None:
-            self.gguf_writer.add_embedding_length(n_embd)
-        if (n_ff := self.hparams.get("intermediate_size")) is not None:
+
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        self.gguf_writer.add_embedding_length(n_embd)
+
+        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
-        if (n_head := self.hparams.get("num_attention_heads")) is not None:
-            self.gguf_writer.add_head_count(n_head)
+
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        self.gguf_writer.add_head_count(n_head)
+
         if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
             self.gguf_writer.add_head_count_kv(n_head_kv)
 
-        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
-            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
+        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
+            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
+        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None:
+            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
         if (n_experts := self.hparams.get("num_local_experts")) is not None:
             self.gguf_writer.add_expert_count(n_experts)
         if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
             self.gguf_writer.add_expert_used_count(n_experts_used)
 
-        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
+        self.gguf_writer.add_file_type(self.ftype)
 
     def write_tensors(self):
         block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@@ -211,6 +216,8 @@ def from_model_architecture(model_architecture):
             return MiniCPMModel
         if model_architecture == "BertModel":
             return BertModel
+        if model_architecture == "NomicBertModel":
+            return NomicBertModel
         return Model
 
     def _is_model_safetensors(self) -> bool:
@@ -268,6 +275,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
             return gguf.MODEL_ARCH.MINICPM
         if arch == "BertModel":
             return gguf.MODEL_ARCH.BERT
+        if arch == "NomicBertModel":
+            return gguf.MODEL_ARCH.NOMIC_BERT
 
         raise NotImplementedError(f'Architecture "{arch}" not supported!')
 
@@ -1297,21 +1306,21 @@ def write_tensors(self):
 
 class Phi2Model(Model):
     def set_gguf_parameters(self):
-        block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
+        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
 
-        rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
-        n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
-        n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])
+        rot_pct = self.find_hparam(["partial_rotary_factor"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
 
         self.gguf_writer.add_name("Phi2")
-        self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
+        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
 
         self.gguf_writer.add_embedding_length(n_embd)
         self.gguf_writer.add_feed_forward_length(4 * n_embd)
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head)
-        self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
+        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
         self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_add_bos_token(False)
@@ -1636,20 +1645,12 @@ def write_tensors(self):
 class BertModel(Model):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.block_count = self.hparams["num_hidden_layers"]
+        self.vocab_size = None
 
     def set_gguf_parameters(self):
-        # TODO(cebtenzzre): merge with parent class
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
+        super().set_gguf_parameters()
         self.gguf_writer.add_causal_attention(False)
         self.gguf_writer.add_pooling_layer(True)
-        self.gguf_writer.add_file_type(self.ftype)
 
     def set_vocab(self):
         path = self.dir_model
@@ -1659,6 +1660,7 @@ def set_vocab(self):
         vocab = HfVocab(path, added_tokens_path)
         tokens, scores, toktypes = zip(*vocab.all_tokens())
         assert len(tokens) == vocab.vocab_size
+        self.vocab_size = vocab.vocab_size
 
         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
@@ -1672,7 +1674,7 @@ def phantom(tok, typ):
             if tok.startswith(b"##"):
                 return tok[2:]
             return b"\xe2\x96\x81" + tok
-        tokens = [phantom(t, y) for t, y in zip(tokens, toktypes)]
+        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
 
         # set up bos and eos tokens (cls and sep)
         self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
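Note: the change above only turns the converted token list into a tuple; the WordPiece-to-SentencePiece conversion itself is unchanged. A simplified sketch covering only the two branches visible in this hunk (the full helper may also special-case tokens outside the diff context; the function name below is hypothetical):

```python
# WordPiece marks continuation pieces with a "##" prefix; llama.cpp's tokenizer
# expects SentencePiece-style tokens, where word-initial pieces carry U+2581
# ("LOWER ONE EIGHTH BLOCK", b"\xe2\x96\x81" in UTF-8) instead.
def to_sentencepiece(tok: bytes) -> bytes:
    if tok.startswith(b"##"):
        return tok[2:]                # continuation piece: drop the marker
    return b"\xe2\x96\x81" + tok      # word-initial piece: prepend the marker


assert to_sentencepiece(b"hello") == "\u2581hello".encode("utf-8")
assert to_sentencepiece(b"##ing") == b"ing"
```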
@@ -1724,6 +1726,43 @@ def write_tensors(self):
             self.gguf_writer.add_tensor(new_name, data)
 
 
+class NomicBertModel(BertModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # the HF config claims n_ctx=8192, but it uses RoPE scaling
+        self.hparams["n_ctx"] = 2048
+
+        # SwigLU activation
+        assert self.hparams["activation_function"] == "swiglu"
+        # this doesn't do anything in the HF version
+        assert self.hparams["causal"] is False
+        # no bias tensors
+        assert self.hparams["qkv_proj_bias"] is False
+        assert self.hparams["mlp_fc1_bias"] is False
+        assert self.hparams["mlp_fc2_bias"] is False
+        # norm at end of layer
+        assert self.hparams["prenorm"] is False
+        # standard RoPE
+        assert self.hparams["rotary_emb_fraction"] == 1.0
+        assert self.hparams["rotary_emb_interleaved"] is False
+        assert self.hparams["rotary_emb_scale_base"] is None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+
+    def get_tensors(self):
+        assert self.vocab_size is not None
+        for name, data in super().get_tensors():
+            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
+            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
+                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
+                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
+                data = data[:self.vocab_size, :]
+            yield name, data
+
+
 ###### CONVERSION LOGIC ######
 
 
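Note on the `get_tensors` override above: Nomic Embed ships its token-embedding matrix padded up to a multiple of 64 rows, while llama.cpp expects the tensor to match the vocabulary exactly, so the converter slices off the padding. A small numpy sketch of the same arithmetic, with an illustrative BERT-sized vocabulary rather than the value the converter actually reads from the tokenizer:

```python
import numpy as np

vocab_size = 30522   # illustrative; the converter uses whatever HfVocab reports
n_embd = 768

# the checkpoint pads the embedding rows up to a multiple of 64
rounded_vocab_size = (vocab_size + 63) // 64 * 64   # -> 30528
padded = np.zeros((rounded_vocab_size, n_embd), dtype=np.float32)

# llama.cpp wants tensor sizes to match exactly, so drop the padding rows
trimmed = padded[:vocab_size, :]
assert trimmed.shape == (vocab_size, n_embd)
```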
gguf-py/gguf/constants.py

Lines changed: 36 additions & 20 deletions
@@ -87,27 +87,28 @@ class Tokenizer:
 
 
 class MODEL_ARCH(IntEnum):
-    LLAMA     = auto()
-    FALCON    = auto()
-    BAICHUAN  = auto()
-    GPT2      = auto()
-    GPTJ      = auto()
-    GPTNEOX   = auto()
-    MPT       = auto()
-    STARCODER = auto()
-    PERSIMMON = auto()
-    REFACT    = auto()
-    BERT      = auto()
-    BLOOM     = auto()
-    STABLELM  = auto()
-    QWEN      = auto()
-    QWEN2     = auto()
-    PHI2      = auto()
-    PLAMO     = auto()
-    CODESHELL = auto()
-    ORION     = auto()
+    LLAMA      = auto()
+    FALCON     = auto()
+    BAICHUAN   = auto()
+    GPT2       = auto()
+    GPTJ       = auto()
+    GPTNEOX    = auto()
+    MPT        = auto()
+    STARCODER  = auto()
+    PERSIMMON  = auto()
+    REFACT     = auto()
+    BERT       = auto()
+    NOMIC_BERT = auto()
+    BLOOM      = auto()
+    STABLELM   = auto()
+    QWEN       = auto()
+    QWEN2      = auto()
+    PHI2       = auto()
+    PLAMO      = auto()
+    CODESHELL  = auto()
+    ORION      = auto()
     INTERNLM2 = auto()
-    MINICPM   = auto()
+    MINICPM    = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -153,6 +154,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.PERSIMMON:      "persimmon",
     MODEL_ARCH.REFACT:         "refact",
     MODEL_ARCH.BERT:           "bert",
+    MODEL_ARCH.NOMIC_BERT:     "nomic-bert",
     MODEL_ARCH.BLOOM:          "bloom",
     MODEL_ARCH.STABLELM:       "stablelm",
     MODEL_ARCH.QWEN:           "qwen",
@@ -282,6 +284,20 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
+    MODEL_ARCH.NOMIC_BERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+    ],
     MODEL_ARCH.MPT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
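With the new enum value and tensor list in place, the registration can be inspected straight from gguf-py. A quick sketch, assuming a gguf-py checkout that includes this commit:

```python
import gguf

# architecture name written into the GGUF metadata
print(gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.NOMIC_BERT])   # "nomic-bert"

# tensor kinds the converter may emit for this architecture
for tensor in gguf.MODEL_TENSORS[gguf.MODEL_ARCH.NOMIC_BERT]:
    print(gguf.TENSOR_NAMES[tensor])   # e.g. "token_embd", "blk.{bid}.attn_qkv", ...
```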

gguf-py/gguf/tensor_mapping.py

Lines changed: 10 additions & 2 deletions
@@ -15,7 +15,7 @@ class TensorNameMap:
         "word_embeddings",  # bloom
         "model.embed_tokens",  # llama-hf
         "tok_embeddings",  # llama-pth
-        "embeddings.word_embeddings",  # bert
+        "embeddings.word_embeddings",  # bert nomic-bert
         "language_model.embedding.word_embeddings",  # persimmon
         "wte",  # gpt2
         "transformer.embd.wte",  # phi2
@@ -24,13 +24,14 @@
 
     # Token type embeddings
     MODEL_TENSOR.TOKEN_TYPES: (
-        "embeddings.token_type_embeddings",  # bert
+        "embeddings.token_type_embeddings",  # bert nomic-bert
     ),
 
     # Normalization of token embeddings
     MODEL_TENSOR.TOKEN_EMBD_NORM: (
         "word_embeddings_layernorm",  # bloom
         "embeddings.LayerNorm",  # bert
+        "emb_ln",  # nomic-bert
     ),
 
     # Position embeddings
@@ -103,6 +104,7 @@ class TensorNameMap:
         "model.layers.{bid}.self_attn.query_key_value",  # persimmon
         "h.{bid}.attn.c_attn",  # gpt2
         "transformer.h.{bid}.mixer.Wqkv",  # phi2
+        "encoder.layers.{bid}.attn.Wqkv",  # nomic-bert
     ),
 
     # Attention query
@@ -152,11 +154,13 @@
         "transformer.h.{bid}.mixer.out_proj",  # phi2
         "model.layers.layers.{bid}.self_attn.o_proj",  # plamo
         "model.layers.{bid}.attention.wo",  # internlm2
+        "encoder.layers.{bid}.attn.out_proj",  # nomic-bert
     ),
 
     # Attention output norm
     MODEL_TENSOR.ATTN_OUT_NORM: (
         "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
+        "encoder.layers.{bid}.norm1",  # nomic-bert
     ),
 
     # Rotary embeddings
@@ -205,6 +209,7 @@
         "model.layers.{bid}.mlp.fc1",  # phi2
         "model.layers.layers.{bid}.mlp.up_proj",  # plamo
         "model.layers.{bid}.feed_forward.w3",  # internlm2
+        "encoder.layers.{bid}.mlp.fc11",  # nomic-bert
     ),
 
     MODEL_TENSOR.FFN_UP_EXP: (
@@ -224,6 +229,7 @@
         "transformer.h.{bid}.mlp.w2",  # qwen
         "model.layers.layers.{bid}.mlp.gate_proj",  # plamo
         "model.layers.{bid}.feed_forward.w1",  # internlm2
+        "encoder.layers.{bid}.mlp.fc12",  # nomic-bert
     ),
 
     MODEL_TENSOR.FFN_GATE_EXP: (
@@ -249,6 +255,7 @@
         "model.layers.{bid}.mlp.fc2",  # phi2
         "model.layers.layers.{bid}.mlp.down_proj",  # plamo
         "model.layers.{bid}.feed_forward.w2",  # internlm2
+        "encoder.layers.{bid}.mlp.fc2",  # nomic-bert
     ),
 
     MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -272,6 +279,7 @@
 
     MODEL_TENSOR.LAYER_OUT_NORM: (
         "encoder.layer.{bid}.output.LayerNorm",  # bert
+        "encoder.layers.{bid}.norm2",  # nomic-bert
     )
 }
 
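These mappings are what convert-hf-to-gguf.py uses (via `gguf.get_tensor_name_map`) to translate Nomic's checkpoint tensor names into llama.cpp's canonical names. A usage sketch, again assuming a gguf-py checkout that includes this commit:

```python
import gguf

# name map for nomic-bert; nomic-embed-text-v1 has 12 blocks, but any count works here
tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.NOMIC_BERT, 12)

# a raw checkpoint tensor name resolves to the canonical GGUF base name
name = tensor_map.get_name("encoder.layers.0.attn.Wqkv.weight",
                           try_suffixes=(".weight", ".bias"))
print(name)  # expected: "blk.0.attn_qkv" (the converter re-appends the ".weight" suffix)
```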
