
Commit 0d49878

Add nemotron GGUF conversion & inference support
1 parent 15fa07a commit 0d49878

File tree: 4 files changed, +285 -11 lines changed

convert_hf_to_gguf.py

Lines changed: 52 additions & 0 deletions
@@ -3604,6 +3604,58 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         name = name.removeprefix("transformer.")
         return [(self.map_tensor_name(name), data_torch)]

+@Model.register("NemotronForCausalLM")
+class NemotronModel(Model):
+    model_arch = gguf.MODEL_ARCH.NEMOTRON
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+        self.gguf_writer.add_pad_token_id(0)
+        self.gguf_writer.add_unk_token_id(1)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
+        self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+
+        # * Partial RoPE
+        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+
+        # * RopeScaling for Nemotron
+        if self.hparams["rope_scaling"] is None:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+        #   model.layers.{l}.input_layernorm.weight
+        #   model.layers.{l}.post_attention_layernorm.weight
+        #   model.norm.weight
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def write_tensors(self):
+        super().write_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 ###### CONVERSION LOGIC ######

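Note, for readers of the hunk above: layernorm1p is a LayerNorm variant that scales the normalized activations by (1 + weight), and Nemotron applies RoPE to only a fraction of each attention head's dimensions. Adding +1 to every norm.weight tensor at conversion time therefore lets the stock GGML LayerNorm reproduce layernorm1p unchanged. The sketch below is a hedged illustration of both points, not part of the commit; the hparam values are invented and torch is used only as a reference.

    # Hedged sketch (not from the commit): a stock LayerNorm applied to the
    # shifted weight (w + 1) matches layernorm1p applied with the original w.
    import torch
    import torch.nn.functional as F

    x, w, b, eps = torch.randn(4, 8), torch.randn(8) * 0.02, torch.zeros(8), 1e-5
    normed = F.layer_norm(x, (8,), eps=eps)           # normalization only, no affine
    layernorm1p = normed * (1 + w) + b                # what the Nemotron graph computes
    stock = F.layer_norm(x, (8,), w + 1, b, eps)      # plain LayerNorm with the converted weight
    assert torch.allclose(layernorm1p, stock, atol=1e-6)

    # Partial RoPE: rope_dimension_count is the rotated slice per head.
    # Hypothetical hparams, not taken from any real Nemotron config.
    rot_pct, n_embd, n_head = 0.5, 4096, 32
    assert int(rot_pct * n_embd) // n_head == 64      # 64 of the 128 dims per head are rotated
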
gguf-py/gguf/constants.py

Lines changed: 21 additions & 0 deletions
@@ -218,6 +218,7 @@ class MODEL_ARCH(IntEnum):
     BITNET = auto()
     T5 = auto()
     JAIS = auto()
+    NEMOTRON = auto()


 class MODEL_TENSOR(IntEnum):
@@ -345,6 +346,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.BITNET: "bitnet",
     MODEL_ARCH.T5: "t5",
     MODEL_ARCH.JAIS: "jais",
+    MODEL_ARCH.NEMOTRON: "nemotron",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -1048,6 +1050,21 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.NEMOTRON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }

@@ -1088,6 +1105,10 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.CHATGLM: [
         MODEL_TENSOR.ROPE_FREQS,
     ],
+    MODEL_ARCH.NEMOTRON: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }

 #
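These constants register the architecture's name string and its tensor set; the last hunk adds a NEMOTRON entry to the same per-architecture table that CHATGLM's ROPE_FREQS entry sits in. A hedged sketch of how the new entries surface through gguf-py follows; the table names MODEL_ARCH_NAMES and MODEL_TENSORS are assumed from the module's existing layout, and nothing below is added by this commit.

    # Hedged sketch (not from the commit): the new entries as seen through
    # gguf-py's public tables; table names are assumptions, not from the diff.
    import gguf

    arch = gguf.MODEL_ARCH.NEMOTRON
    assert gguf.MODEL_ARCH_NAMES[arch] == "nemotron"               # architecture string written to the GGUF header
    assert gguf.MODEL_TENSOR.ATTN_ROT_EMBD in gguf.MODEL_TENSORS[arch]
    print(len(gguf.MODEL_TENSORS[arch]))                           # 13 tensor kinds registered above
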

gguf-py/gguf/tensor_mapping.py

Lines changed: 11 additions & 10 deletions
@@ -13,7 +13,7 @@ class TensorNameMap:
             "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais
             "transformer.word_embeddings", # falcon
             "word_embeddings", # bloom
-            "model.embed_tokens", # llama-hf
+            "model.embed_tokens", # llama-hf nemotron
             "tok_embeddings", # llama-pth
             "embeddings.word_embeddings", # bert nomic-bert
             "language_model.embedding.word_embeddings", # persimmon
@@ -52,7 +52,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out", # gptneox
-            "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais
+            "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron
             "output", # llama-pth bloom internlm2
             "word_embeddings_for_head", # persimmon
             "lm_head.linear", # phi2
@@ -75,6 +75,7 @@ class TensorNameMap:
             "transformer.rms_norm", # Grok
             "encoder.final_layernorm", # chatglm
             "transformer.norm", # openelm
+            "model.norm", # nemotron
         ),

         # Rope frequencies
@@ -93,7 +94,7 @@ class TensorNameMap:
             "transformer.h.{bid}.input_layernorm", # falcon7b
             "h.{bid}.input_layernorm", # bloom
             "transformer.h.{bid}.ln_mlp", # falcon40b
-            "model.layers.{bid}.input_layernorm", # llama-hf
+            "model.layers.{bid}.input_layernorm", # llama-hf nemotron
             "layers.{bid}.attention_norm", # llama-pth
             "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
             "model.layers.{bid}.ln1", # yi
@@ -135,7 +136,7 @@ class TensorNameMap:

         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj", # llama-hf
+            "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron
             "layers.{bid}.attention.wq", # llama-pth
             "encoder.layer.{bid}.attention.self.query", # bert
             "transformer.h.{bid}.attn.q_proj", # gpt-j
@@ -146,7 +147,7 @@ class TensorNameMap:

         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj", # llama-hf
+            "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron
             "layers.{bid}.attention.wk", # llama-pth
             "encoder.layer.{bid}.attention.self.key", # bert
             "transformer.h.{bid}.attn.k_proj", # gpt-j
@@ -158,7 +159,7 @@ class TensorNameMap:

         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj", # llama-hf
+            "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron
             "layers.{bid}.attention.wv", # llama-pth
             "encoder.layer.{bid}.attention.self.value", # bert
             "transformer.h.{bid}.attn.v_proj", # gpt-j
@@ -175,7 +176,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.attn.out_proj", # mpt
             "transformer.h.{bid}.self_attention.dense", # falcon
             "h.{bid}.self_attention.dense", # bloom
-            "model.layers.{bid}.self_attn.o_proj", # llama-hf
+            "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron
             "layers.{bid}.attention.wo", # llama-pth
             "encoder.layer.{bid}.attention.output.dense", # bert
             "transformer.h.{bid}.attn.out_proj", # gpt-j
@@ -218,7 +219,7 @@ class TensorNameMap:
             "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais
             "h.{bid}.post_attention_layernorm", # bloom
             "transformer.blocks.{bid}.norm_2", # mpt
-            "model.layers.{bid}.post_attention_layernorm", # llama-hf
+            "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron
             "layers.{bid}.ffn_norm", # llama-pth
             "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
             "model.layers.{bid}.ln2", # yi
@@ -258,7 +259,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.up_proj", # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
             "h.{bid}.mlp.dense_h_to_4h", # bloom
-            "model.layers.{bid}.mlp.up_proj", # llama-hf refact
+            "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron
             "layers.{bid}.feed_forward.w3", # llama-pth
             "encoder.layer.{bid}.intermediate.dense", # bert
             "transformer.h.{bid}.mlp.fc_in", # gpt-j
@@ -329,7 +330,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.down_proj", # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
             "h.{bid}.mlp.dense_4h_to_h", # bloom
-            "model.layers.{bid}.mlp.down_proj", # llama-hf
+            "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron
             "layers.{bid}.feed_forward.w2", # llama-pth
             "encoder.layer.{bid}.output.dense", # bert
             "transformer.h.{bid}.mlp.fc_out", # gpt-j
