
Commit 2a24c8c

suhara and compilade authored
Add Nemotron/Minitron GGUF Conversion & Inference Support (#8922)
* Add nemotron GGUF conversion & inference support
* Fix formatting issues
* Remove unnecessary write_tensors()
* Update convert_hf_to_gguf.py
  Co-authored-by: compilade <[email protected]>
* Update src/llama.cpp
  Co-authored-by: compilade <[email protected]>
* Address comments by @compilade
* Replace ggml_mul_mat() -> llm_build_lora_mm()
* Remove mutable variable
* Use for bias tensors
* Cover corner case for rope_scaling not in config.json

---------

Co-authored-by: compilade <[email protected]>
1 parent e3f6fd5 commit 2a24c8c

File tree

4 files changed (+271, -11 lines)


convert_hf_to_gguf.py

Lines changed: 41 additions & 0 deletions
@@ -3740,6 +3740,47 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         name = name.removeprefix("transformer.")
         return [(self.map_tensor_name(name), data_torch)]

+
+@Model.register("NemotronForCausalLM")
+class NemotronModel(Model):
+    model_arch = gguf.MODEL_ARCH.NEMOTRON
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+        self.gguf_writer.add_pad_token_id(0)
+        self.gguf_writer.add_unk_token_id(1)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
+        self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+
+        # * Partial RoPE
+        rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+
+        # * RopeScaling for Nemotron
+        if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+        # model.layers.{l}.input_layernorm.weight
+        # model.layers.{l}.post_attention_layernorm.weight
+        # model.norm.weight
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
 ###### CONVERSION LOGIC ######


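The modify_tensors override above folds Nemotron's layernorm1p into a standard LayerNorm by adding 1 to every *norm.weight tensor at conversion time, so nothing changes on the GGML side. A minimal sketch of why that is equivalent (illustrative only, not part of the commit; uses PyTorch as a reference with made-up shapes):

# Illustrative check: layernorm1p scales the normalized activations by
# (1 + weight), which equals a standard LayerNorm whose stored weight
# has had +1 added to it (what the converter does above).
import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(2, 8)      # dummy activations
w = torch.randn(8) * 0.02  # layernorm1p-style (zero-centered) weight
b = torch.randn(8) * 0.02

mean = x.mean(-1, keepdim=True)
var = x.var(-1, unbiased=False, keepdim=True)
y_ln1p = (x - mean) / torch.sqrt(var + 1e-5) * (1 + w) + b       # layernorm1p

y_std = F.layer_norm(x, (8,), weight=w + 1, bias=b, eps=1e-5)    # converted weight

assert torch.allclose(y_ln1p, y_std, atol=1e-5)

The partial-RoPE lines record the per-head rotary dimension count: with an illustrative config of partial_rotary_factor = 0.5, hidden_size = 4096 and num_attention_heads = 32, the writer stores int(0.5 * 4096) // 32 = 64 rotary dimensions.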
gguf-py/gguf/constants.py

Lines changed: 21 additions & 0 deletions
@@ -219,6 +219,7 @@ class MODEL_ARCH(IntEnum):
     T5 = auto()
     T5ENCODER = auto()
     JAIS = auto()
+    NEMOTRON = auto()


 class MODEL_TENSOR(IntEnum):
@@ -347,6 +348,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.T5: "t5",
     MODEL_ARCH.T5ENCODER: "t5encoder",
     MODEL_ARCH.JAIS: "jais",
+    MODEL_ARCH.NEMOTRON: "nemotron",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -1065,6 +1067,21 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.NEMOTRON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }

@@ -1105,6 +1122,10 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.CHATGLM: [
         MODEL_TENSOR.ROPE_FREQS,
     ],
+    MODEL_ARCH.NEMOTRON: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }

 #
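For orientation, a small sketch (assuming the gguf package from gguf-py in this tree is importable) that lists the GGUF tensor names now registered for the NEMOTRON architecture; bid=0 is just an example block index:

import gguf  # gguf-py from this repository

# MODEL_TENSORS maps each architecture to the tensors it may contain;
# TENSOR_NAMES holds name templates such as "blk.{bid}.attn_q".
for tensor in gguf.MODEL_TENSORS[gguf.MODEL_ARCH.NEMOTRON]:
    print(gguf.TENSOR_NAMES[tensor].format(bid=0))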

gguf-py/gguf/tensor_mapping.py

Lines changed: 11 additions & 10 deletions
@@ -13,7 +13,7 @@ class TensorNameMap:
             "transformer.wte",  # gpt2 gpt-j mpt refact qwen dbrx jais
             "transformer.word_embeddings",  # falcon
             "word_embeddings",  # bloom
-            "model.embed_tokens",  # llama-hf
+            "model.embed_tokens",  # llama-hf nemotron
             "tok_embeddings",  # llama-pth
             "embeddings.word_embeddings",  # bert nomic-bert
             "language_model.embedding.word_embeddings",  # persimmon
@@ -52,7 +52,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",  # gptneox
-            "lm_head",  # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais
+            "lm_head",  # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron
             "output",  # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",  # phi2
@@ -75,6 +75,7 @@ class TensorNameMap:
             "transformer.rms_norm",  # Grok
             "encoder.final_layernorm",  # chatglm
             "transformer.norm",  # openelm
+            "model.norm",  # nemotron
         ),

         # Rope frequencies
@@ -93,7 +94,7 @@ class TensorNameMap:
             "transformer.h.{bid}.input_layernorm",  # falcon7b
             "h.{bid}.input_layernorm",  # bloom
             "transformer.h.{bid}.ln_mlp",  # falcon40b
-            "model.layers.{bid}.input_layernorm",  # llama-hf
+            "model.layers.{bid}.input_layernorm",  # llama-hf nemotron
             "layers.{bid}.attention_norm",  # llama-pth
             "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
             "model.layers.{bid}.ln1",  # yi
@@ -135,7 +136,7 @@ class TensorNameMap:

         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",  # llama-hf
+            "model.layers.{bid}.self_attn.q_proj",  # llama-hf nemotron
             "layers.{bid}.attention.wq",  # llama-pth
             "encoder.layer.{bid}.attention.self.query",  # bert
             "transformer.h.{bid}.attn.q_proj",  # gpt-j
@@ -146,7 +147,7 @@ class TensorNameMap:

         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",  # llama-hf
+            "model.layers.{bid}.self_attn.k_proj",  # llama-hf nemotron
             "layers.{bid}.attention.wk",  # llama-pth
             "encoder.layer.{bid}.attention.self.key",  # bert
             "transformer.h.{bid}.attn.k_proj",  # gpt-j
@@ -158,7 +159,7 @@ class TensorNameMap:

         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",  # llama-hf
+            "model.layers.{bid}.self_attn.v_proj",  # llama-hf nemotron
             "layers.{bid}.attention.wv",  # llama-pth
             "encoder.layer.{bid}.attention.self.value",  # bert
             "transformer.h.{bid}.attn.v_proj",  # gpt-j
@@ -175,7 +176,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.attn.out_proj",  # mpt
             "transformer.h.{bid}.self_attention.dense",  # falcon
             "h.{bid}.self_attention.dense",  # bloom
-            "model.layers.{bid}.self_attn.o_proj",  # llama-hf
+            "model.layers.{bid}.self_attn.o_proj",  # llama-hf nemotron
             "layers.{bid}.attention.wo",  # llama-pth
             "encoder.layer.{bid}.attention.output.dense",  # bert
             "transformer.h.{bid}.attn.out_proj",  # gpt-j
@@ -218,7 +219,7 @@ class TensorNameMap:
             "transformer.h.{bid}.ln_2",  # gpt2 refact qwen jais
             "h.{bid}.post_attention_layernorm",  # bloom
             "transformer.blocks.{bid}.norm_2",  # mpt
-            "model.layers.{bid}.post_attention_layernorm",  # llama-hf
+            "model.layers.{bid}.post_attention_layernorm",  # llama-hf nemotron
             "layers.{bid}.ffn_norm",  # llama-pth
             "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
             "model.layers.{bid}.ln2",  # yi
@@ -258,7 +259,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.up_proj",  # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h",  # falcon
             "h.{bid}.mlp.dense_h_to_4h",  # bloom
-            "model.layers.{bid}.mlp.up_proj",  # llama-hf refact
+            "model.layers.{bid}.mlp.up_proj",  # llama-hf refact nemotron
             "layers.{bid}.feed_forward.w3",  # llama-pth
             "encoder.layer.{bid}.intermediate.dense",  # bert
             "transformer.h.{bid}.mlp.fc_in",  # gpt-j
@@ -329,7 +330,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.down_proj",  # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h",  # falcon
             "h.{bid}.mlp.dense_4h_to_h",  # bloom
-            "model.layers.{bid}.mlp.down_proj",  # llama-hf
+            "model.layers.{bid}.mlp.down_proj",  # llama-hf nemotron
             "layers.{bid}.feed_forward.w2",  # llama-pth
             "encoder.layer.{bid}.output.dense",  # bert
             "transformer.h.{bid}.mlp.fc_out",  # gpt-j
