Llama-3_1-Nemotron-Ultra-253B-v1 support #12843

Merged: 26 commits, May 3, 2025
Commits
ecad966
conflict resolution
ymcki Dec 19, 2024
12aded6
Merge branch 'ggerganov:master' into master
ymcki Dec 22, 2024
643e5e8
move comments after bracket to its own line
ymcki Dec 22, 2024
e68c76d
Merge branch 'ggerganov:master' into master
ymcki Dec 22, 2024
6a4805f
Merge branch 'ggerganov:master' into master
ymcki Dec 22, 2024
f9a1cdb
Merge branch 'ggerganov:master' into master
ymcki Dec 28, 2024
c1736f3
Merge branch 'ggerganov:master' into master
ymcki Dec 29, 2024
984ffac
DeciLMCausalModel now reads rope_theta from config.json properly
ymcki Dec 29, 2024
909a7d9
Merge branch 'ggerganov:master' into master
ymcki Jan 9, 2025
4dad248
Merge branch 'ggml-org:master' into master
ymcki Apr 9, 2025
cc615bc
Merge branch 'ggml-org:master' into master
ymcki Apr 9, 2025
0ac08b5
Llama-3_1-Nemotron-Ultra-253B-v1 support
ymcki Apr 9, 2025
80af2e3
Merge branch 'master' of github.com:ymcki/llama.cpp
ymcki Apr 9, 2025
2a260da
Nemotron 253B support: skip ffn computation when n_head == 0 && n_ff == 0
ymcki Apr 9, 2025
f8f6767
Merge branch 'ggml-org:master' into master
ymcki Apr 9, 2025
1600dfb
Merge branch 'master' of github.com:ymcki/llama.cpp
ymcki Apr 9, 2025
bd3d42a
Merge branch 'ggml-org:master' into master
ymcki Apr 11, 2025
3961bff
Merge branch 'ggml-org:master' into master
ymcki Apr 13, 2025
a4d654f
Merge branch 'ggml-org:master' into master
ymcki Apr 14, 2025
df5e3de
Merge branch 'ggml-org:master' into master
ymcki Apr 16, 2025
da6d8ba
Merge branch 'ggml-org:master' into master
ymcki Apr 16, 2025
5c2fffc
Merge branch 'ggml-org:master' into master
ymcki Apr 22, 2025
9263e02
Merge branch 'ggml-org:master' into master
ymcki Apr 28, 2025
4eebf0a
Merge branch 'ggml-org:master' into master
ymcki May 2, 2025
354416e
add bracket for if in build_deci
ymcki May 2, 2025
df0cc92
add comment before if in build_deci
ymcki May 2, 2025
convert_hf_to_gguf.py (8 changes: 7 additions & 1 deletion)

@@ -2089,6 +2089,9 @@ def __init__(self, *args, **kwargs):
             # if n_heads_in_group is not None, then
             # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
             # _num_heads[il] is num_attention_head
+            # ***dummy layer*** for nemotron 253B
+            # if n_heads_in_group is None and ffn_mult is None
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 and _ffn_dims is 0
             for il in range(len(_block_configs)):
                 if _block_configs[il]["attention"]["n_heads_in_group"] is None:
                     if _block_configs[il]["attention"]["replace_with_linear"] is True:
@@ -2100,7 +2103,10 @@
                 else:
                     self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
                     self._num_heads.append(self.hparams["num_attention_heads"])
-                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+                if _block_configs[il]["ffn"]["ffn_mult"] is None: # dummy layer
+                    _ffn_multipliers.append(0.0)
+                else:
+                    _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
             assert self.block_count == len(self._num_kv_heads)
             assert self.block_count == len(self._num_heads)
             assert self.block_count == len(_ffn_multipliers)
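For reference, here is a small standalone sketch of the per-layer mapping that the converter loop above performs. The block_configs entries and the head count below are invented for illustration (they are not values from the real Nemotron-Ultra config.json); only the key names and the branching mirror the converter code.

    # Sketch of the per-layer mapping done in DeciModel.__init__ above.
    # NOTE: the example block configs and head count below are made up.

    def map_block(block: dict, num_attention_heads: int) -> tuple[int, int, float]:
        """Return (n_head, n_kv_head, ffn_mult) for one block_configs entry."""
        attn = block["attention"]
        ffn = block["ffn"]
        if attn["n_heads_in_group"] is None:
            if attn["replace_with_linear"] is True:
                n_head, n_kv_head = num_attention_heads, 0  # linear attention layer
            else:
                n_head, n_kv_head = 0, 0                    # attention-free or dummy layer
        else:
            n_head = num_attention_heads
            n_kv_head = num_attention_heads // attn["n_heads_in_group"]
        # dummy layer (Nemotron-Ultra 253B): no FFN either, multiplier becomes 0.0
        ffn_mult = 0.0 if ffn["ffn_mult"] is None else ffn["ffn_mult"]
        return n_head, n_kv_head, ffn_mult

    example_blocks = [
        {"attention": {"n_heads_in_group": 8,    "replace_with_linear": False}, "ffn": {"ffn_mult": 2.5}},   # regular block
        {"attention": {"n_heads_in_group": None, "replace_with_linear": False}, "ffn": {"ffn_mult": 1.0}},   # attention-free (51B style)
        {"attention": {"n_heads_in_group": None, "replace_with_linear": False}, "ffn": {"ffn_mult": None}},  # dummy layer (253B)
    ]
    for il, blk in enumerate(example_blocks):
        print(il, map_block(blk, num_attention_heads=128))
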
src/llama-model.cpp (20 changes: 16 additions & 4 deletions)

@@ -80,6 +80,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
+        case LLM_TYPE_405B: return "405B";
         case LLM_TYPE_671B: return "671B";
         case LLM_TYPE_SMALL: return "0.1B";
         case LLM_TYPE_MEDIUM: return "0.4B";
@@ -582,6 +583,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_7B; break;
                     case 80: type = LLM_TYPE_70B; break;
+                    case 162: type = LLM_TYPE_405B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -1847,7 +1849,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                     layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);

-                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    if (n_ff > 0) {
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    }

                     if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                         layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -1857,9 +1861,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }

-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    if (n_ff > 0) {
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }

                     // optional MLP bias
                     layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@@ -4691,6 +4697,7 @@ struct llm_build_deci : public llm_graph_context {
             ggml_tensor * inpSA = inpL;
             const int64_t n_head_kv = hparams.n_head_kv(il);
             const int64_t n_head = hparams.n_head(il);
+            const int64_t n_ff = hparams.n_ff(il);

             if (n_head == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
@@ -4766,6 +4773,11 @@ struct llm_build_deci : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

+            // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+            if (n_head == 0 && n_ff == 0) {
+                continue;
+            }
+
             // For Granite architecture
             if (hparams.f_residual_scale) {
                 cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
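To summarize the graph-build change: each Deci layer is classified by its per-layer head count and FFN size, and fully empty layers are now skipped. The sketch below is illustrative Python, not llama.cpp code, and the example (n_head, n_ff) pairs are invented.

    # Rough classification of Deci/Nemotron layer variants by per-layer sizes,
    # mirroring the checks added to build_deci; example values are made up.

    def layer_kind(n_head: int, n_ff: int) -> str:
        if n_head == 0 and n_ff == 0:
            # dummy layer (Llama-3_1-Nemotron-Ultra-253B): build_deci now
            # `continue`s past it and load_tensors creates no FFN tensors
            return "dummy (skipped entirely)"
        if n_head == 0:
            # attention-free layer (Llama-3_1-Nemotron-51B): FFN only
            return "attention-free (FFN only)"
        return "regular (attention + FFN)"

    for n_head, n_ff in [(128, 53248), (0, 53248), (0, 0)]:
        print((n_head, n_ff), "->", layer_kind(n_head, n_ff))
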
src/llama-model.h (1 change: 1 addition & 0 deletions)

@@ -76,6 +76,7 @@ enum llm_type {
     LLM_TYPE_236B,
     LLM_TYPE_290B,
     LLM_TYPE_314B,
+    LLM_TYPE_405B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
     LLM_TYPE_MEDIUM,
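Taken together with the llama-model.cpp hunks above, the new enum value surfaces as a human-readable size label: a Deci-architecture model with 162 layers is reported as a 405B-class model. A trivial sketch of that mapping (illustrative Python, not llama.cpp code):

    # Layer-count to size-label mapping added for the Deci architecture,
    # mirroring load_hparams and llm_type_name above (illustrative only).
    DECI_N_LAYER_TO_TYPE = {32: "7B", 80: "70B", 162: "405B"}

    def deci_type_name(n_layer: int) -> str:
        return DECI_N_LAYER_TO_TYPE.get(n_layer, "?B")  # fallback stands in for LLM_TYPE_UNKNOWN

    print(deci_type_name(162))  # -> 405B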