Commit 8079eb1

support Llama-3_1-Nemotron-51B
1 parent 0669445 commit 8079eb1

File tree

2 files changed: +11 −5 lines changed


gguf-py/gguf/vocab.py

Lines changed: 9 additions & 4 deletions

@@ -109,9 +109,7 @@ def _set_special_token(self, typ: str, tid: Any) -> None:
         if tid < 0:
             raise ValueError(f'invalid value for special token type {typ}: {tid}')
         if self.n_vocab is None or tid < self.n_vocab:
-            if typ in self.special_token_ids:
-                return
-            self.special_token_ids[typ] = tid
+            self.special_token_ids[typ] = tid  # allow override
             return
         logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')

@@ -188,7 +186,14 @@ def _try_load_from_config_json(self, path: Path) -> bool:
         with open(config_file, encoding = 'utf-8') as f:
             config = json.load(f)
         for typ in self.special_token_types:
-            self._set_special_token(typ, config.get(f'{typ}_token_id'))
+            # load eos tokens when it is an array
+            if typ == 'eos' and isinstance(config.get(f'{typ}_token_id'), list):
+                eos_ids = config.get(f'{typ}_token_id')
+                self._set_special_token('eos', eos_ids[0])
+                self._set_special_token('eom', eos_ids[1])
+                self._set_special_token('eot', eos_ids[2])
+            else:
+                self._set_special_token(typ, config.get(f'{typ}_token_id'))
         return True

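Why the vocab.py change matters: some configs (Llama-3_1-Nemotron-51B's among them, judging by this commit) publish `eos_token_id` as an array rather than a single integer, and the loader now maps the first three entries onto the `eos`, `eom`, and `eot` special-token slots, with `_set_special_token` additionally allowed to override an earlier assignment. Below is a minimal standalone Python sketch of that mapping, not gguf-py itself; the token IDs are illustrative Llama-3.1-style values and `set_special_token` is a simplified stand-in for `SpecialVocab._set_special_token`.

import json

# Illustrative config; a real model's config.json may differ.
config = json.loads('{"bos_token_id": 128000, "eos_token_id": [128001, 128008, 128009]}')

special_token_ids = {}

def set_special_token(typ, tid):
    # Simplified stand-in: no n_vocab bound check, and later assignments
    # are allowed to override earlier ones, matching the diff above.
    if isinstance(tid, int) and tid >= 0:
        special_token_ids[typ] = tid

for typ in ('bos', 'eos'):
    tid = config.get(f'{typ}_token_id')
    if typ == 'eos' and isinstance(tid, list):
        # First three entries map to end-of-sequence, end-of-message, end-of-turn.
        set_special_token('eos', tid[0])
        set_special_token('eom', tid[1])
        set_special_token('eot', tid[2])
    else:
        set_special_token(typ, tid)

print(special_token_ids)  # {'bos': 128000, 'eos': 128001, 'eom': 128008, 'eot': 128009}
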
src/llama.cpp

Lines changed: 2 additions & 1 deletion

@@ -7680,7 +7680,6 @@ static bool llm_load_tensors(
         }

         for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
             auto & layer = model.layers[i];

             const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
             const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);

@@ -10806,6 +10805,8 @@ struct llm_build_context {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
+            const int64_t n_head_kv = hparams.n_head_kv(il);
+            const int64_t n_head    = hparams.n_head(il);

             if (n_head == 0) // attention-free layer of Llama-3_1-Nemotron-51B
                 cur = inpL;

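The llama.cpp side reflects that Nemotron-51B's layers are heterogeneous: head counts vary per layer, so `n_head` and `n_head_kv` are now read inside the layer loop, and a layer with `n_head == 0` is treated as attention-free, passing its input straight through. A rough Python-only sketch of that per-layer dispatch follows; the layer layout and the `attn(...)` placeholder are made up for illustration and are not taken from llama.cpp.

def run_layers(n_head_per_layer, n_head_kv_per_layer, inp):
    # Walk the layers with per-layer head counts, mirroring the per-layer
    # hparams lookup added to the graph builder above.
    cur = inp
    for il, (n_head, n_head_kv) in enumerate(zip(n_head_per_layer, n_head_kv_per_layer)):
        if n_head == 0:
            # Attention-free layer: no attention graph is built, the input passes through.
            attn_out = cur
        else:
            # Placeholder for the real grouped-query attention computation.
            attn_out = f"attn(layer={il}, n_head={n_head}, n_head_kv={n_head_kv}, x={cur})"
        cur = attn_out
    return cur

# Made-up layout: two attention layers around one attention-free layer.
print(run_layers([32, 0, 32], [8, 0, 8], "inpL"))
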