
Commit c0caa4c

akx authored and jordankanter committed
llama : allow raw byte in SPM vocabs; don't crash on nl 404 (ggml-org#5478)
* common : don't crash if newline token is not found
* common : llama_byte_to_token: allow falling back to finding just the token byte in SPM vocabs
1 parent f3cfa6b commit c0caa4c

1 file changed: +13 −2 lines


llama.cpp

Lines changed: 13 additions & 2 deletions
@@ -3314,7 +3314,12 @@ static void llm_load_vocab(
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        try {
+            vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+            vocab.linefeed_id = vocab.special_pad_id;
+        }
    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
         vocab.linefeed_id = vocab.special_pad_id;
     } else {
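For reference, here is a minimal, self-contained sketch (not llama.cpp code) of the behavior this hunk introduces: the newline lookup may throw, and the loader now falls back to the pad token instead of aborting. The byte_to_token helper, the toy token_to_id map, and the special_pad_id constant below are illustrative stand-ins for llama.cpp's actual structures.

// sketch_linefeed.cpp -- illustrative only; simplified stand-in for llm_load_vocab's newline handling
#include <cstdio>
#include <stdexcept>
#include <string>
#include <unordered_map>

// Toy version of llama_byte_to_token: look up the SPM byte piece "<0xNN>",
// throwing std::out_of_range when it is missing (as .at() did before this change).
static int byte_to_token(const std::unordered_map<std::string, int> & token_to_id, unsigned char ch) {
    static const char hex[] = "0123456789ABCDEF";
    const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
    return token_to_id.at(buf);
}

int main() {
    // Hypothetical SPM-style vocab that has no "<0x0A>" entry for newline.
    std::unordered_map<std::string, int> token_to_id = { { "<pad>", 0 } };
    const int special_pad_id = 0;

    int linefeed_id;
    try {
        linefeed_id = byte_to_token(token_to_id, '\n');
    } catch (const std::exception & e) {
        // Before this commit the exception escaped llm_load_vocab and crashed model loading.
        std::fprintf(stderr, "newline token not found (%s); using pad id instead\n", e.what());
        linefeed_id = special_pad_id;
    }
    std::printf("linefeed_id = %d\n", linefeed_id);
    return 0;
}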
@@ -7746,7 +7751,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
             const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-            return vocab.token_to_id.at(buf);
+            auto token = vocab.token_to_id.find(buf);
+            if (token != vocab.token_to_id.end()) {
+                return (*token).second;
+            }
+            // Try to fall back to just the byte as a string
+            const char buf2[2] = { (char)ch, 0 };
+            return vocab.token_to_id.at(buf2);
         }
         case LLAMA_VOCAB_TYPE_WPM:
         case LLAMA_VOCAB_TYPE_BPE: {
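A similar sketch (again not llama.cpp code, just a toy std::unordered_map vocabulary) shows the lookup order this hunk introduces in llama_byte_to_token: try the "<0xNN>" byte piece first, then fall back to the raw byte as a one-character piece, so an SPM vocab that stores some bytes as literal characters no longer throws.

// sketch_byte_to_token.cpp -- illustrative only; simplified stand-in for the new fallback
#include <cstdio>
#include <string>
#include <unordered_map>

static int byte_to_token(const std::unordered_map<std::string, int> & token_to_id, unsigned char ch) {
    static const char hex[] = "0123456789ABCDEF";

    // First try the conventional SPM byte piece, e.g. 'A' -> "<0x41>".
    const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
    auto it = token_to_id.find(buf);
    if (it != token_to_id.end()) {
        return it->second;
    }

    // Fall back to the raw byte as a one-character piece; .at() still throws if that is missing too.
    const char buf2[2] = { (char) ch, 0 };
    return token_to_id.at(buf2);
}

int main() {
    // Hypothetical vocab: 'A' is stored as a byte piece, space only as a literal character.
    std::unordered_map<std::string, int> token_to_id = {
        { "<0x41>", 100 },
        { " ",      200 },
    };
    std::printf("'A' -> %d\n", byte_to_token(token_to_id, 'A')); // 100, via "<0x41>"
    std::printf("' ' -> %d\n", byte_to_token(token_to_id, ' ')); // 200, via the raw-byte fallback
    return 0;
}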
