Skip to content

Commit 72b353f

Browse files
committed
common : llama_byte_to_token: allow falling back to finding just the token byte in SPM vocabs
1 parent 93aed75 commit 72b353f

File tree

1 file changed

+7
-1
lines changed

1 file changed

+7
-1
lines changed

llama.cpp

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7751,7 +7751,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
7751 7751
switch (llama_vocab_get_type(vocab)) {
7752 7752
case LLAMA_VOCAB_TYPE_SPM: {
7753 7753
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
7754-
return vocab.token_to_id.at(buf);
7754+
auto token = vocab.token_to_id.find(buf);
7755+
if (token != vocab.token_to_id.end()) {
7756+
return (*token).second;
7757+
}
7758+
// Try to fall back to just the byte as a string
7759+
const char buf2[2] = { (char)ch, 0 };
7760+
return vocab.token_to_id.at(buf2);
7755 7761
}
7756 7762
case LLAMA_VOCAB_TYPE_WPM:
7757 7763
case LLAMA_VOCAB_TYPE_BPE: {

0 commit comments

Comments
 (0)