Skip to content

Commit dce9bc9

Browse files
committed
common : llama_byte_to_token: allow falling back to finding just the token byte in SPM vocabs
1 parent 4ef6e82 commit dce9bc9

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

llama.cpp

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7716,7 +7716,12 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
77167716
switch (llama_vocab_get_type(vocab)) {
77177717
case LLAMA_VOCAB_TYPE_SPM: {
77187718
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
7719-
return vocab.token_to_id.at(buf);
7719+
if (vocab.token_to_id.find(buf) != vocab.token_to_id.end()) {
7720+
return vocab.token_to_id.at(buf);
7721+
}
7722+
// Try to fall back to just the byte as a string
7723+
const char buf2[2] = { (char)ch, 0 };
7724+
return vocab.token_to_id.at(buf2);
77207725
}
77217726
case LLAMA_VOCAB_TYPE_WPM:
77227727
case LLAMA_VOCAB_TYPE_BPE: {

0 commit comments

Comments
 (0)