@@ -3314,7 +3314,12 @@ static void llm_load_vocab(
3314
3314
3315
3315
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
3316
3316
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
3317
- vocab.linefeed_id = llama_byte_to_token (vocab, ' \n ' );
3317
+ try {
3318
+ vocab.linefeed_id = llama_byte_to_token (vocab, ' \n ' );
3319
+ } catch (const std::exception & e) {
3320
+ LLAMA_LOG_WARN (" %s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead." , __func__, e.what ());
3321
+ vocab.linefeed_id = vocab.special_pad_id ;
3322
+ }
3318
3323
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
3319
3324
vocab.linefeed_id = vocab.special_pad_id ;
3320
3325
} else {
@@ -7746,7 +7751,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
7746
7751
switch (llama_vocab_get_type (vocab)) {
7747
7752
case LLAMA_VOCAB_TYPE_SPM: {
7748
7753
const char buf[7 ] = { ' <' , ' 0' , ' x' , hex[ch >> 4 ], hex[ch & 15 ], ' >' , 0 };
7749
- return vocab.token_to_id .at (buf);
7754
+ auto token = vocab.token_to_id .find (buf);
7755
+ if (token != vocab.token_to_id .end ()) {
7756
+ return (*token).second ;
7757
+ }
7758
+ // Try to fall back to just the byte as a string
7759
+ const char buf2[2 ] = { (char )ch, 0 };
7760
+ return vocab.token_to_id .at (buf2);
7750
7761
}
7751
7762
case LLAMA_VOCAB_TYPE_WPM:
7752
7763
case LLAMA_VOCAB_TYPE_BPE: {
0 commit comments