Skip to content

Commit b693000

Browse files
authored
llama.cpp : fix linefeed token
1 parent e2d23be commit b693000

File tree

1 file changed

+6
-4
lines changed

1 file changed

+6
-4
lines changed

llama.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1596,6 +1596,9 @@ static void llm_load_hparams(
15961596
hparams.rope_freq_scale = rope_freq_scale;
15971597
}
15981598

1599+
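// TODO: this forward declaration of llama_tokenize_internal (needed because llm_load_vocab
//       below calls it) should probably live in a header such as llama.h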
1600+
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape);
1601+
15991602
static void llm_load_vocab(
16001603
llama_model_loader & ml,
16011604
llama_model & model) {
@@ -1655,12 +1658,11 @@ static void llm_load_vocab(
16551658
token_data.score = scores[i];
16561659
token_data.type = (llama_token_type) toktypes[i];
16571660

1658-
// determine the newline token: 0x0A == 10 == '\n'
1659-
if (token_data.text == "<0x0A>") {
1660-
vocab.linefeed_id = i;
1661-
}
16621661
}
16631662

1663+
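// determine the newline token by tokenizing "\n" with the loaded vocab instead of matching token text:
//   LLaMA:  "<0x0A>" (0x0A == 10 == '\n')
//   Falcon: 193 == '\n'
// NOTE(review): the [0] below assumes the tokenizer returns at least one token for "\n" - confirm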
1664+
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0];
1665+
16641666
// special tokens
16651667
GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
16661668
GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));

0 commit comments

Comments
 (0)