Skip to content

Commit 6fcd133

Browse files
authored
llama : more checks before assuming FIM tokens (#7644)
* More checks before assuming FIM tokens for Llama arch * extensive token check
1 parent 41b9260 commit 6fcd133

File tree

1 file changed

+39
-29
lines changed

1 file changed

+39
-29
lines changed

llama.cpp

Lines changed: 39 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4561,35 +4561,6 @@ static void llm_load_vocab(
45614561
vocab.special_cls_id = -1;
45624562
vocab.special_mask_id = -1;
45634563

4564-
// For Fill-In-the-Middle (FIM)/infill models which were converted
4565-
// prior to support of FIM special tokens in GGUF, the following
4566-
// will allow those models to continue to work. The general names
4567-
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4568-
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4569-
// new versions of these models have been published.
4570-
std::string gen_name;
4571-
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4572-
4573-
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4574-
[](unsigned char c){ return std::tolower(c); });
4575-
4576-
if (gen_name.find("code") != std::string::npos) {
4577-
if (model.arch == LLM_ARCH_LLAMA) {
4578-
vocab.special_prefix_id = 32007;
4579-
vocab.special_suffix_id = 32008;
4580-
vocab.special_middle_id = 32009;
4581-
vocab.special_eot_id = 32010;
4582-
} else if (model.arch == LLM_ARCH_GEMMA) {
4583-
vocab.special_prefix_id = 67;
4584-
vocab.special_suffix_id = 69;
4585-
vocab.special_middle_id = 68;
4586-
// TODO: this is not EOT, it is "file separator" token, needs fix
4587-
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
4588-
//vocab.special_eot_id = 70;
4589-
vocab.special_eot_id = 107;
4590-
}
4591-
}
4592-
45934564
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
45944565
if (add_space_prefix_keyidx != -1) {
45954566
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4773,6 +4744,45 @@ static void llm_load_vocab(
47734744

47744745
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
47754746
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
4747+
// For Fill-In-the-Middle (FIM)/infill models which were converted
4748+
// prior to support of FIM special tokens in GGUF, the following
4749+
// will allow those models to continue to work. The general names
4750+
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4751+
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4752+
// new versions of these models have been published.
4753+
std::string gen_name;
4754+
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
4755+
4756+
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4757+
[](unsigned char c){ return std::tolower(c); });
4758+
4759+
if (gen_name.find("code") != std::string::npos) {
4760+
if (model.arch == LLM_ARCH_LLAMA
4761+
&& 32010 < vocab.id_to_token.size()
4762+
&& vocab.id_to_token[32007].text == "<PRE>"
4763+
&& vocab.id_to_token[32008].text == "<SUF>"
4764+
&& vocab.id_to_token[32009].text == "<MID>"
4765+
&& vocab.id_to_token[32010].text == "<EOT>") {
4766+
vocab.special_prefix_id = 32007;
4767+
vocab.special_suffix_id = 32008;
4768+
vocab.special_middle_id = 32009;
4769+
vocab.special_eot_id = 32010;
4770+
} else if (model.arch == LLM_ARCH_GEMMA
4771+
&& 107 < vocab.id_to_token.size()
4772+
&& vocab.id_to_token[67].text == "<|fim_prefix|>"
4773+
&& vocab.id_to_token[69].text == "<|fim_suffix|>"
4774+
&& vocab.id_to_token[68].text == "<|fim_middle|>"
4775+
&& vocab.id_to_token[107].text == "<end_of_turn>") {
4776+
vocab.special_prefix_id = 67;
4777+
vocab.special_suffix_id = 69;
4778+
vocab.special_middle_id = 68;
4779+
// TODO: this is not EOT, it is "file separator" token, needs fix
4780+
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
4781+
//vocab.special_eot_id = 70;
4782+
vocab.special_eot_id = 107;
4783+
}
4784+
}
4785+
47764786
try {
47774787
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
47784788
} catch (const std::exception & e) {

0 commit comments

Comments
 (0)