@@ -5170,6 +5170,28 @@ static void llm_load_vocab(
         vocab.token_to_id[word] = i;
         vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());

+        // TODO: properly handle pre-normalized added_tokens and remove this
+        // handle space tokens by also registering a byte-encoded duplicate,
+        // matching the pre-normalized added_tokens
+        // of neox-style tokenizers (mpt, olmo, stablelm, etc.)
+        if (word.find(' ') != std::string::npos) {
+            // same as in the internal `unicode_byte_encoding_process`
+            // TODO: extract and expose this in some unicode_* function
+            std::string text_utf;
+            auto utf_word = unicode_cpts_from_utf8(word);
+            for (size_t i = 0; i < utf_word.size(); ++i) {
+                text_utf += unicode_cpt_to_utf8(utf_word[i]);
+            }
+
+            std::string encoded_token;
+            for (char & c : text_utf) {
+                encoded_token += unicode_byte_to_utf8(c);
+            }
+
+            // override token id
+            vocab.token_to_id[encoded_token] = i;
+        }
+
         auto & token_data = vocab.id_to_token[i];
         token_data.text = std::move(word);
         token_data.score = scores ? scores[i] : 0.0f;
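
To illustrate what the new block above does: a space-containing added token gets a second map entry under its byte-encoded spelling, so lookups succeed whether the token text arrives raw or pre-normalized. The standalone sketch below is not part of the change; it assumes the unicode_* helpers from llama.cpp's unicode.h behave exactly as they are used in the hunk (unicode_cpts_from_utf8, unicode_cpt_to_utf8, unicode_byte_to_utf8) and that it is compiled together with unicode.cpp.

#include <cstdint>
#include <cstdio>
#include <string>

#include "unicode.h"

int main() {
    const std::string word = " Hello"; // hypothetical added token containing a raw space

    // round-trip through codepoints, mirroring the loop in llm_load_vocab above
    std::string text_utf;
    const auto cpts = unicode_cpts_from_utf8(word);
    for (size_t i = 0; i < cpts.size(); ++i) {
        text_utf += unicode_cpt_to_utf8(cpts[i]);
    }

    // byte-encode each byte; under the GPT-2 byte-to-unicode scheme the space
    // byte 0x20 becomes U+0120 ('Ġ'), so " Hello" turns into "ĠHello"
    std::string encoded_token;
    for (const char c : text_utf) {
        encoded_token += unicode_byte_to_utf8((uint8_t) c);
    }

    printf("raw token:     '%s'\n", word.c_str());
    printf("encoded token: '%s'\n", encoded_token.c_str());
    // both spellings would now map to the same id in vocab.token_to_id
    return 0;
}
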
@@ -13890,13 +13912,9 @@ struct llm_tokenizer_bpe {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_MPT:
-                // TODO: MPT pre-tokenization regexes are unknown
-                // the following are close, but not exact. run the following:
-                // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
-                GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+            case LLAMA_VOCAB_PRE_TYPE_OLMO:
                 regex_exprs = {
-                    "\\s?\\p{L}+",
-                    "\\s?\\p{P}+",
+                    "[ ]{2,24}", // the spaces from the added_tokens are split separately
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
                 break;
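
The effect of the extra "[ ]{2,24}" pattern is that runs of spaces are split off before the usual GPT-2-style pattern runs, which keeps them aligned with the pre-normalized space added_tokens of the neox-style vocabularies. Below is a rough sketch of the expected splitting, assuming unicode_regex_split from unicode.h (the same helper the tokenize() hunk further down calls) and linking against unicode.cpp.

#include <cstdio>
#include <string>
#include <vector>

#include "unicode.h"

int main() {
    // the MPT/OLMo pre-tokenization patterns from the hunk above
    const std::vector<std::string> regex_exprs = {
        "[ ]{2,24}",
        "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
    };

    // hypothetical input: an indentation run followed by ordinary code-like text
    const std::string text = "    def main():";

    for (const auto & piece : unicode_regex_split(text, regex_exprs)) {
        printf("'%s'\n", piece.c_str());
    }
    // expected pieces (roughly): "    ", "def", " main", "():" -- the space run is
    // captured whole by the first pattern instead of one space being glued to "def"
    return 0;
}
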
@@ -13909,7 +13927,6 @@ struct llm_tokenizer_bpe {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_GPT2:
-            case LLAMA_VOCAB_PRE_TYPE_OLMO:
                 regex_exprs = {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
@@ -13985,6 +14002,10 @@ struct llm_tokenizer_bpe {
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;

+        // FIXME: pre-tokenize added_tokens (user-defined tokens) before other pre-tokenization
+        // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
+        // (useful for neox-style tokenizers)
+
         const auto word_collection = unicode_regex_split(text, regex_exprs);

         symbols_final.clear();
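
For reference, the approach the FIXME points at (and which the linked Hugging Face tokenizers code implements) is to split the input on the added/user-defined tokens first, treat those pieces as atomic, and run the regex pre-tokenization only on the text in between. The sketch below is a hypothetical illustration of that idea, not llama.cpp API; the struct and helper name are invented.

#include <cstddef>
#include <string>
#include <vector>

// a pre-tokenized piece; added-token pieces must not be split any further
struct pretok_piece {
    std::string text;
    bool is_added_token;
};

// hypothetical helper: cut the text on occurrences of added (user-defined) tokens,
// preferring the earliest match and, on ties, the longest token
static std::vector<pretok_piece> split_on_added_tokens(
        const std::string & text,
        const std::vector<std::string> & added_tokens) {
    std::vector<pretok_piece> pieces;
    size_t pos = 0;
    while (pos < text.size()) {
        size_t best_pos = std::string::npos;
        size_t best_len = 0;
        for (const auto & tok : added_tokens) {
            if (tok.empty()) {
                continue;
            }
            const size_t p = text.find(tok, pos);
            if (p != std::string::npos && (p < best_pos || (p == best_pos && tok.size() > best_len))) {
                best_pos = p;
                best_len = tok.size();
            }
        }
        if (best_pos == std::string::npos) {
            pieces.push_back({text.substr(pos), false});
            break;
        }
        if (best_pos > pos) {
            pieces.push_back({text.substr(pos, best_pos - pos), false});
        }
        pieces.push_back({text.substr(best_pos, best_len), true});
        pos = best_pos + best_len;
    }
    return pieces;
}

With such a split in place, unicode_regex_split would only run on pieces where is_added_token is false, so added tokens (for example the multi-space indentation tokens of neox-style vocabularies) would reach the BPE step intact regardless of the regex patterns.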