1 file changed: +6 -1 lines changed
Original file line number | Diff line number | Diff line change
@@ -12208,7 +12208,6 @@ struct llm_tokenizer_bpe {
12208
12208
switch (vocab.type_pre) {
12209
12209
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12210
12210
ignore_merges = true;
12211
- case LLAMA_VOCAB_PRE_TYPE_DBRX:
12212
12211
word_collection = unicode_regex_split(text, {
12213
12212
// original regex from tokenizer.json
12214
12213
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12217,6 +12216,12 @@ struct llm_tokenizer_bpe {
12217
12216
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12218
12217
});
12219
12218
break;
12219
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
12220
+ word_collection = unicode_regex_split(text, {
12221
+ // same as llama3
12222
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12223
+ });
12224
+ break;
12220
12225
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12221
12226
word_collection = unicode_regex_split(text, {
12222
12227
"[\r\n]",
You can’t perform that action at this time.
0 commit comments