
Commit 7bfc01b

Add BPE pre-tokenization for Command-R/R+.
1 parent a2ac89d commit 7bfc01b

File tree

5 files changed, +22 -0 lines changed


convert-hf-to-gguf-update.py

Lines changed: 9 additions & 0 deletions
@@ -62,6 +62,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
 ]

 # make directory "models/tokenizers" if it doesn't exist
@@ -102,6 +103,14 @@ def download_file_with_auth(url, token, save_path):
     save_path = f"models/tokenizers/{name}/tokenizer.json"
     download_file_with_auth(url, token, save_path)

+    # if downloaded file is less than 1KB, we likely need to download an LFS instead
+    if os.path.getsize(save_path) < 1024:
+        # remove the file
+        os.remove(save_path)
+        url = f"{repo}/resolve/main/tokenizer.json"
+        save_path = f"models/tokenizers/{name}/tokenizer.json"
+        download_file_with_auth(url, token, save_path)
+
     if tokt == TOKENIZER_TYPE.SPM:
         url = f"{repo}/resolve/main/tokenizer.model"
         save_path = f"models/tokenizers/{name}/tokenizer.model"
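Why 1 KB works as a threshold here: a Git LFS pointer stub is a three-line text file (spec version, sha256 oid, byte size), roughly 130 bytes, while any real tokenizer.json is hundreds of kilobytes. A minimal sketch of the same guard in isolation, with a hypothetical path:

import os

def looks_like_lfs_pointer(path: str) -> bool:
    # Git LFS pointer stubs are tiny text files beginning with the spec line
    # "version https://git-lfs.github.com/spec/v1"; a real tokenizer.json is
    # orders of magnitude larger than 1 KB.
    if os.path.getsize(path) >= 1024:
        return False
    with open(path, "rb") as f:
        return f.read(7) == b"version"

print(looks_like_lfs_pointer("models/tokenizers/command-r/tokenizer.json"))  # hypothetical path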

convert-hf-to-gguf.py

Lines changed: 3 additions & 0 deletions
@@ -308,6 +308,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
+            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
+            res = "command-r"

         if res is None:
             logger.warning("\n")
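The chkhsh values compared here are fingerprints of the pre-tokenizer's behavior, not of the vocabulary file itself: convert-hf-to-gguf-update.py encodes a fixed test string with each model's tokenizer and hashes the resulting token IDs. A minimal sketch of that fingerprinting, assuming transformers is installed (the real script uses a much longer, edge-case-heavy test string):

from hashlib import sha256
from transformers import AutoTokenizer

chktxt = "Hello World 123 \t\n ..."  # stand-in; not the script's actual chktxt

tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
print(chkhsh)  # with the real chktxt this yields the "9c2227e4..." value matched above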

llama.cpp

Lines changed: 8 additions & 0 deletions
@@ -4383,6 +4383,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "gpt-2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                tokenizer_pre == "command-r") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -12240,6 +12243,11 @@ struct llm_tokenizer_bpe {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+                word_collection = unicode_regex_split(text, {
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                });
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
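The Command-R case reuses the GPT-2-style pre-tokenization pattern verbatim: contraction suffixes, optionally space-prefixed letter runs, digit runs, punctuation runs, and trailing whitespace. A quick way to see what a split under this pattern produces, using Python's third-party regex module (which, like the C++ side, understands \p{L}/\p{N} classes):

import regex  # pip install regex; the stdlib `re` lacks \p{...} support

pat = regex.compile(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)")

print(pat.findall("Hello world, it's 2024!"))
# ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']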

llama.h

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MPT       = 5,
         LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
         LLAMA_VOCAB_PRE_TYPE_GPT2      = 7,
+        LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 8,
     };

     // note: these values should be synchronized with ggml_rope
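The GGUF file does not store this enum value directly; it carries the pre-tokenizer name as the tokenizer.ggml.pre metadata string ("command-r"), which llm_load_vocab maps to LLAMA_VOCAB_PRE_TYPE_COMMAND_R at load time, so members can be appended here without a file-format change. A sketch of inspecting that field with the gguf Python package from this repo (the string-decoding details are an assumption about how gguf exposes field parts):

from gguf import GGUFReader  # gguf-py package shipped in the llama.cpp repo

reader = GGUFReader("models/ggml-vocab-command-r.gguf")  # hypothetical local path
field = reader.fields["tokenizer.ggml.pre"]
# string fields store their bytes among the field's parts (assumption)
print(field.parts[field.data[0]].tobytes().decode("utf-8"))  # expected: "command-r"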

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -81,6 +81,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE
 #llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)

 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
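test-tokenizer-0 loads the vocab-only GGUF named in ARGS and compares llama.cpp's tokenization of a fixed input set against ground truth precomputed from the upstream tokenizer. A minimal sketch of producing such reference tokenizations (the probe strings and output format here are illustrative, not the test suite's actual files):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

# illustrative probe strings; the real test set covers many whitespace/Unicode edge cases
for text in ["Hello world", " Hello world", "it's 2024!"]:
    ids = tok.encode(text, add_special_tokens=False)
    print(repr(text), "->", ids)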
