
Commit d53240c

refact : add tokenizer model
1 parent cd7c728

5 files changed: +9 -0 lines changed


convert-hf-to-gguf-update.py

Lines changed: 1 addition & 0 deletions
@@ -56,6 +56,7 @@ class TOKENIZER_TYPE(IntEnum):
     { "name": "mpt",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     { "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     { "name": "gpt-2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    { "name": "refact",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
 ]

 # make directory "models/tokenizers" if it doesn't exist
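Note: convert-hf-to-gguf-update.py walks this model list, fetches each tokenizer from the listed Hugging Face repo into models/tokenizers/<name>, and records a checksum of the tokenizer's output on a fixed probe text so the converter can later identify it. The sketch below is a hedged illustration of that flow, not the script's literal code; the chktxt placeholder, the hashing formula, and the download step are assumptions made for the example.

# hedged sketch: what the update script does with the new "refact" entry (assumptions noted above)
from hashlib import sha256
from transformers import AutoTokenizer

models = [
    { "name": "refact", "tokt": "BPE", "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base" },
]

chktxt = "..."  # placeholder for the script's fixed probe string

for model in models:
    repo_id = model["repo"].removeprefix("https://huggingface.co/")
    tok = AutoTokenizer.from_pretrained(repo_id)  # the real script stores files under models/tokenizers/<name>
    chkhsh = sha256(str(tok.encode(chktxt)).encode()).hexdigest()
    print(model["name"], chkhsh)  # checksum that get_vocab_base_pre() in convert-hf-to-gguf.py compares against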

convert-hf-to-gguf.py

Lines changed: 3 additions & 0 deletions
@@ -306,6 +306,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
+            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
+            res = "refact"

         if res is None:
             print("\n")
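The new branch can be sanity-checked against the hash literal it introduces: computing the same checksum for the Refact tokenizer should reproduce the "6221ad28..." value. A hedged verification sketch, reusing the hashing assumptions from the sketch above (resolve_pre_tokenizer is a hypothetical helper, not part of the converter):

from hashlib import sha256
from transformers import AutoTokenizer

def resolve_pre_tokenizer(repo_id: str, chktxt: str) -> str:
    # mirrors the if-chain in get_vocab_base_pre(): hash the token ids produced for the probe text
    tok = AutoTokenizer.from_pretrained(repo_id)
    chkhsh = sha256(str(tok.encode(chktxt)).encode()).hexdigest()
    known = {
        "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": "gpt-2",
        "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": "refact",
    }
    if chkhsh not in known:
        # same failure mode as the converter: an unknown hash means the update script must be re-run
        raise ValueError(f"unrecognized pre-tokenizer hash: {chkhsh}")
    return known[chkhsh]

# expected: resolve_pre_tokenizer("smallcloudai/Refact-1_6-base", chktxt) == "refact"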

llama.cpp

Lines changed: 4 additions & 0 deletions
@@ -4383,6 +4383,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "gpt-2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                tokenizer_pre == "refact") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -12234,6 +12237,7 @@ struct llm_tokenizer_bpe {
                 });
                 break;
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+            case LLAMA_VOCAB_PRE_TYPE_REFACT:
                 word_collection = unicode_regex_split(text, {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",

llama.h

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MPT        = 5,
         LLAMA_VOCAB_PRE_TYPE_STARCODER  = 6,
         LLAMA_VOCAB_PRE_TYPE_GPT2       = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT     = 8,
     };

     // note: these values should be synchronized with ggml_rope

models/ggml-vocab-refact.gguf

Binary file (44 Bytes), not shown.
