
Commit d53240c

refact : add tokenizer model
1 parent cd7c728

5 files changed: +9 -0 lines changed


convert-hf-to-gguf-update.py

Lines changed: 1 addition & 0 deletions
@@ -56,6 +56,7 @@ class TOKENIZER_TYPE(IntEnum):
     { "name": "mpt",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     { "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     { "name": "gpt-2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    { "name": "refact",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
 ]

 # make directory "models/tokenizers" if it doesn't exist
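Note: convert-hf-to-gguf-update.py walks this model list, fetches each tokenizer from the listed Hugging Face repo into models/tokenizers/<name>, and records a checksum of the tokenizer's output on a fixed probe text so the converter can later identify it. The sketch below is a hedged illustration of that flow, not the script's literal code; the chktxt placeholder, the hashing formula, and the download step are assumptions made for the example.

# hedged sketch: what the update script does with the new "refact" entry (assumptions noted above)
from hashlib import sha256
from transformers import AutoTokenizer

models = [
    { "name": "refact", "tokt": "BPE", "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base" },
]

chktxt = "..."  # placeholder for the script's fixed probe string

for model in models:
    repo_id = model["repo"].removeprefix("https://huggingface.co/")
    tok = AutoTokenizer.from_pretrained(repo_id)  # the real script stores files under models/tokenizers/<name>
    chkhsh = sha256(str(tok.encode(chktxt)).encode()).hexdigest()
    print(model["name"], chkhsh)  # checksum that get_vocab_base_pre() in convert-hf-to-gguf.py compares against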

convert-hf-to-gguf.py

Lines changed: 3 additions & 0 deletions
@@ -306,6 +306,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
+            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
+            res = "refact"

         if res is None:
             print("\n")
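The new branch can be sanity-checked against the hash literal it introduces: computing the same checksum for the Refact tokenizer should reproduce the "6221ad28..." value. A hedged verification sketch, reusing the hashing assumptions from the sketch above (resolve_pre_tokenizer is a hypothetical helper, not part of the converter):

from hashlib import sha256
from transformers import AutoTokenizer

def resolve_pre_tokenizer(repo_id: str, chktxt: str) -> str:
    # mirrors the if-chain in get_vocab_base_pre(): hash the token ids produced for the probe text
    tok = AutoTokenizer.from_pretrained(repo_id)
    chkhsh = sha256(str(tok.encode(chktxt)).encode()).hexdigest()
    known = {
        "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": "gpt-2",
        "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": "refact",
    }
    if chkhsh not in known:
        # same failure mode as the converter: an unknown hash means the update script must be re-run
        raise ValueError(f"unrecognized pre-tokenizer hash: {chkhsh}")
    return known[chkhsh]

# expected: resolve_pre_tokenizer("smallcloudai/Refact-1_6-base", chktxt) == "refact"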

llama.cpp

Lines changed: 4 additions & 0 deletions
@@ -4383,6 +4383,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "gpt-2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                tokenizer_pre == "refact") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -12234,6 +12237,7 @@ struct llm_tokenizer_bpe {
                 });
                 break;
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+            case LLAMA_VOCAB_PRE_TYPE_REFACT:
                 word_collection = unicode_regex_split(text, {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",

llama.h

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MPT        = 5,
         LLAMA_VOCAB_PRE_TYPE_STARCODER  = 6,
         LLAMA_VOCAB_PRE_TYPE_GPT2       = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT     = 8,
     };

     // note: these values should be synchronized with ggml_rope

models/ggml-vocab-refact.gguf

Binary file (44 Bytes), not shown.
