Commit a89252a

Add Viking-7B tokenizer support
1 parent 8f161b2 commit a89252a

4 files changed: +14 −3 lines

4 files changed

+14
-3
lines changed

convert-hf-to-gguf-update.py

Lines changed: 1 addition & 0 deletions
@@ -82,6 +82,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
     {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+    {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Same for 13B and 33B
 ]
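For context: convert-hf-to-gguf-update.py downloads each listed tokenizer and derives a checksum from how it encodes a fixed probe string, and get_vocab_base_pre() in convert-hf-to-gguf.py later matches on that checksum. A minimal sketch of the idea, with a placeholder probe text (the real script uses a long multilingual test string, so the hash printed here will not match the registered value):

    from hashlib import sha256
    from transformers import AutoTokenizer

    # Placeholder probe text -- the real update script uses a long multilingual test string.
    chktxt = "Hello world! Hei maailma! 123 åäö"

    tokenizer = AutoTokenizer.from_pretrained("LumiOpen/Viking-7B")
    chktok = tokenizer.encode(chktxt)

    # The checksum fingerprints the pre-tokenizer's behaviour, not the model weights.
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    print(chkhsh)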

convert-hf-to-gguf.py

Lines changed: 3 additions & 0 deletions
@@ -476,6 +476,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
+        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+            # ref: https://huggingface.co/LumiOpen/Viking-7B
+            res = "viking"
 
         if res is None:
             logger.warning("\n")
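The checksum-to-name dispatch in get_vocab_base_pre() is effectively a lookup table; a hypothetical helper mirroring just the two hashes visible in this diff:

    # Hypothetical helper, not part of the script -- mirrors the if-chain above.
    KNOWN_PRE_TOKENIZERS = {
        "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": "smaug-bpe",  # Smaug-Llama-3-70B-Instruct
        "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": "viking",     # LumiOpen/Viking-7B
    }

    def vocab_pre_from_hash(chkhsh: str) -> str | None:
        # None signals an unrecognized tokenizer, which the converter warns about.
        return KNOWN_PRE_TOKENIZERS.get(chkhsh)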

llama.cpp

Lines changed: 9 additions & 3 deletions
@@ -4549,9 +4549,10 @@ static void llm_load_vocab(
                     tokenizer_pre == "default") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             } else if (
-                    tokenizer_pre == "llama3" ||
-                    tokenizer_pre == "llama-v3" ||
-                    tokenizer_pre == "llama-bpe") {
+                    tokenizer_pre == "llama3" ||
+                    tokenizer_pre == "llama-v3" ||
+                    tokenizer_pre == "llama-bpe" ||
+                    tokenizer_pre == "viking-7b") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             } else if (
                     tokenizer_pre == "deepseek-llm") {

@@ -12580,6 +12581,11 @@ struct llm_tokenizer_bpe {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_VIKING:
+                word_collection = unicode_regex_split(text, {
+                    " ?[^(\\s|[.,!?…。,、।۔،])]+",
+                });
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
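For intuition only: the new LLAMA_VOCAB_PRE_TYPE_VIKING case splits text into runs of characters that are neither whitespace nor one of the listed punctuation marks, with each run optionally keeping its leading space. The Python sketch below uses a simplified stand-in pattern with an abbreviated punctuation set, not the exact pattern from the diff:

    import re

    # Simplified stand-in for the Viking pre-tokenizer split (abbreviated punctuation set).
    viking_like = re.compile(r" ?[^\s.,!?…。،]+")

    print(viking_like.findall("Hei maailma, mitä kuuluu?"))
    # ['Hei', ' maailma', ' mitä', ' kuuluu']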

llama.h

Lines changed: 1 addition & 0 deletions
@@ -86,6 +86,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
         LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+        LLAMA_VOCAB_PRE_TYPE_VIKING = 15,
     };
 
     // note: these values should be synchronized with ggml_rope
