Skip to content

Commit a355afa

Browse files
kustaaya
authored and committed
Added support for Viking pre-tokenizer (ggml-org#8135)
Co-authored-by: kustaaya <[email protected]>
1 parent 72633df commit a355afa

File tree

4 files changed

+14
-0
lines changed

4 files changed

+14
-0
lines changed

convert-hf-to-gguf-update.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ class TOKENIZER_TYPE(IntEnum):
8585
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
8686
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
8787
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
88+
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
8889
]
8990

9091

convert-hf-to-gguf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
487487
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
488488
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
489489
res = "jina-v2-code"
490+
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
491+
# ref: https://huggingface.co/LumiOpen/Viking-7B
492+
res = "viking"
490493

491494
if res is None:
492495
logger.warning("\n")

llama.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5193,6 +5193,9 @@ static void llm_load_vocab(
51935193
} else if (
51945194
tokenizer_pre == "poro-chat") {
51955195
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
5196+
} else if (
5197+
tokenizer_pre == "viking") {
5198+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
51965199
} else {
51975200
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
51985201
}
@@ -14078,6 +14081,12 @@ struct llm_tokenizer_bpe {
1407814081
" ?[^(\\s|.,!?…。,、।۔،)]+",
1407914082
};
1408014083
break;
14084+
case LLAMA_VOCAB_PRE_TYPE_VIKING:
14085+
regex_exprs = {
14086+
"\\p{N}",
14087+
" ?[^(\\s|.,!?…。,、।۔،)]+",
14088+
};
14089+
break;
1408114090
default:
1408214091
// default regex for BPE tokenization pre-processing
1408314092
regex_exprs = {

llama.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ extern "C" {
8888
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
8989
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
9090
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
91+
LLAMA_VOCAB_PRE_TYPE_VIKING = 16,
9192
};
9293

9394
// note: these values should be synchronized with ggml_rope

0 commit comments

Comments (0)