Skip to content

Commit 69f815d

Browse files
committed
Add Viking-7B tokenizer support
1 parent 9afdffe commit 69f815d

File tree

3 files changed

+8
-3
lines changed

3 files changed

+8
-3
lines changed

convert-hf-to-gguf-update.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,7 @@ class TOKENIZER_TYPE(IntEnum):
7777
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
7878
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
7979
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
80+
{"name": "viking-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B" },
8081
]
8182

8283
# make directory "models/tokenizers" if it doesn't exist

convert-hf-to-gguf.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -469,6 +469,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
469469
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
470470
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
471471
res = "jina-v2-de"
472+
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
473+
# ref: https://huggingface.co/LumiOpen/Viking-7B
474+
res = "viking-7b"
472475

473476
if res is None:
474477
logger.warning("\n")

llama.cpp

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4426,9 +4426,10 @@ static void llm_load_vocab(
44264426
tokenizer_pre == "default") {
44274427
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
44284428
} else if (
4429-
tokenizer_pre == "llama3" ||
4430-
tokenizer_pre == "llama-v3" ||
4431-
tokenizer_pre == "llama-bpe") {
4429+
tokenizer_pre == "llama3" ||
4430+
tokenizer_pre == "llama-v3" ||
4431+
tokenizer_pre == "llama-bpe" ||
4432+
tokenizer_pre == "viking-7b") {
44324433
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
44334434
} else if (
44344435
tokenizer_pre == "deepseek-llm") {

0 commit comments

Comments
 (0)