Skip to content

Commit 932ab05

Browse files
committed
Remove qwen and fix mauled imports
1 parent fc0007e commit 932ab05

File tree

4 files changed

+10
-38
lines changed

4 files changed

+10
-38
lines changed

convert-hf-to-gguf-update.py

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ class TOKENIZER_TYPE(IntEnum):
7777
{"name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
7878
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
7979
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
80-
{"name": "qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen-7B", },
8180
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
8281
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
8382
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
@@ -126,28 +125,11 @@ def download_file_with_auth(url, token, save_path):
126125
logger.info(f"Downloading {name} to {model_name_or_path}")
127126

128127
# model and repo urls are not the same
129-
# url = "https://huggingface.co/Qwen/Qwen-tokenizer/raw/main/tokenizer.json"
130-
if name == "qwen": # qwen is an outlier and will raise a FileNotFoundError
131-
# override the tokenizer path
132-
model_tokenizer_path = f"{model_name_or_path}/qwen.tiktoken"
133-
# fetch the qwens BPE tokenizer
134-
download_file_with_auth(
135-
url="https://huggingface.co/Qwen/Qwen-7B/raw/main/qwen.tiktoken",
136-
token=token,
137-
save_path=model_tokenizer_path
138-
)
139-
# fetch qwens tokenizer script; this is required.
140-
download_file_with_auth(
141-
url="https://huggingface.co/Qwen/Qwen-7B/raw/main/tokenization_qwen.py",
142-
token=token,
143-
save_path=f"{model_name_or_path}/tokenization_qwen.py"
144-
)
145-
else: # Get the models tokenizer
146-
download_file_with_auth(
147-
url=f"{url_resolve}/tokenizer.json",
148-
token=token,
149-
save_path=model_tokenizer_path
150-
)
128+
download_file_with_auth(
129+
url=f"{url_resolve}/tokenizer.json",
130+
token=token,
131+
save_path=model_tokenizer_path
132+
)
151133

152134
# Get the models hyper params
153135
download_file_with_auth(

convert-hf-to-gguf.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import sys
1212
from enum import IntEnum
1313
from hashlib import sha256
14+
from pathlib import Path
1415
from typing import (
1516
TYPE_CHECKING,
1617
Any,
@@ -22,16 +23,6 @@
2223
TypeVar,
2324
cast,
2425
)
25-
TYPE_CHECKING,
26-
Any,
27-
Callable,
28-
ContextManager,
29-
Iterable,
30-
Iterator,
31-
Sequence,
32-
TypeVar,
33-
cast,
34-
)
3526

3627
import numpy as np
3728
import torch
@@ -495,9 +486,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
495486
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
496487
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
497488
res = "command-r"
498-
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
499-
# ref: https://huggingface.co/Qwen/Qwen-7B
500-
res = "qwen"
501489
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
502490
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
503491
res = "qwen2"

generate-vocab.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ python3 convert-hf-to-gguf.py models/tokenizers/mixtral-bpe --outfile models/ggm
1818
python3 convert-hf-to-gguf.py models/tokenizers/mixtral-spm --outfile models/ggml-vocab-mixtral-spm.gguf --vocab-only
1919
python3 convert-hf-to-gguf.py models/tokenizers/refact --outfile models/ggml-vocab-refact.gguf --vocab-only
2020
python3 convert-hf-to-gguf.py models/tokenizers/command-r --outfile models/ggml-vocab-command-r.gguf --vocab-only
21-
python3 convert-hf-to-gguf.py models/tokenizers/qwen --outfile models/ggml-vocab-qwen.gguf --vocab-only
2221
python3 convert-hf-to-gguf.py models/tokenizers/qwen2 --outfile models/ggml-vocab-qwen2.gguf --vocab-only
2322
python3 convert-hf-to-gguf.py models/tokenizers/olmo --outfile models/ggml-vocab-olmo.gguf --vocab-only
23+
python3 convert-hf-to-gguf.py models/tokenizers/dbrx --outfile models/ggml-vocab-dbrx.gguf --vocab-only
24+
python3 convert-hf-to-gguf.py models/tokenizers/jina-en --outfile models/ggml-vocab-jina-en.gguf --vocab-only
25+
python3 convert-hf-to-gguf.py models/tokenizers/jina-es --outfile models/ggml-vocab-jina-es.gguf --vocab-only
26+
python3 convert-hf-to-gguf.py models/tokenizers/jina-de --outfile models/ggml-vocab-jina-de.gguf --vocab-only

tests/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE
8484
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
8585
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
8686
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
87-
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen.gguf)
8887
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
8988

9089
# build test-tokenizer-1-bpe target once and add many tests

0 commit comments

Comments (0)