Skip to content

Commit 932ab05

Browse files
committed
Remove qwen and fix mauled imports
1 parent fc0007e commit 932ab05

File tree

4 files changed

+10
-38
lines changed

4 files changed

+10
-38
lines changed

convert-hf-to-gguf-update.py

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ class TOKENIZER_TYPE(IntEnum):
7777
{"name": "mixtral-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1", },
7878
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
7979
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
80-
{"name": "qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen-7B", },
8180
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
8281
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
8382
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
@@ -126,28 +125,11 @@ def download_file_with_auth(url, token, save_path):
126125
logger.info(f"Downloading {name} to {model_name_or_path}")
127126

128127
# model and repo urls are not the same
129-
# url = "https://huggingface.co/Qwen/Qwen-tokenizer/raw/main/tokenizer.json"
130-
if name == "qwen": # qwen is an outlier and will raise a FileNotFoundError
131-
# override the tokenizer path
132-
model_tokenizer_path = f"{model_name_or_path}/qwen.tiktoken"
133-
# fetch the qwens BPE tokenizer
134-
download_file_with_auth(
135-
url="https://huggingface.co/Qwen/Qwen-7B/raw/main/qwen.tiktoken",
136-
token=token,
137-
save_path=model_tokenizer_path
138-
)
139-
# fetch qwens tokenizer script; this is required.
140-
download_file_with_auth(
141-
url="https://huggingface.co/Qwen/Qwen-7B/raw/main/tokenization_qwen.py",
142-
token=token,
143-
save_path=f"{model_name_or_path}/tokenization_qwen.py"
144-
)
145-
else: # Get the models tokenizer
146-
download_file_with_auth(
147-
url=f"{url_resolve}/tokenizer.json",
148-
token=token,
149-
save_path=model_tokenizer_path
150-
)
128+
download_file_with_auth(
129+
url=f"{url_resolve}/tokenizer.json",
130+
token=token,
131+
save_path=model_tokenizer_path
132+
)
151133

152134
# Get the models hyper params
153135
download_file_with_auth(

convert-hf-to-gguf.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import sys
1212
from enum import IntEnum
1313
from hashlib import sha256
14+
from pathlib import Path
1415
from typing import (
1516
TYPE_CHECKING,
1617
Any,
@@ -22,16 +23,6 @@
2223
TypeVar,
2324
cast,
2425
)
25-
TYPE_CHECKING,
26-
Any,
27-
Callable,
28-
ContextManager,
29-
Iterable,
30-
Iterator,
31-
Sequence,
32-
TypeVar,
33-
cast,
34-
)
3526

3627
import numpy as np
3728
import torch
@@ -495,9 +486,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
495486
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
496487
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
497488
res = "command-r"
498-
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
499-
# ref: https://huggingface.co/Qwen/Qwen-7B
500-
res = "qwen"
501489
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
502490
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
503491
res = "qwen2"

generate-vocab.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ python3 convert-hf-to-gguf.py models/tokenizers/mixtral-bpe --outfile models/ggm
1818
python3 convert-hf-to-gguf.py models/tokenizers/mixtral-spm --outfile models/ggml-vocab-mixtral-spm.gguf --vocab-only
1919
python3 convert-hf-to-gguf.py models/tokenizers/refact --outfile models/ggml-vocab-refact.gguf --vocab-only
2020
python3 convert-hf-to-gguf.py models/tokenizers/command-r --outfile models/ggml-vocab-command-r.gguf --vocab-only
21-
python3 convert-hf-to-gguf.py models/tokenizers/qwen --outfile models/ggml-vocab-qwen.gguf --vocab-only
2221
python3 convert-hf-to-gguf.py models/tokenizers/qwen2 --outfile models/ggml-vocab-qwen2.gguf --vocab-only
2322
python3 convert-hf-to-gguf.py models/tokenizers/olmo --outfile models/ggml-vocab-olmo.gguf --vocab-only
23+
python3 convert-hf-to-gguf.py models/tokenizers/dbrx --outfile models/ggml-vocab-dbrx.gguf --vocab-only
24+
python3 convert-hf-to-gguf.py models/tokenizers/jina-en --outfile models/ggml-vocab-jina-en.gguf --vocab-only
25+
python3 convert-hf-to-gguf.py models/tokenizers/jina-es --outfile models/ggml-vocab-jina-es.gguf --vocab-only
26+
python3 convert-hf-to-gguf.py models/tokenizers/jina-de --outfile models/ggml-vocab-jina-de.gguf --vocab-only

tests/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE
8484
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
8585
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
8686
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
87-
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen.gguf)
8887
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
8988

9089
# build test-tokenizer-1-bpe target once and add many tests

0 commit comments

Comments (0)