
Commit ae342c7

Revert "Revert "BERT tokenizer fixes (ggml-org#6498)""
This reverts commit 82e6483.
1 parent 65d8bc1 commit ae342c7

File tree

20 files changed, +524 -249 lines

common/common.cpp

Lines changed: 8 additions & 8 deletions
@@ -2821,23 +2821,23 @@ void llama_batch_add(
 std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
-        bool add_bos,
-        bool special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
+        bool add_special,
+        bool parse_special) {
+    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }
 
 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
-        bool add_bos,
-        bool special) {
+        bool add_special,
+        bool parse_special) {
     // upper limit for the number of tokens
-    int n_tokens = text.length() + add_bos;
+    int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
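
Note on the rename: add_special controls whether the model's special BOS/EOS tokens are added around the input (hence the upper bound grows by 2 * add_special rather than add_bos), while parse_special controls whether special-token text inside the input is parsed into token ids. The sketch below mirrors the same call-twice-and-resize pattern in Python; raw_tokenize is a hypothetical stand-in for the C llama_tokenize call, assumed to return the negative of the required length when the output buffer is too small.

def tokenize(raw_tokenize, text: str, add_special: bool, parse_special: bool) -> list[int]:
    # upper limit: at most one token per byte, plus BOS and EOS when add_special is set
    n_tokens = len(text) + 2 * add_special
    buf = [0] * n_tokens
    n_tokens = raw_tokenize(text, buf, len(buf), add_special, parse_special)
    if n_tokens < 0:
        # hypothetical convention: a negative result is the required buffer size
        buf = [0] * -n_tokens
        check = raw_tokenize(text, buf, len(buf), add_special, parse_special)
        assert check == -n_tokens
        return buf
    return buf[:n_tokens]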

common/common.h

Lines changed: 4 additions & 4 deletions
@@ -312,14 +312,14 @@ void llama_batch_add(
 std::vector<llama_token> llama_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
-        bool add_bos,
-        bool special = false);
+        bool add_special,
+        bool parse_special = false);
 
 std::vector<llama_token> llama_tokenize(
         const struct llama_model * model,
         const std::string & text,
-        bool add_bos,
-        bool special = false);
+        bool add_special,
+        bool parse_special = false);
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`

convert-hf-to-gguf.py

Lines changed: 58 additions & 93 deletions
@@ -369,14 +369,27 @@ def from_model_architecture(cls, arch: str) -> type[Model]:
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
 
+    def _is_model_safetensors(self) -> bool:
+        return Model.count_model_parts(self.dir_model, ".safetensors") > 0
+
+    def _get_part_names(self):
+        if self.is_safetensors:
+            if self.num_parts == 1:  # there's only one .safetensors file
+                return ("model.safetensors",)
+            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
+
+        if self.num_parts == 1:  # there's only one .bin file
+            return ("pytorch_model.bin",)
+        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
+
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
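
For reference, this is the Hugging Face shard-name scheme that the restored _get_part_names() generator reproduces, shown as a standalone snippet (the part count and printed names are illustrative, not taken from a real model):

num_parts = 3  # example value; the real count comes from the model directory
names = [f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)]
print(names)
# ['model-00001-of-00003.safetensors', 'model-00002-of-00003.safetensors', 'model-00003-of-00003.safetensors']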
@@ -403,7 +416,6 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     # do not modify it manually!
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
-    # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
@@ -415,14 +427,13 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         chktok = tokenizer.encode(chktxt)
         chkhsh = sha256(str(chktok).encode()).hexdigest()
 
-        logger.debug(f"chktok: {chktok}")
-        logger.debug(f"chkhsh: {chkhsh}")
+        print(f"chktok: {chktok}")
+        print(f"chkhsh: {chkhsh}")
 
         res = None
 
-        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
-        # or pull the latest version of the model from Huggingface
-        # don't edit the hashes manually!
+        # NOTE: if you get an error here, you need to add the model to the if-elif chain below
+        # don't do this manually - use the convert-hf-to-gguf-update.py script!
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -447,60 +458,23 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
-        if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
-            # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
-            res = "stablelm2"
-        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
-            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
-            res = "refact"
-        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
-            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
-            res = "command-r"
-        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
-            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
-            res = "qwen2"
-        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
-            # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
-            res = "olmo"
-        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
-            # ref: https://huggingface.co/databricks/dbrx-base
-            res = "dbrx"
-        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
-            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
-            res = "jina-v2-en"
-        if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
-            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
-            res = "jina-v2-es"
-        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
-            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
-            res = "jina-v2-de"
-        if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
-            # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
-            res = "smaug-bpe"
-        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
-            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
-            res = "jina-v2-code"
 
         if res is None:
-            logger.warning("\n")
-            logger.warning("**************************************************************************************")
-            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
-            logger.warning("** There are 2 possible reasons for this:")
-            logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
-            logger.warning("** - the pre-tokenization config has changed upstream")
-            logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
-            logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
-            logger.warning("**")
-            logger.warning(f"** chkhsh: {chkhsh}")
-            logger.warning("**************************************************************************************")
-            logger.warning("\n")
+            print("\n")
+            print("**************************************************************************************")
+            print("** WARNING: The BPE pre-tokenizer was not recognized!")
+            print("** This means that it was not added yet or you are using an older version.")
+            print("** Check convert-hf-to-gguf-update.py and update it accordingly.")
+            print("**")
+            print(f"** chkhsh: {chkhsh}")
+            print("**************************************************************************************")
+            print("\n")
             raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
 
-        logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
-        logger.debug(f"chkhsh: {chkhsh}")
+        print(f"tokenizer.ggml.pre: {res}")
+        print(f"chkhsh: {chkhsh}")
 
         return res
-    # Marker: End get_vocab_base_pre
 
     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()
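
The pre-tokenizer detection above works by tokenizing a fixed check string and hashing the resulting token ids, so two models resolve to the same res only if they pre-tokenize identically. A minimal standalone sketch of that fingerprinting idea (the model name and check string are placeholders, not the ones the script uses):

from hashlib import sha256
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # example model
chktxt = "Hello world \u2581 test 123"  # placeholder; the real script uses a longer fixed string
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)  # compare against the known hashes in the if-chain above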
@@ -509,7 +483,7 @@ def _set_vocab_gpt2(self) -> None:
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_qwen(self):
@@ -2163,35 +2137,26 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        # use huggingface vocab to get all tokens
-        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
-        tokens, scores, toktypes = zip(*vocab.all_tokens())
-        assert len(tokens) == vocab.vocab_size
-        self.vocab_size = vocab.vocab_size
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.vocab_size = len(tokens)
 
         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
-        n_token_types = len(set(toktypes))
-        self.gguf_writer.add_token_type_count(n_token_types)
+        self.gguf_writer.add_token_type_count(2)  # "Sequence A" or "Sequence B"
 
         # convert to phantom space vocab
-        def phantom(tok, typ):
-            if tok.startswith(b"[") and tok.endswith(b"]"):
+        def phantom(tok):
+            if tok.startswith("[") and tok.endswith("]"):
                 return tok
-            if tok.startswith(b"##"):
+            if tok.startswith("##"):
                 return tok[2:]
-            return b"\xe2\x96\x81" + tok
-        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
-
-        # set up bos and eos tokens (cls and sep)
-        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
-        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
+            return "\u2581" + tok
+        tokens = list(map(phantom, tokens))
 
         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
 
         # handle special tokens
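
The phantom-space conversion above rewrites BERT's WordPiece vocabulary into the SentencePiece-style convention the rest of llama.cpp expects: word-initial pieces gain a U+2581 "phantom space" prefix, continuation pieces lose their "##" marker, and bracketed special tokens pass through unchanged. A quick illustration on made-up tokens:

def phantom(tok: str) -> str:
    if tok.startswith("[") and tok.endswith("]"):
        return tok           # special tokens like [CLS] / [SEP] are left alone
    if tok.startswith("##"):
        return tok[2:]       # continuation pieces drop the "##" marker
    return "\u2581" + tok    # word-initial pieces get the phantom space

print([phantom(t) for t in ["[CLS]", "hello", "##ing", "[SEP]"]])
# ['[CLS]', '▁hello', 'ing', '[SEP]']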
@@ -2237,16 +2202,6 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
 
-    def get_tensors(self):
-        assert self.vocab_size is not None
-        for name, data in super().get_tensors():
-            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
-            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
-                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
-                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
-                data = data[:self.vocab_size, :]
-            yield name, data
-
 
 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
@@ -2409,15 +2364,25 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
     def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
         del n_dims  # unused
 
-        return bid is not None and new_name in (
-            self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
-                gguf.MODEL_TENSOR.SSM_CONV1D,
-                gguf.MODEL_TENSOR.SSM_X,
-                gguf.MODEL_TENSOR.SSM_DT,
-                gguf.MODEL_TENSOR.SSM_A,
-                gguf.MODEL_TENSOR.SSM_D,
-            ]
-        )
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert big float32 2-dim weight tensors to float16
+            new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
+            if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
 
 
 @Model.register("CohereForCausalLM")
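
The restored block encodes the converter's usual dtype policy: with f32 output (ftype 0) everything is upcast to float32; with f16 output (ftype 1), 1-D tensors stay float32 and only large 2-D weights of selected tensors are downcast to float16. A standalone sketch of that decision rule (tensor names and shapes are made up for illustration):

import numpy as np

def choose_dtype(ftype: int, name: str, data: np.ndarray) -> np.ndarray:
    if ftype == 0 and data.dtype == np.float16:
        return data.astype(np.float32)   # f32 output: upcast everything
    if ftype == 1 and data.dtype == np.float16 and data.ndim == 1:
        return data.astype(np.float32)   # f16 output: 1-D tensors still stored as f32
    if ftype == 1 and data.dtype == np.float32 and data.ndim == 2 and name.endswith(("token_embd", "output")):
        return data.astype(np.float16)   # f16 output: big 2-D weights go to f16
    return data

print(choose_dtype(1, "token_embd", np.zeros((8, 8), dtype=np.float32)).dtype)  # float16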

convert-persimmon-to-gguf.py

Lines changed: 2 additions & 0 deletions
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+from __future__ import annotations
+
 import argparse
 import os
 import sys
