Commit afc4ca2

convert : update convert-new.py with tokenizer fixes (#2614)
* Merge tokenizer fixes into the gguf branch.
* Add test vocabularies
* Adapt convert-new.py (and fix a clang-cl compiler error on Windows)
1 parent ec1b100 commit afc4ca2

File tree: 2 files changed (+61, -45 lines)

convert-new.py

Lines changed: 60 additions & 44 deletions
@@ -205,25 +205,58 @@ def load(model_plus: 'ModelPlus') -> 'Params':
         return params


-class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
-        self.vocabtype = vocabtype
-        if self.vocabtype == "bpe":
-            self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
-        else:
-            self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
-
+class BpeVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         added_tokens: Dict[str, int]
         if fname_added_tokens is not None:
-            added_tokens = json.load(open(fname_added_tokens))
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
+        vocab_size: int = len(self.bpe_tokenizer)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens

-        if self.vocabtype == "bpe":
-            vocab_size: int = len(self.sentencepiece_tokenizer)
-        else:
-            vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+    def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        tokenizer = self.bpe_tokenizer
+        from transformers.models.gpt2 import tokenization_gpt2
+        byte_encoder = tokenization_gpt2.bytes_to_unicode()
+        byte_decoder = {v: k for k, v in byte_encoder.items()}
+        for i, item in enumerate(tokenizer):
+            text: bytes = item.encode("utf-8")
+            score: float = -i
+            yield text, score

+    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: Dict[str, int]
+        if fname_added_tokens is not None:
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+        else:
+            added_tokens = {}
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
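The ID check this hunk ends on is now enforced by both vocab classes: added_tokens.json must continue numbering directly after the base vocabulary, with no gaps. A toy illustration of that contract (the values below are made up, not from the diff):

```python
# Hypothetical example of the added-token ID contract enforced in __init__.
vocab_size = 32000                                   # assumed base vocabulary size
added_tokens = {"<pad>": 32000, "<extra_0>": 32001}  # assumed added_tokens.json contents

expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
assert expected_ids == actual_ids  # convert-new.py raises an Exception on mismatch
```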
@@ -238,32 +271,11 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vo

     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
-        if self.vocabtype == "bpe":
-            from transformers.models.gpt2 import tokenization_gpt2
-            byte_encoder = tokenization_gpt2.bytes_to_unicode()
-            byte_decoder = {v: k for k, v in byte_encoder.items()}
-            for i, item in enumerate(tokenizer):
-                text: bytes
-                text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
-                score: float = -i
-                yield text, score
-        else:
-            for i in range(tokenizer.vocab_size()):
-                text: bytes
-                if tokenizer.is_unknown(i):
-                    text = " \u2047 ".encode("utf-8")
-                elif tokenizer.is_control(i):
-                    text = b""
-                elif tokenizer.is_byte(i):
-                    piece = tokenizer.id_to_piece(i)
-                    if len(piece) != 6:
-                        raise Exception(f"Invalid token: {piece}")
-                    byte_value = int(piece[3:-1], 16)
-                    text = struct.pack("B", byte_value)
-                else:
-                    text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-                score: float = tokenizer.get_score(i)
-                yield text, score
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.id_to_piece(i)
+            text: bytes = piece.encode("utf-8")
+            score: float = tokenizer.get_score(i)
+            yield text, score

     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
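Both the BPE branch deleted here and the new BpeVocab.bpe_tokens() build their byte table from transformers' GPT-2 tokenizer module. A small standalone sanity sketch of that mapping, assuming the transformers package is installed (this snippet is illustrative and not part of the commit):

```python
from transformers.models.gpt2 import tokenization_gpt2

byte_encoder = tokenization_gpt2.bytes_to_unicode()      # maps each byte value to a printable unicode char
byte_decoder = {v: k for k, v in byte_encoder.items()}   # inverse mapping, as built in the converter

assert len(byte_encoder) == 256
# The table is a bijection: every byte value round-trips through it.
assert all(byte_decoder[byte_encoder[b]] == b for b in range(256))
```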
@@ -278,7 +290,7 @@ def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-Vocab = Union[SentencePieceVocab]
+Vocab = Union[BpeVocab, SentencePieceVocab]


 def permute(weights: NDArray, n_head: int) -> NDArray:
@@ -679,7 +691,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc

 def check_vocab_size(params: Params, vocab: Vocab) -> None:
     if params.n_vocab != vocab.vocab_size:
-        assert isinstance(vocab, SentencePieceVocab)
+        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
         if params.n_vocab == vocab.vocab_size_base:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
             vocab.added_tokens_list = []
@@ -853,7 +865,7 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
     return {name: model[name] for name in TENSORS_LIST if name in model}


-def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
+def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, SentencePieceVocab]:
     print(f"vocabtype: {vocabtype}")
     # Be extra-friendly and accept either a file or a directory.  Also, if it's
     # a directory, it might be the model directory, and tokenizer.model might
@@ -875,8 +887,12 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
                         "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
-                              vocabtype)
+    if vocabtype == "bpe":
+        return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    elif vocabtype == "spm":
+        return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    else:
+        raise ValueError(f"Unsupported vocabulary type {vocabtype}")


 def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
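For context, a minimal sketch of how the new load_vocab() dispatch might be exercised from within convert-new.py; the model path is a placeholder and the rest of the script's imports are assumed to be in scope (this is an illustration, not code from the commit):

```python
from pathlib import Path

model_dir = Path("models/7B")      # hypothetical model directory

# "spm" selects SentencePieceVocab (tokenizer.model); "bpe" selects BpeVocab.
vocab = load_vocab(model_dir, "spm")
print(vocab)                       # e.g. "<SentencePieceVocab with ... base tokens and ... added tokens>"

# Both vocab classes expose the same iteration interface of (text, score) pairs.
for text, score in vocab.all_tokens():
    pass                           # e.g. hand each pair to the output writer
```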

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
@@ -307,7 +307,7 @@ int main(int argc, char ** argv) {
         auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
             return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
         };
-        SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif

     fprintf(stderr, "%s: interactive mode on.\n", __func__);
