Commit 04aad94

Author: jaime-m-p
Commit message: Update brute force test: special tokens
1 parent: 059031b

File tree

1 file changed: +32 -3 lines changed


tests/test-tokenizer-random.py (32 additions & 3 deletions)
@@ -153,11 +153,23 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
         'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
         '\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
         'Cửa Việt', # llama-3, ignore_merges = true
-        '<s>a', # TODO: Phi-3 fail
+        '<s>a',                 # Phi-3 fail
+        '<unk><|endoftext|><s>' # Phi-3 fail
         'a\na', # TODO: Bert fail
     ]


+def generator_random_special_tokens(special_tokens:list[str], iterations=100) -> Iterator[str]:
+    special_tokens = set(special_tokens)
+    special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
+    special_tokens = list(sorted(special_tokens))
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        words = rand.choices(special_tokens, k=500)
+        yield "".join(words)
+
+
 def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
     """Brute force check all vocab words"""
     yield from vocab
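
For context, the new generator_random_special_tokens() concatenates 500 randomly chosen pieces per iteration, drawn from the model's special tokens plus a handful of ordinary words and whitespace, so the brute-force comparison now also stresses special-token boundaries. Below is a minimal, hypothetical sketch of how such a generator can be consumed; the compare() helper and the two tokenizer callables are illustrative stand-ins, not the script's actual test_compare_tokenizer:

    # Hypothetical harness (illustration only): run two tokenizer callables over the
    # generated strings and report the first disagreement.
    from typing import Callable, Iterator

    def compare(tokenize1: Callable[[str], list[int]],
                tokenize2: Callable[[str], list[int]],
                texts: Iterator[str]) -> None:
        for i, text in enumerate(texts):
            ids1 = tokenize1(text)
            ids2 = tokenize2(text)
            if ids1 != ids2:
                print(f"mismatch at iteration {i}: {ids1[:10]} vs {ids2[:10]}")
                return
        print("all generated texts tokenized identically")
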
@@ -289,14 +301,31 @@ def func_tokenize1(text: str):
     vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
+    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer.all_special_tokens, 10_000))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 10_000))
+    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
     # test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL

     model.free()


 if __name__ == "__main__":
-    main()
+    # main()
+
+    path_tokenizers = "./models/tokenizers/"
+    path_vocab_format = "./models/ggml-vocab-%s.gguf"
+
+    # import os
+    # tokenizers = os.listdir(path_tokenizers)
+    tokenizers = [
+        "llama-spm",  # SPM
+        "phi-3",      # SPM
+    ]
+
+    for tokenizer in tokenizers:
+        print("\n" + "=" * 50 + "\n" + tokenizer + "\n") # noqa
+        vocab_file = path_vocab_format % tokenizer
+        dir_tokenizer = path_tokenizers + "/" + tokenizer
+        main([vocab_file, dir_tokenizer, "--verbose"])
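
The rewritten __main__ block hardcodes two SPM vocabularies, while the commented-out os.listdir() lines hint at iterating over every downloaded tokenizer instead. A hedged sketch of that variant follows; the directory filter and the GGUF-existence check are assumptions layered on top of the commit, and the path layout (./models/tokenizers/<name>/ plus ./models/ggml-vocab-<name>.gguf) is taken from the diff:

    # Assumed variant: discover tokenizer folders on disk instead of hardcoding them.
    # main() is the script's own entry point, invoked the same way as in the diff above.
    import os

    path_tokenizers = "./models/tokenizers/"
    path_vocab_format = "./models/ggml-vocab-%s.gguf"

    tokenizers = [name for name in sorted(os.listdir(path_tokenizers))
                  if os.path.isdir(os.path.join(path_tokenizers, name))]

    for name in tokenizers:
        vocab_file = path_vocab_format % name
        if not os.path.isfile(vocab_file):
            continue  # skip tokenizers without a converted GGUF vocab
        print("\n" + "=" * 50 + "\n" + name + "\n")
        main([vocab_file, path_tokenizers + "/" + name, "--verbose"])
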
