@@ -153,11 +153,23 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
         'Ⅵ-a',       # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
         '\uFEFF//',   # unicode_ranges_control, 0xFEFF (BOM)
         'Cửa Việt',   # llama-3, ignore_merges = true
-        '<s>a',       # TODO: Phi-3 fail
+        '<s>a',       # Phi-3 fail
+        '<unk><|endoftext|><s>',  # Phi-3 fail
         'a\n a',      # TODO: Bert fail
     ]


+def generator_random_special_tokens(special_tokens: list[str], iterations=100) -> Iterator[str]:
+    special_tokens = set(special_tokens)
+    special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
+    special_tokens = list(sorted(special_tokens))
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        words = rand.choices(special_tokens, k=500)
+        yield "".join(words)
+
+
 def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
     """Brute force check all vocab words"""
     yield from vocab
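Note on the new generator: each iteration reseeds its own random.Random with the loop index, so run m always produces the same 500-token string and any tokenizer mismatch is reproducible. A minimal standalone sketch of the same idea (the helper name and the sample token list below are illustrative, not part of the patch):

import random
from typing import Iterator

def sketch_random_special_tokens(special_tokens: list[str], iterations=100) -> Iterator[str]:
    # Same shape as the patched generator: dedupe, add filler tokens, sort for a stable order.
    tokens = sorted(set(special_tokens) | {" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"})
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)  # reseed per iteration -> deterministic, reproducible output
        yield "".join(rand.choices(tokens, k=500))

# Illustrative special tokens; a real run would pass tokenizer.all_special_tokens instead.
first = next(sketch_random_special_tokens(["<s>", "</s>", "<unk>", "<|endoftext|>"], iterations=1))
assert len(first) >= 500  # 500 draws, each drawn token at least one character long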
@@ -289,14 +301,31 @@ def func_tokenize1(text: str):
     vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
+    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer.all_special_tokens, 10_000))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 10_000))
+    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
     # test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000))  # FAIL

     model.free()


 if __name__ == "__main__":
-    main()
+    # main()
+
+    path_tokenizers = "./models/tokenizers/"
+    path_vocab_format = "./models/ggml-vocab-%s.gguf"
+
+    # import os
+    # tokenizers = os.listdir(path_tokenizers)
+    tokenizers = [
+        "llama-spm",  # SPM
+        "phi-3",      # SPM
+    ]
+
+    for tokenizer in tokenizers:
+        print("\n" + "=" * 50 + "\n" + tokenizer + "\n")  # noqa
+        vocab_file = path_vocab_format % tokenizer
+        dir_tokenizer = path_tokenizers + "/" + tokenizer
+        main([vocab_file, dir_tokenizer, "--verbose"])
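For orientation, test_compare_tokenizer is defined outside this hunk; conceptually it encodes every generated string with both tokenize functions and reports where the resulting token-id sequences diverge. A hedged sketch of that comparison loop (the body below is an assumption for illustration, not the file's actual implementation):

from typing import Callable, Iterator

def compare_tokenizers_sketch(func_tokenize1: Callable[[str], list[int]],
                              func_tokenize2: Callable[[str], list[int]],
                              generator: Iterator[str]) -> None:
    # Assumed behaviour: run both tokenizers on each generated text and
    # print the first index where the token ids differ (or the length mismatch).
    for text in generator:
        ids1 = func_tokenize1(text)
        ids2 = func_tokenize2(text)
        if ids1 != ids2:
            i = next((k for k, (a, b) in enumerate(zip(ids1, ids2)) if a != b),
                     min(len(ids1), len(ids2)))
            print(f"TokenIDs mismatch at index {i}: {ids1[i:i+5]} != {ids2[i:i+5]}")

In this test the two functions appear to wrap the two implementations being compared (the GGUF vocab loaded by the model vs. the reference HF tokenizer), so any printed mismatch flags a tokenization difference between them.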