@@ -167,13 +167,13 @@ def generator_random_chars(iterations = 100) -> Iterator[str]:
     """Brute force random text with simple characters"""
 
     WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
-    CHARS = list(set("""
+    CHARS = list(sorted(set("""
        ABCDEFGHIJKLMNOPQRSTUVWXYZ
        abcdefghijklmnopqrstuvwxyz
        ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
        áéíóúàèìòùâêîôûäëïöü
        .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
-    """))
+    """)))
 
     rand = random.Random()
     for m in range(iterations):
@@ -194,7 +194,7 @@ def generator_random_vocab_chars(vocab: list[str], iterations = 100) -> Iterator
     vocab_chars = set()
     for word in vocab:
         vocab_chars.update(word)
-    vocab_chars = list(vocab_chars)
+    vocab_chars = list(sorted(vocab_chars))
 
     rand = random.Random()
     for m in range(iterations):
@@ -260,7 +260,7 @@ def find_first_mismatch(ids1: list[int], ids2: list[int]):
             ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
             ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
             text2 = tokenizer.decode(ids2, skip_special_tokens=True)
-            assert (text2 in text)
+            # assert (text2 in text)
             logger.info(" Text: " + repr(text2))
             logger.info(" TokenIDs: " + str(ids1))
             logger.info(" Expected: " + str(ids2))
@@ -288,7 +288,7 @@ def func_tokenize2(text:str):
     def func_tokenize1(text:str):
         return model.tokenize(text, add_special=False, parse_special=parse_special)
 
-    vocab = tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)
+    vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))