
Commit 707a08d

Author: jaime-m-p
Commit message: Deterministic brute force random test
1 parent: bb205ee

File tree: 1 file changed (+5, -5 lines)


tests/test-tokenizer-random.py

Lines changed: 5 additions & 5 deletions
@@ -167,13 +167,13 @@ def generator_random_chars(iterations = 100) -> Iterator[str]:
     """Brute force random text with simple characters"""
 
     WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
-    CHARS = list(set("""
+    CHARS = list(sorted(set("""
         ABCDEFGHIJKLMNOPQRSTUVWXYZ
         abcdefghijklmnopqrstuvwxyz
         ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
         áéíóúàèìòùâêîôûäëïöü
         .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
-    """))
+    """)))
 
     rand = random.Random()
     for m in range(iterations):
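Why sorting the character pool matters: `str` hashes are randomized per Python process, so iterating a `set` of characters can yield a different order on each run, and a seeded `Random` then draws different characters even with the same seed. A minimal standalone sketch of the idea (illustrative names, not the test's code):

    # Standalone sketch, not part of the test file: sorting the pool pins its
    # order, so a fixed seed reproduces the same "random" text on every run.
    import random

    chars = set("ABCabc .,-\n\t")     # pool built like CHARS; set order is not guaranteed
    pool = sorted(chars)              # deterministic ordering across runs
    rand = random.Random(0)           # fixed seed for the sketch; the test's seeding is not shown in this diff
    text = "".join(rand.choices(pool, k=16))
    print(repr(text))                 # identical output on every run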
@@ -194,7 +194,7 @@ def generator_random_vocab_chars(vocab: list[str], iterations = 100) -> Iterator
     vocab_chars = set()
     for word in vocab:
         vocab_chars.update(word)
-    vocab_chars = list(vocab_chars)
+    vocab_chars = list(sorted(vocab_chars))
 
     rand = random.Random()
     for m in range(iterations):
@@ -260,7 +260,7 @@ def find_first_mismatch(ids1: list[int], ids2: list[int]):
         ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
         ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
         text2 = tokenizer.decode(ids2, skip_special_tokens=True)
-        assert (text2 in text)
+        #assert (text2 in text)
         logger.info(" Text: " + repr(text2))
         logger.info(" TokenIDs: " + str(ids1))
         logger.info(" Expected: " + str(ids2))
@@ -288,7 +288,7 @@ def func_tokenize2(text:str):
     def func_tokenize1(text:str):
         return model.tokenize(text, add_special=False, parse_special=parse_special)
 
-    vocab = tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)
+    vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
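Sorting the decoded vocabulary serves the same purpose for the word-based generators: sampling from `vocab` no longer depends on the order in which `tokenizer.get_vocab()` happens to return its entries. A standalone sketch, with a toy dict standing in for the real tokenizer's vocab:

    # Standalone sketch (toy dict in place of tokenizer.get_vocab(); the real
    # test decodes the id values with tokenizer.batch_decode).  Sorting the
    # decoded strings lets a seeded Random sample the same words every run.
    import random

    toy_vocab = {"world": 7, "hello": 3, "!": 11}     # token string -> id
    vocab = sorted(toy_vocab.keys())                  # ['!', 'hello', 'world']
    rand = random.Random(1234)
    print(" ".join(rand.choices(vocab, k=5)))         # reproducible across runs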
