Commit 7761f8e

Author: jaime-m-p

Ignore special tokens for testing
1 parent def3d13 commit 7761f8e

File tree: 1 file changed, +17 −14 lines


tests/test-tokenizer-random-bpe.py

Lines changed: 17 additions & 14 deletions
@@ -96,7 +96,9 @@ def find_first_mismatch(ids1: list[int], ids2: list[int]):
     for i, (a,b) in enumerate(zip(ids1, ids2)):
         if a != b:
             return i
-    return -1 if len(ids1) == len(ids2) else i
+    if len(ids1) == len(ids2):
+        return -1
+    return min(len(ids1), len(ids2))
 
 
 def test_custom_texts(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase):
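Aside (not part of the commit): the rewritten helper now returns a usable mismatch position when one id list is a strict prefix of the other. The old one-liner returned the stale loop index i in that case, and raised NameError when one input was empty and the other was not. A minimal sketch of the new behavior:

    def find_first_mismatch(ids1: list[int], ids2: list[int]):
        for i, (a, b) in enumerate(zip(ids1, ids2)):
            if a != b:
                return i                          # first differing position
        if len(ids1) == len(ids2):
            return -1                             # sequences are identical
        return min(len(ids1), len(ids2))          # shorter list is a strict prefix

    assert find_first_mismatch([1, 2, 3], [1, 2, 3]) == -1
    assert find_first_mismatch([1, 2, 3], [1, 9, 3]) == 1
    assert find_first_mismatch([1, 2], [1, 2, 3]) == 2   # the old code returned 1 here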
@@ -152,11 +154,12 @@ def test_custom_texts(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase):
         'a 〇b', # unicode_ranges_digit, 0x3007
         'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
         '\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
+        '<s>a' # TODO: Phi-3 fail
     ]
 
     for text in tests + more_tests:
-        ids1 = model.tokenize(text, parse_special=True)
-        ids2 = tokenizer.encode(text)
+        ids1 = model.tokenize(text, add_special=False, parse_special=False)
+        ids2 = tokenizer.encode(text, add_special_tokens=False)
         logger.info(repr(text))
         if ids1 != ids2:
             logger.info(" TokenIDs: " + str(list(ids1)))
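For context, a hedged illustration of why special-token handling matters for the new '<s>a' case (the checkpoint path and the token ids below are hypothetical, not from the commit):

    # Assumes a Llama-style checkpoint whose tokenizer defines '<s>' as BOS.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("path/to/llama-style-model")  # hypothetical path

    # By default encode() prepends BOS, and a literal '<s>' in the text can
    # also be matched against the tokenizer's added-tokens table:
    print(tok.encode("<s>a"))                            # e.g. [1, 1, 264] (hypothetical ids)

    # add_special_tokens=False suppresses the automatic BOS/EOS insertion,
    # but many HF tokenizers still match '<s>' as a single special token:
    print(tok.encode("<s>a", add_special_tokens=False))  # e.g. [1, 264] (hypothetical ids)

With parse_special=False, llama.cpp instead tokenizes '<s>' as the plain characters '<', 's', '>', so the two sides can still disagree on such inputs; presumably that is why '<s>a' carries the "TODO: Phi-3 fail" note.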
@@ -165,7 +168,7 @@ def test_custom_texts(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase):
             raise Exception()
 
 
-def test_random_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations=100):
+def test_random_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations = 100):
 
     WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
     CHARS = list(set("""
@@ -192,12 +195,12 @@ def test_random_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase,
             text.append("".join(word) + space)
         text = "".join(text)
 
-        ids1 = model.tokenize(text, parse_special=True)
-        ids2 = tokenizer.encode(text)
+        ids1 = model.tokenize(text, add_special=False, parse_special=False)
+        ids2 = tokenizer.encode(text, add_special_tokens=False)
         assert(ids1 == ids2)
 
 
-def test_random_vocab_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations=100):
+def test_random_vocab_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations = 100):
 
     logger.info("Building vocab char list ...")
     vocab_ids = list(tokenizer.vocab.values())
@@ -215,8 +218,8 @@ def test_random_vocab_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizer
         text = rand.choices(vocab_chars, k=1024)
         text = "".join(text)
 
-        ids1 = model.tokenize(text, parse_special=True)
-        ids2 = tokenizer.encode(text)
+        ids1 = model.tokenize(text, add_special=False, parse_special=False)
+        ids2 = tokenizer.encode(text, add_special_tokens=False)
         assert(ids1 == ids2)
 
 
@@ -255,12 +258,12 @@ def test_random_vocab_tokens(model: LibLlamaModel, tokenizer: PreTrainedTokenize
             text.append("".join(tokens) + sep)
         text = "".join(text)
 
-        ids1 = model.tokenize(text, parse_special=True)
-        ids2 = tokenizer.encode(text)
+        ids1 = model.tokenize(text, add_special=False, parse_special=False)
+        ids2 = tokenizer.encode(text, add_special_tokens=False)
         assert(ids1 == ids2)
 
 
-def test_random_bytes(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations=100):
+def test_random_bytes(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations = 100):
 
     WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
 
@@ -280,8 +283,8 @@ def test_random_bytes(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase,
             text.append("".join(word))
         text = "".join(text)
 
-        ids1 = model.tokenize(text, parse_special=True)
-        ids2 = tokenizer.encode(text)
+        ids1 = model.tokenize(text, add_special=False, parse_special=False)
+        ids2 = tokenizer.encode(text, add_special_tokens=False)
         assert(ids1 == ids2)
 
 
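All four randomized tests now share the same comparison step: tokenize with llama.cpp without adding or parsing special tokens, encode with Hugging Face without adding special tokens, and require identical ids. A hedged sketch of that shared pattern (the helper name is mine; model is the script's LibLlamaModel wrapper, tokenizer a PreTrainedTokenizerBase):

    def assert_same_tokenization(model, tokenizer, text: str) -> None:
        # llama.cpp side: no BOS/EOS insertion; special-token strings stay plain text
        ids1 = model.tokenize(text, add_special=False, parse_special=False)
        # Hugging Face side: likewise suppress automatic BOS/EOS insertion
        ids2 = tokenizer.encode(text, add_special_tokens=False)
        if ids1 != ids2:
            i = find_first_mismatch(ids1, ids2)  # helper fixed in this commit
            raise AssertionError(f"tokenizations diverge at token index {i}")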
