Skip to content

Commit 59ce853

Browse files
committed
test-tokenizer-random : reduce potential conflicts with #8379
* test-tokenizer-random : add a failing edge case for falcon
1 parent 1caa20f commit 59ce853

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

tests/test-tokenizer-random.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
232232
'a\na', # bert fail
233233
'"`', # falcon
234234
' \u2e4e', # falcon
235+
'\n\x0b ', # falcon
235236
'a\xa0\xa0\x00b', # jina-v2-es
236237
'one <mask>', # jina-v2-es <mask> lstrip=true
237238
'a </s> b', # rstrip phi-3
@@ -458,8 +459,8 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
458459
i = find_first_mismatch(ids1, ids2)
459460
ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
460461
ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
461-
logger.error(" Expected: " + str(ids1) + f" {[tokenizer1.decode([id]) for id in ids1]}")
462-
logger.error(" Result: " + str(ids2) + f" {[tokenizer2.decode([id]) for id in ids2]}")
462+
logger.error(" Expected: " + str(ids1))
463+
logger.error(" Result: " + str(ids2))
463464
encode_errors += 1
464465
logger.error(f" {encode_errors=}")
465466
if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):

0 commit comments

Comments
 (0)