@@ -96,7 +96,9 @@ def find_first_mismatch(ids1: list[int], ids2: list[int]):
     for i, (a, b) in enumerate(zip(ids1, ids2)):
         if a != b:
             return i
-    return -1 if len(ids1) == len(ids2) else i
+    if len(ids1) == len(ids2):
+        return -1
+    return min(len(ids1), len(ids2))
 
 
 def test_custom_texts(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase):
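A quick illustration of the changed return convention in the hunk above: the helper still returns the index of the first differing token, but when the shorter sequence is a strict prefix of the longer one it now returns the shorter length instead of a stale loop index. A standalone sketch (the sample ID lists are invented):

def find_first_mismatch(ids1: list[int], ids2: list[int]):
    for i, (a, b) in enumerate(zip(ids1, ids2)):
        if a != b:
            return i                          # first differing position
    if len(ids1) == len(ids2):
        return -1                             # sequences are identical
    return min(len(ids1), len(ids2))          # one sequence is a strict prefix of the other

assert find_first_mismatch([1, 2, 3], [1, 2, 3]) == -1
assert find_first_mismatch([1, 2, 3], [1, 9, 3]) == 1
assert find_first_mismatch([1, 2], [1, 2, 3]) == 2  # the old one-liner returned 1 (the last zipped index) here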
@@ -152,11 +154,12 @@ def test_custom_texts(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase):
         'a 〇b',  # unicode_ranges_digit, 0x3007
         'Ⅵ-a',   # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
         '\uFEFF//',  # unicode_ranges_control, 0xFEFF (BOM)
+        '<s>a'  # TODO: Phi-3 fail
     ]
 
     for text in tests + more_tests:
-        ids1 = model.tokenize(text, parse_special=True)
-        ids2 = tokenizer.encode(text)
+        ids1 = model.tokenize(text, add_special=False, parse_special=False)
+        ids2 = tokenizer.encode(text, add_special_tokens=False)
         logger.info(repr(text))
         if ids1 != ids2:
             logger.info(" TokenIDs: " + str(list(ids1)))
@@ -165,7 +168,7 @@ def test_custom_texts(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase):
             raise Exception()
 
 
-def test_random_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations=100):
+def test_random_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations = 100):
 
     WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
     CHARS = list(set("""
@@ -192,12 +195,12 @@ def test_random_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase,
             text.append("".join(word) + space)
         text = "".join(text)
 
-        ids1 = model.tokenize(text, parse_special=True)
-        ids2 = tokenizer.encode(text)
+        ids1 = model.tokenize(text, add_special=False, parse_special=False)
+        ids2 = tokenizer.encode(text, add_special_tokens=False)
         assert(ids1 == ids2)
 
 
-def test_random_vocab_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations=100):
+def test_random_vocab_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations = 100):
 
     logger.info("Building vocab char list ...")
     vocab_ids = list(tokenizer.vocab.values())
@@ -215,8 +218,8 @@ def test_random_vocab_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizer
         text = rand.choices(vocab_chars, k=1024)
         text = "".join(text)
 
-        ids1 = model.tokenize(text, parse_special=True)
-        ids2 = tokenizer.encode(text)
+        ids1 = model.tokenize(text, add_special=False, parse_special=False)
+        ids2 = tokenizer.encode(text, add_special_tokens=False)
         assert(ids1 == ids2)
 
 
@@ -255,12 +258,12 @@ def test_random_vocab_tokens(model: LibLlamaModel, tokenizer: PreTrainedTokenize
             text.append("".join(tokens) + sep)
         text = "".join(text)
 
-        ids1 = model.tokenize(text, parse_special=True)
-        ids2 = tokenizer.encode(text)
+        ids1 = model.tokenize(text, add_special=False, parse_special=False)
+        ids2 = tokenizer.encode(text, add_special_tokens=False)
         assert(ids1 == ids2)
 
 
-def test_random_bytes(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations=100):
+def test_random_bytes(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations = 100):
 
     WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
 
@@ -280,8 +283,8 @@ def test_random_bytes(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase,
             text.append("".join(word))
         text = "".join(text)
 
-        ids1 = model.tokenize(text, parse_special=True)
-        ids2 = tokenizer.encode(text)
+        ids1 = model.tokenize(text, add_special=False, parse_special=False)
+        ids2 = tokenizer.encode(text, add_special_tokens=False)
         assert(ids1 == ids2)
 
 
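The change repeated across all of the test functions above puts the two tokenizers on equal footing: llama.cpp is asked to neither add nor parse special tokens, and the Hugging Face tokenizer is asked not to add them either, so only the raw encodings are compared. A minimal sketch of that comparison pattern, reusing the names from the diff (the `compare` wrapper itself is illustrative, not part of the file):

def compare(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, text: str) -> int:
    # llama.cpp side: no BOS/EOS inserted, and a literal '<s>' in the text stays plain text
    ids1 = model.tokenize(text, add_special=False, parse_special=False)
    # transformers side: likewise, no special tokens added around the encoding
    ids2 = tokenizer.encode(text, add_special_tokens=False)
    return find_first_mismatch(ids1, ids2)  # -1 means both encodings agree

With parse_special=False, the newly added '<s>a' case exercises whether a literal '<s>' in user text is tokenized as ordinary characters rather than being parsed into the BOS token; the TODO notes that Phi-3 currently fails it.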