
Commit eb8b326

tests : add test-tokenizer-1.py
1 parent e4324cb commit eb8b326

2 files changed: +90 −0 lines changed


tests/test-tokenizer-0.py

Lines changed: 7 additions & 0 deletions
@@ -1,3 +1,5 @@
+# tests with SPM tokenizer
+
 import os
 import sys
 import argparse
@@ -70,6 +72,11 @@
         print("%7d," % x, end='')
     print(" }, },")

+print(tokenizer.encode('hello'))
+print(tokenizer.encode('world'))
+print(tokenizer.encode(' world'))
+print(tokenizer.encode('hello world'))
+
 fname_tok = args.fname_tok
 if fname_tok:
     print('tokenizing file: ', fname_tok)
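
A quick aside on the four probes added above: with an SPM (SentencePiece) tokenizer, encoding a string piecewise is not guaranteed to match encoding it whole, since BOS insertion and the leading-space marker are applied per encode() call. A minimal sketch of that comparison, assuming the same tokenizer object the script loads (not part of the commit):

# Sketch (assumption, not part of the commit): piecewise vs. whole-string
# encoding with the SPM tokenizer loaded by this script. The two id lists
# may differ because special tokens and leading-space handling are per call.
ids_piecewise = tokenizer.encode('hello') + tokenizer.encode(' world')
ids_whole = tokenizer.encode('hello world')
print('piecewise:', ids_piecewise)
print('whole:    ', ids_whole)
print('equal:', ids_piecewise == ids_whole)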

tests/test-tokenizer-1.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+# tests with BPE tokenizer
+
+import os
+import sys
+import argparse
+
+from transformers import AutoTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize")
+args = parser.parse_args()
+
+dir_tokenizer = args.dir_tokenizer
+
+tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
+
+tests = [
+        "",
+        " ",
+        "  ",
+        "   ",
+        "\t",
+        "\n",
+        "\t\n",
+        "Hello world",
+        " Hello world",
+        "Hello World",
+        " Hello World",
+        " Hello World!",
+        "Hello, world!",
+        " Hello, world!",
+        " this is 🦙.cpp",
+        "w048 7tuijk dsdfhu",
+        "нещо на Български",
+        "កាន់តែពិសេសអាចខលចេញ",
+        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+        "Hello",
+        " Hello",
+        "  Hello",
+        "   Hello",
+        "    Hello",
+        "    Hello\n    Hello",
+    ]
+
+for text in tests:
+    print('text: ', text)
+    print(tokenizer.encode(text))
+    print(tokenizer.decode(tokenizer.encode(text)))
+
+print("\n\ntests for C++:\n")
+for text in tests:
+    res = tokenizer.encode(text)
+
+    k = text.replace('\n', '\\n')
+    k = k.replace('\t', '\\t')
+    k = '"' + k + '"'
+    print("{ %-24s, { " % k, end='')
+    for x in res:
+        print("%7d," % x, end='')
+    print(" }, },")
+
+print(tokenizer.encode('hello'))
+print(tokenizer.encode('world'))
+print(tokenizer.encode(' world'))
+print(tokenizer.encode('hello world'))
+
+fname_tok = args.fname_tok
+if fname_tok:
+    print('tokenizing file: ', fname_tok)
+    fname_out = fname_tok + '.tok'
+    with open(fname_tok, 'r') as f:
+        lines = f.readlines()
+        s = ''.join(lines)
+        res = tokenizer.encode(s)
+        # write to file
+        with open(fname_out, 'w') as f:
+            for x in res:
+                f.write(str(x) + ' ')
+            f.write('\n')
+        print('len(res): ', len(res))
+        print('len(lines): ', len(lines))
+        print('results written to: ', fname_out)
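
For reference, the --fname-tok branch above writes the token ids as a single line of space-separated integers to '<file>.tok'. A hypothetical reader for that format (the helper name is an assumption, not part of the commit):

# Hypothetical helper: parse a '<file>.tok' file produced by the
# --fname-tok path (one line of space-separated integer token ids).
def read_tok_file(path):
    with open(path) as f:
        return [int(tok) for tok in f.read().split()]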
