Commit fce48ca

ftgreat and ldwang authored
convert.py : support bpe tokenizer (#2228)
* support bpe tokenizer in convert

Signed-off-by: ldwang <[email protected]>

* support bpe tokenizer in convert

Signed-off-by: ldwang <[email protected]>

* support bpe tokenizer in convert, fix

Signed-off-by: ldwang <[email protected]>

---------

Signed-off-by: ldwang <[email protected]>
Co-authored-by: ldwang <[email protected]>
1 parent 875086b commit fce48ca

File tree

1 file changed (+46, -23)

convert.py

Lines changed: 46 additions & 23 deletions
@@ -234,14 +234,21 @@ def load(model_plus: 'ModelPlus') -> 'Params':
 
 
 class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
+        self.vocabtype = vocabtype
+        if self.vocabtype == "bpe":
+          self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
+        else:
+          self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: Dict[str, int]
         if fname_added_tokens is not None:
             added_tokens = json.load(open(fname_added_tokens))
         else:
             added_tokens = {}
-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        if self.vocabtype == "bpe":
+          vocab_size: int = len(self.sentencepiece_tokenizer)
+        else:
+          vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
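In the "bpe" branch above, self.sentencepiece_tokenizer is presumably just the parsed vocab.json rather than a SentencePieceProcessor, i.e. a plain dict mapping token strings to integer ids in the GPT-2 / Hugging Face style, which is why len() stands in for vocab_size(). A minimal sketch of that assumption (the sample contents are made up for illustration):

import json

# Toy stand-in for a Hugging Face-style vocab.json: token string -> integer id.
# A real file holds the model's full BPE vocabulary.
vocab_json = '{"hello": 0, "\u0120world": 1}'
vocab = json.loads(vocab_json)

vocab_size = len(vocab)   # the bpe branch uses len() in place of
print(vocab_size)         # SentencePieceProcessor.vocab_size(); prints 2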
@@ -255,22 +262,32 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
 
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
+        if self.vocabtype == "bpe":
+          from transformers.models.gpt2 import tokenization_gpt2
+          byte_encoder = tokenization_gpt2.bytes_to_unicode()
+          byte_decoder = {v: k for k, v in byte_encoder.items()}
+          for i, item in enumerate(tokenizer):
             text: bytes
-            if tokenizer.is_unknown(i):
-                text = " \u2047 ".encode("utf-8")
-            elif tokenizer.is_control(i):
-                text = b""
-            elif tokenizer.is_byte(i):
-                piece = tokenizer.id_to_piece(i)
-                if len(piece) != 6:
-                    raise Exception(f"Invalid token: {piece}")
-                byte_value = int(piece[3:-1], 16)
-                text = struct.pack("B", byte_value)
-            else:
-                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-            score: float = tokenizer.get_score(i)
+            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
+            score: float = -i
             yield text, score
+        else:
+          for i in range(tokenizer.vocab_size()):
+            text: bytes
+            if tokenizer.is_unknown(i):
+                text = " \u2047 ".encode("utf-8")
+            elif tokenizer.is_control(i):
+                text = b""
+            elif tokenizer.is_byte(i):
+                piece = tokenizer.id_to_piece(i)
+                if len(piece) != 6:
+                    raise Exception(f"Invalid token: {piece}")
+                byte_value = int(piece[3:-1], 16)
+                text = struct.pack("B", byte_value)
+            else:
+                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            score: float = tokenizer.get_score(i)
+            yield text, score
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
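The bpe branch above recovers each token's raw bytes by inverting the GPT-2 byte-to-unicode table from transformers, and it assigns a descending pseudo-score of -i because a plain vocab.json carries no scores. A small round-trip sketch of that mapping (requires the transformers package; the sample token is illustrative, not taken from the patch):

from transformers.models.gpt2 import tokenization_gpt2

# bytes_to_unicode(): byte value (0-255) -> printable unicode character.
byte_encoder = tokenization_gpt2.bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}  # invert: character -> byte value

token = "\u0120hello"  # "Ġhello": GPT-2 vocabularies store a leading space (0x20) as "Ġ"
raw = b''.join(byte_decoder[ch].to_bytes(1, byteorder='big') for ch in token)
print(raw)  # b' hello'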
@@ -1196,14 +1213,18 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
     return {name: model[name] for name in TENSORS_LIST if name in model}
 
 
-def load_vocab(path: Path) -> SentencePieceVocab:
+def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
+    print(f"vocabtype: {vocabtype}")
     # Be extra-friendly and accept either a file or a directory.  Also, if it's
     # a directory, it might be the model directory, and tokenizer.model might
     # be in the parent of that.
     if path.is_dir():
-        path2 = path / "tokenizer.model"
+        vocab_file = "tokenizer.model"
+        if vocabtype == 'bpe':
+          vocab_file = "vocab.json"
+        path2 = path / vocab_file
         # Use `.parent` instead of /.. to handle the symlink case better.
-        path3 = path.parent / "tokenizer.model"
+        path3 = path.parent / vocab_file
         if path2.exists():
             path = path2
         elif path3.exists():
@@ -1214,7 +1235,8 @@ def load_vocab(path: Path) -> SentencePieceVocab:
                 "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
+                              vocabtype)
 
 
 def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
@@ -1252,14 +1274,15 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path,
                         help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
     args = parser.parse_args(args_in)
 
     vocab: Vocab
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
         do_dump_model(model_plus)
     elif args.vocab_only:
-        vocab = load_vocab(args.vocab_dir or args.model)
+        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
         assert args.outfile, "need --outfile if using --vocab-only"
         outfile = args.outfile
         OutputFile.write_vocab_only(outfile, vocab)
@@ -1273,7 +1296,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         vocab = model_plus.vocab
     else:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-        vocab = load_vocab(vocab_dir)
+        vocab = load_vocab(vocab_dir, args.vocabtype)
     params = Params.load(model_plus)
     model = model_plus.model
     model = do_necessary_conversions(model, params)
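Taken together, the changes let the converter read a GPT-2-style BPE vocabulary by passing the new flag. A hypothetical invocation through main(), which accepts an argument list; the model directory and output name below are placeholders, not paths from the patch:

import convert

convert.main([
    "models/my-bpe-model",              # directory containing the model and its vocab.json
    "--vocabtype", "bpe",               # select the new BPE path (default is "spm")
    "--outfile", "ggml-model-f16.bin",  # optional; the default is derived from the input
])

The equivalent command line would presumably be: python convert.py models/my-bpe-model --vocabtype bpe --outfile ggml-model-f16.bin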
