@@ -156,8 +156,9 @@ def guessed(model: 'LazyModel', vocab: 'Vocab', file_type: GGMLFileType) -> 'Par


 class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fname_special_tokens: Optional[Path]) -> None:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fname_special_tokens: Optional[Path], fname_tokenizer_config: Optional[Path]) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+
         added_tokens: Dict[str, int]
         if fname_added_tokens is not None:
             added_tokens = json.load(open(fname_added_tokens))
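The signature change above splits what used to be a single overloaded argument into two: special_tokens_map.json and tokenizer_config.json now arrive as separate optional paths. A minimal sketch of a call with the new signature; the directory and file names below are hypothetical examples, and a missing file would be passed as None:

# Sketch only: hypothetical paths illustrating the new four-argument constructor.
from pathlib import Path

model_dir = Path("models/7B")
vocab = SentencePieceVocab(
    model_dir / "tokenizer.model",          # SentencePiece model (required)
    model_dir / "added_tokens.json",        # or None if the file is absent
    model_dir / "special_tokens_map.json",  # or None if the file is absent
    model_dir / "tokenizer_config.json",    # new argument; or None if absent
)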
@@ -174,13 +175,40 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fn
         self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
-        special_tokens: Dict[str, Dict[str, Any]]
+        self.special_tokens_map: Dict[int, str] = {}
+
+        TOKEN_NAME_TO_ID: Dict[str, int] = {
+            "unk_token": self.sentencepiece_tokenizer.unk_id(),
+            "bos_token": self.sentencepiece_tokenizer.bos_id(),
+            "eos_token": self.sentencepiece_tokenizer.eos_id(),
+            "pad_token": self.sentencepiece_tokenizer.pad_id()
+        }
+
+        tokenizer_config: Dict[str, Any]
+        if fname_tokenizer_config is not None:
+            tokenizer_config = json.load(open(fname_tokenizer_config))
+        else:
+            tokenizer_config = {}
+        for key, value in tokenizer_config.items():
+            assert isinstance(value, dict) or isinstance(value, str)
+            if key not in TOKEN_NAME_TO_ID or TOKEN_NAME_TO_ID[key] == -1:
+                continue
+            self.special_tokens_map[TOKEN_NAME_TO_ID[key]] = value["content"] if isinstance(value, dict) else value
+
+        special_tokens: Dict[str, Any]
         if fname_special_tokens is not None:
             special_tokens = json.load(open(fname_special_tokens))
         else:
             special_tokens = {}
-        token_name_to_id = {"unk_token": self.sentencepiece_tokenizer.unk_id(), "bos_token": self.sentencepiece_tokenizer.bos_id(), "eos_token": self.sentencepiece_tokenizer.eos_id(), "pad_token": self.sentencepiece_tokenizer.pad_id()}
-        self.special_tokens_map = {token_name_to_id[token_name]: info["content"] if isinstance(info, dict) else info for token_name, info in special_tokens.items() if token_name in token_name_to_id and token_name_to_id[token_name] != -1}
+        for key, value in special_tokens.items():
+            assert isinstance(value, dict) or isinstance(value, str)
+            if key not in TOKEN_NAME_TO_ID:
+                continue
+            token_id = TOKEN_NAME_TO_ID[key]
+            if token_id == -1 or token_id in self.special_tokens_map:
+                continue
+            self.special_tokens_map[token_id] = value["content"] if isinstance(value, dict) else value
+
         self.vocab_special_size: int = len(self.added_tokens_list) + len(self.special_tokens_map)

     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
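The two loops replace the old dict comprehension and make the precedence explicit: entries from tokenizer_config.json are written first, entries from special_tokens_map.json only fill ids that are still unset, and any token whose SentencePiece id is -1 (undefined) is skipped in both passes. A worked illustration of that merge, assuming the usual LLaMA ids unk_id()=0, bos_id()=1, eos_id()=2 and an undefined pad_id() of -1; the token strings are hypothetical:

# Hypothetical inputs, assuming unk_id()=0, bos_id()=1, eos_id()=2, pad_id()=-1:
tokenizer_config = {"bos_token": {"content": "<s>"}, "pad_token": "[PAD]"}
special_tokens   = {"bos_token": "<s2>", "eos_token": "</s>", "unk_token": "<unk>"}

# After both loops, special_tokens_map would hold:
# {1: "<s>",     # bos from tokenizer_config.json ("<s2>" is skipped: id 1 already set)
#  2: "</s>",    # eos from special_tokens_map.json (not set by the config)
#  0: "<unk>"}   # unk from special_tokens_map.json
# "pad_token" is dropped in both passes because pad_id() == -1.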
@@ -1133,7 +1161,7 @@ def load_vocab(path: Path) -> SentencePieceVocab:
     special_tokens_path = path.parent / "special_tokens_map.json"
     tokenizer_config_path = path.parent / "tokenizer_config.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None, special_tokens_path if special_tokens_path.exists() else tokenizer_config_path if tokenizer_config_path.exists() else None)
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None, special_tokens_path if special_tokens_path.exists() else None, tokenizer_config_path if tokenizer_config_path.exists() else None)


 def default_outfile(model_paths: List[Path], params: Params) -> Path:
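Previously, tokenizer_config.json was only consulted as a fallback when special_tokens_map.json was missing, and it was fed through the same parameter; with this change load_vocab forwards both files independently, so the constructor can merge them with the precedence shown earlier. A hedged usage sketch; the model path is a hypothetical example:

# Sketch: load_vocab now picks up both JSON files next to the vocab file
# when they exist. "models/7B/tokenizer.model" is a hypothetical path.
vocab = load_vocab(Path("models/7B/tokenizer.model"))
print(vocab.vocab_special_size)  # added tokens + resolved special tokens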