@@ -133,18 +133,20 @@ def make_tensors_list() -> List[str]:
 @dataclass
 class Params:
     n_vocab: int
+    n_vocab_sp: int
     n_embd: int
     n_mult: int
     n_head: int
     n_layer: int
     file_type: GGMLFileType
 
     @staticmethod
-    def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
+    def guessed(model: 'LazyModel', vocab: 'Vocab', file_type: GGMLFileType) -> 'Params':
         n_vocab, n_embd = model["tok_embeddings.weight"].shape
 
         return Params(
             n_vocab=n_vocab,
+            n_vocab_sp=vocab.vocab_special_size,
             n_embd=n_embd,
             n_mult=256,
             n_head=n_embd // 128,
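
The guess above fills everything except `n_vocab_sp` from the shape of `tok_embeddings.weight`; the special-token count has to come from the vocab, which is why `guessed()` gains a `vocab` parameter. A worked example of the arithmetic, with assumed numbers for a standard 7B LLaMA checkpoint (illustrative, not taken from this patch):

```python
# Assumed 7B LLaMA checkpoint: tok_embeddings.weight has shape (32000, 4096), so
#   n_vocab = 32000, n_embd = 4096, n_head = 4096 // 128 = 32, n_mult = 256
# n_vocab_sp is whatever the loaded vocab reports, e.g. two mapped control tokens
# (bos/eos) plus any added tokens -> vocab.vocab_special_size == 2
```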
@@ -154,7 +156,7 @@ def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
 
 
 class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], fname_special_tokens: Optional[Path]) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: Dict[str, int]
         if fname_added_tokens is not None:
@@ -172,13 +174,24 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
         self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
+        special_tokens: Dict[str, Dict[str, Any]]
+        if fname_special_tokens is not None:
+            special_tokens = json.load(open(fname_special_tokens))
+        else:
+            special_tokens = {}
+        token_name_to_id = {"unk_token": self.sentencepiece_tokenizer.unk_id(), "bos_token": self.sentencepiece_tokenizer.bos_id(), "eos_token": self.sentencepiece_tokenizer.eos_id(), "pad_token": self.sentencepiece_tokenizer.pad_id()}
+        self.special_tokens_map = {token_name_to_id[token_name]: info["content"] if isinstance(info, dict) else info for token_name, info in special_tokens.items() if token_name in token_name_to_id and token_name_to_id[token_name] != -1}
+        self.vocab_special_size: int = len(self.added_tokens_list) + len(self.special_tokens_map)
 
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
+        special_tokens = [tokenizer.bos_id(), tokenizer.eos_id(), tokenizer.pad_id()]
         for i in range(tokenizer.vocab_size()):
             text: bytes
             if tokenizer.is_unknown(i):
-                text = " \u2047 ".encode("utf-8")
+                text = self.special_tokens_map.get(i, " \u2047 ").encode("utf-8")
+            elif i in special_tokens:
+                text = self.special_tokens_map.get(i, "").encode("utf-8")
             elif tokenizer.is_control(i):
                 text = b""
             elif tokenizer.is_byte(i):
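
The long dict comprehension in `__init__` turns token *names* from the JSON file into sentencepiece token *ids*, keeping only names it recognizes and ids the tokenizer actually defines (LLaMA's `pad_id()` is -1, so a pad entry is dropped). A minimal, self-contained sketch of that mapping, with an assumed `special_tokens_map.json`-style payload:

```python
import json

# Assumed example payload in the shape of a special_tokens_map.json (or the
# corresponding tokenizer_config.json entries); values may be plain strings or
# dicts that carry the text under "content".
special_tokens = json.loads('''{
  "bos_token": "<s>",
  "eos_token": {"content": "</s>", "lstrip": false, "rstrip": false},
  "pad_token": "[PAD]"
}''')

# Stand-ins for the sentencepiece ids; LLaMA's tokenizer reports unk=0, bos=1,
# eos=2 and pad_id() == -1 (i.e. no pad token).
token_name_to_id = {"unk_token": 0, "bos_token": 1, "eos_token": 2, "pad_token": -1}

special_tokens_map = {token_name_to_id[name]: info["content"] if isinstance(info, dict) else info
                      for name, info in special_tokens.items()
                      if name in token_name_to_id and token_name_to_id[name] != -1}

print(special_tokens_map)  # {1: '<s>', 2: '</s>'} -- the pad entry is dropped (id -1)
```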
@@ -201,18 +214,29 @@ def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.sentencepiece_tokens()
         yield from self.added_tokens()
 
+    def all_special_tokens(self) -> Iterable[int]:
+        for token_id in self.special_tokens_map.keys():
+            yield token_id
+        for i in range(len(self.added_tokens_list)):
+            yield self.vocab_size_base + i
+
     def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
 class GGMLVocab:
     def __init__(self, tokens: List[Tuple[bytes, float]]):
         self.tokens = tokens
+        self.special_tokens = []
         self.vocab_size = len(tokens)
+        self.vocab_special_size = 0
 
     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         return self.tokens
 
+    def all_special_tokens(self) -> Iterable[int]:
+        return self.special_tokens
+
     def __repr__(self) -> str:
         return f"<GGMLVocab with {self.vocab_size} tokens>"
 
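
`all_special_tokens()` yields ids in two groups: the sentencepiece ids collected in `special_tokens_map`, then the added tokens, which are numbered immediately after the base vocabulary. Illustrative numbers (assumed, not from this patch):

```python
# Assumed example: base vocab of 32000, bos/eos mapped to ids 1 and 2, two added tokens.
# all_special_tokens() would then yield:
#   1, 2          # ids taken from special_tokens_map
#   32000, 32001  # added tokens, counted up from vocab_size_base
# and vocab_special_size == 2 + 2 == 4, matching the n_vocab_sp written to the header.
```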
@@ -923,8 +947,9 @@ def __init__(self, fname_out: Path) -> None:
     def write_file_header(self, params: Params) -> None:
         self.fout.write(b"ggjt"[::-1])  # magic
         values = [
-            1,  # file version
+            4,  # file version
             params.n_vocab,
+            params.n_vocab_sp,
             params.n_embd,
             params.n_mult,
             params.n_head,
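
The header now carries the special-token count directly after `n_vocab`, and the version bump to 4 lets older loaders reject the new layout early. A reader-side sketch of just the fields visible in this hunk, assuming the `values` list is packed as consecutive 32-bit little-endian ints as in the rest of this writer; the trailing fields are outside the hunk and not read here:

```python
import struct

def read_partial_header(fp):
    """Sketch: parse the magic and the first header fields shown in this hunk."""
    if fp.read(4) != b"ggjt"[::-1]:
        raise ValueError("not a ggjt file")
    # version, n_vocab, n_vocab_sp, n_embd, n_mult, n_head; later fields not read here
    version, n_vocab, n_vocab_sp, n_embd, n_mult, n_head = struct.unpack("<6i", fp.read(24))
    if version != 4:
        raise ValueError(f"unsupported file version {version}")
    return {"n_vocab": n_vocab, "n_vocab_sp": n_vocab_sp, "n_embd": n_embd,
            "n_mult": n_mult, "n_head": n_head}
```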
@@ -946,11 +971,13 @@ def write_vocab(self, vocab: Vocab) -> None:
             self.fout.write(struct.pack("i", len(text)))
             self.fout.write(text)
             self.fout.write(struct.pack("f", score))
+        for token_id in vocab.all_special_tokens():
+            self.fout.write(struct.pack("i", token_id))
 
     @staticmethod
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
+        params = Params(n_vocab=vocab.vocab_size, n_vocab_sp=vocab.vocab_special_size, n_embd=0, n_mult=0,
                         n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
         of = OutputFile(fname_out)
         of.write_file_header(params)
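
`write_vocab()` keeps its existing per-token records (an int32 length, the raw token bytes, a float32 score) and then appends one int32 id per special token; `n_vocab_sp` from the header says how many to expect. A complementary reader sketch under the same packing assumptions:

```python
import struct

def read_vocab(fp, n_vocab, n_vocab_sp):
    """Sketch: read n_vocab (length, text, score) records, then the special ids."""
    tokens = []
    for _ in range(n_vocab):
        (length,) = struct.unpack("<i", fp.read(4))
        text = fp.read(length)
        (score,) = struct.unpack("<f", fp.read(4))
        tokens.append((text, score))
    special_ids = list(struct.unpack(f"<{n_vocab_sp}i", fp.read(4 * n_vocab_sp)))
    return tokens, special_ids
```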
@@ -1103,8 +1130,10 @@ def load_vocab(path: Path) -> SentencePieceVocab:
             f"Could not find tokenizer.model in {path} or its parent; "
             "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
+    special_tokens_path = path.parent / "special_tokens_map.json"
+    tokenizer_config_path = path.parent / "tokenizer_config.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None, special_tokens_path if special_tokens_path.exists() else tokenizer_config_path if tokenizer_config_path.exists() else None)
 
 
 def default_outfile(model_paths: List[Path], params: Params) -> Path:
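
The chained conditional passed to `SentencePieceVocab` encodes a preference order: `special_tokens_map.json` if present, otherwise `tokenizer_config.json`, otherwise no special-token file at all. The same order spelled out as a small helper (a sketch only; the helper name is made up here):

```python
from pathlib import Path
from typing import Optional

def pick_special_tokens_file(vocab_dir: Path) -> Optional[Path]:
    """Sketch of the preference order above: special_tokens_map.json first,
    then tokenizer_config.json, otherwise nothing."""
    for name in ("special_tokens_map.json", "tokenizer_config.json"):
        candidate = vocab_dir / name
        if candidate.exists():
            return candidate
    return None
```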
@@ -1168,7 +1197,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         model = do_necessary_conversions(model)
         output_type = pick_output_type(model, args.outtype)
         model = convert_to_output_type(model, output_type)
-        params = Params.guessed(model, output_type)
+        params = Params.guessed(model, vocab, output_type)
         outfile = args.outfile or default_outfile(model_plus.paths, params)
         OutputFile.write_all(outfile, params, model, vocab)
         print(f"Wrote {outfile}")