@@ -380,16 +380,17 @@ struct whisper_vocab {
380
380
std::map<token, id> token_to_id;
381
381
std::map<id, token> id_to_token;
382
382
383
- id token_eot = 50256 ;
384
- id token_sot = 50257 ;
385
- id token_solm = 50359 ; // ?? TODO@Akash - rename appropriately
386
- id token_prev = 50360 ;
387
- id token_not = 50362 ; // no timestamps
388
- id token_beg = 50363 ; // begin timestamps
389
-
390
- // available tasks
391
- static const id token_translate = 50358 ; // TODO@Akash - technically it's 50357 for .en models
392
- static const id token_transcribe = 50359 ; // TODO@Akash - technically it's 50358 for .en models
383
+ // reference: https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L334-L349
384
+ id token_eot = 50256 ;
385
+ id token_sot = 50257 ;
386
+ // task tokens (used only for multilingual models)
387
+ id token_translate = 50357 ;
388
+ id token_transcribe = 50358 ;
389
+ // other special tokens
390
+ id token_solm = 50359 ; // ?? TODO@Akash - rename appropriately
391
+ id token_prev = 50360 ;
392
+ id token_not = 50362 ; // no timestamps
393
+ id token_beg = 50363 ; // begin timestamps
393
394
394
395
bool is_multilingual () const {
395
396
return n_vocab == 51865 ;
@@ -966,8 +967,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
966
967
if (vocab.is_multilingual ()) {
967
968
vocab.token_eot ++;
968
969
vocab.token_sot ++;
969
- vocab.token_prev ++;
970
+ vocab.token_translate ++;
971
+ vocab.token_transcribe ++;
970
972
vocab.token_solm ++;
973
+ vocab.token_prev ++;
971
974
vocab.token_not ++;
972
975
vocab.token_beg ++;
973
976
}
@@ -3228,12 +3231,12 @@ whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id) {
3228
3231
return whisper_token_sot (ctx) + 1 + lang_id;
3229
3232
}
3230
3233
3231
- whisper_token whisper_token_translate (void ) {
3232
- return whisper_vocab:: token_translate;
3234
+ whisper_token whisper_token_translate (struct whisper_context * ctx ) {
3235
+ return ctx-> vocab . token_translate ;
3233
3236
}
3234
3237
3235
- whisper_token whisper_token_transcribe (void ) {
3236
- return whisper_vocab:: token_transcribe;
3238
+ whisper_token whisper_token_transcribe (struct whisper_context * ctx ) {
3239
+ return ctx-> vocab . token_transcribe ;
3237
3240
}
3238
3241
3239
3242
void whisper_print_timings (struct whisper_context * ctx) {
@@ -4018,9 +4021,9 @@ int whisper_full_with_state(
4018
4021
state->lang_id = lang_id;
4019
4022
prompt_init.push_back (whisper_token_lang (ctx, lang_id));
4020
4023
if (params.translate ) {
4021
- prompt_init.push_back (whisper_token_translate ());
4024
+ prompt_init.push_back (whisper_token_translate (ctx ));
4022
4025
} else {
4023
- prompt_init.push_back (whisper_token_transcribe ());
4026
+ prompt_init.push_back (whisper_token_transcribe (ctx ));
4024
4027
}
4025
4028
}
4026
4029
0 commit comments