@@ -382,14 +382,14 @@ struct whisper_vocab {
382
382
383
383
id token_eot = 50256 ;
384
384
id token_sot = 50257 ;
385
+ id token_solm = 50359 ; // ?? TODO@Akash - rename appropriately
385
386
id token_prev = 50360 ;
386
- id token_solm = 50361 ; // ??
387
387
id token_not = 50362 ; // no timestamps
388
- id token_beg = 50363 ;
388
+ id token_beg = 50363 ; // begin timestamps
389
389
390
390
// available tasks
391
- static const id token_translate = 50358 ;
392
- static const id token_transcribe = 50359 ;
391
+ static const id token_translate = 50358 ; // TODO@Akash - technically it's 50357 for .en models
392
+ static const id token_transcribe = 50359 ; // TODO@Akash - technically it's 50358 for .en models
393
393
394
394
bool is_multilingual () const {
395
395
return n_vocab == 51865 ;
@@ -3521,7 +3521,7 @@ static void whisper_process_logits(
3521
3521
3522
3522
// suppress sot token (solm is intentionally no longer suppressed: it is emitted to mark speaker turns)
3523
3523
logits[vocab.token_sot ] = -INFINITY;
3524
- logits[vocab.token_solm ] = -INFINITY;
3524
+ // logits[vocab.token_solm] = -INFINITY;
3525
3525
3526
3526
// suppress task tokens
3527
3527
logits[vocab.token_translate ] = -INFINITY;
@@ -4500,7 +4500,6 @@ int whisper_full_with_state(
4500
4500
prompt_past.push_back (tokens_cur[i].id );
4501
4501
}
4502
4502
4503
- // store the text from this iteration
4504
4503
if (!tokens_cur.empty () && ctx->model .n_loaded > 0 ) {
4505
4504
int i0 = 0 ;
4506
4505
auto t0 = seek + 2 *(tokens_cur.front ().tid - whisper_token_beg (ctx));
@@ -4517,6 +4516,10 @@ int whisper_full_with_state(
4517
4516
text += whisper_token_to_str (ctx, tokens_cur[i].id );
4518
4517
}
4519
4518
4519
+ if (tokens_cur[i].id == whisper_token_solm (ctx)){
4520
+ text += " [SPEAKER TURN]" ;
4521
+ };
4522
+
4520
4523
if (tokens_cur[i].id > whisper_token_beg (ctx) && !params.single_segment ) {
4521
4524
const auto t1 = seek + 2 *(tokens_cur[i].tid - whisper_token_beg (ctx));
4522
4525
0 commit comments