Skip to content

Commit a0ec3fa

Browse files
Server : Add support for .vtt format to Whisper server (#1578)
- The code comes from examples/main
- The output MIME type is set to text/vtt

Example usage:

```shell
curl 127.0.0.1:8080/inference \
  -H "Content-Type: multipart/form-data" \
  -F file="@samples/jfk.wav" \
  -F temperature="0.2" \
  -F response-format="vtt"
```
1 parent 6559b53 commit a0ec3fa

File tree

1 file changed

+23
-0
lines changed

1 file changed

+23
-0
lines changed

examples/server/server.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,29 @@ int main(int argc, char ** argv) {
678678
ss << speaker << text << "\n\n";
679679
}
680680
res.set_content(ss.str(), "application/x-subrip");
681+
} else if (params.response_format == vtt_format) {
682+
std::stringstream ss;
683+
684+
ss << "WEBVTT\n\n";
685+
686+
const int n_segments = whisper_full_n_segments(ctx);
687+
for (int i = 0; i < n_segments; ++i) {
688+
const char * text = whisper_full_get_segment_text(ctx, i);
689+
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
690+
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
691+
std::string speaker = "";
692+
693+
if (params.diarize && pcmf32s.size() == 2)
694+
{
695+
speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
696+
speaker.insert(0, "<v Speaker");
697+
speaker.append(">");
698+
}
699+
700+
ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
701+
ss << speaker << text << "\n\n";
702+
}
703+
res.set_content(ss.str(), "text/vtt");
681704
}
682705
// TODO add more output formats
683706
else

0 commit comments

Comments
 (0)