From a0ec3fac5492be6f89e1010e0108e6f89cb81da0 Mon Sep 17 00:00:00 2001 From: Aleksander Andrzejewski <18704749+aleksanderandrzejewski@users.noreply.github.com> Date: Fri, 1 Dec 2023 00:44:26 +0100 Subject: [PATCH] Server : Add support for .vtt format to Whisper server (#1578) - The code comes from examples/main - The output mimetype is set to text/vtt Example usage: ```shell curl 127.0.0.1:8080/inference \ -H "Content-Type: multipart/form-data" \ -F file="@samples/jfk.wav" \ -F temperature="0.2" \ -F response-format="vtt" ``` --- examples/server/server.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2d15d4c..96f8608 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -678,6 +678,29 @@ int main(int argc, char ** argv) { ss << speaker << text << "\n\n"; } res.set_content(ss.str(), "application/x-subrip"); + } else if (params.response_format == vtt_format) { + std::stringstream ss; + + ss << "WEBVTT\n\n"; + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char * text = whisper_full_get_segment_text(ctx, i); + const int64_t t0 = whisper_full_get_segment_t0(ctx, i); + const int64_t t1 = whisper_full_get_segment_t1(ctx, i); + std::string speaker = ""; + + if (params.diarize && pcmf32s.size() == 2) + { + speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true); + speaker.insert(0, "<v Speaker"); + speaker.append(">"); + } + + ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n"; + ss << speaker << text << "\n\n"; + } + res.set_content(ss.str(), "text/vtt"); } // TODO add more output formats else