From 2f2841bfcef68c2d06fe2b80bd79dabfd3351962 Mon Sep 17 00:00:00 2001 From: Karthick Date: Tue, 17 Dec 2024 22:37:08 +0530 Subject: [PATCH] whisper : add single-timestamp logic (#2629) * Fix hallucinations during silence When the predicted tokens end with a single timestamp, the entire 30-second segment should be considered as done, to avoid hallucinations for the remaining part of the segment. This behaviour is on par with OpenAI's whisper. Refer to the logic related to `single_timestamp_ending` in https://github.com/openai/whisper/blob/main/whisper/transcribe.py * Accept review comments related to formatting. Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- src/whisper.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index ddeecc5..810a8d2 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -6060,7 +6060,7 @@ int whisper_full_with_state( { const auto & best_decoder = state->decoders[best_decoder_id]; - const auto seek_delta = best_decoder.seek_delta; + auto seek_delta = best_decoder.seek_delta; const auto result_len = best_decoder.sequence.result_len; const auto & tokens_cur = best_decoder.sequence.tokens; @@ -6201,6 +6201,15 @@ int whisper_full_with_state( } } + // ref: https://github.com/ggerganov/whisper.cpp/pull/2629 + const bool single_timestamp_ending = tokens_cur.size() > 1 && + tokens_cur[tokens_cur.size() - 2].id < whisper_token_beg(ctx) && + tokens_cur[tokens_cur.size() - 1].id > whisper_token_beg(ctx); + if (single_timestamp_ending) { + WHISPER_LOG_DEBUG("single timestamp ending - skip entire chunk\n"); + seek_delta = std::min(seek_end - seek, WHISPER_CHUNK_SIZE * 100); + } + // update audio window seek += seek_delta;