-
Notifications
You must be signed in to change notification settings - Fork 5k
Significantly improve whisper.cpp inference quality #1148
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
4767223
38eaeff
2dd6884
4ebe450
6f445d1
7f690dd
527d7c6
f3e7774
95be6dc
bd1dbd1
2c49c9b
5df242c
e40ec27
715bf61
3fe41d5
36b0df7
444b59a
308f490
252f807
0a5f435
65fd0e1
386ef32
241df86
22d348c
590a12e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2396,12 +2396,10 @@ static void fft(const std::vector<float> & in, std::vector<float> & out) { | |
| even.reserve(N/2); | ||
| odd.reserve(N/2); | ||
|
|
||
| for (int i = 0; i < N; i++) { | ||
| if (i % 2 == 0) { | ||
| even.push_back(in[i]); | ||
| } else { | ||
| odd.push_back(in[i]); | ||
| } | ||
| // | ||
| for (int i = 0; i < N; i+=2) { | ||
| even.push_back(in[i]); | ||
| odd.push_back(in[i + 1]); | ||
| } | ||
|
|
||
| std::vector<float> even_fft; | ||
|
|
@@ -2424,40 +2422,45 @@ static void fft(const std::vector<float> & in, std::vector<float> & out) { | |
|
|
||
| out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd; | ||
| out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd; | ||
|
|
||
| } | ||
| } | ||
|
|
||
| static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float> &hann, const float *samples, | ||
| int n_samples, int fft_size, int fft_step, int n_threads, | ||
| const whisper_filters &filters, bool speed_up, whisper_mel &mel) { | ||
| std::vector<float> fft_in(fft_size, 0.0); | ||
| std::vector<float> fft_out(2 * fft_size); | ||
| int n_fft = 1 + (speed_up ? fft_size / 4 : fft_size / 2); | ||
|
|
||
| for (int i = ith; i < mel.n_len; i += n_threads) { | ||
| std::vector<float> fft_in(fft_size, 0.0); | ||
| std::vector<float> fft_out(2 * fft_size); | ||
| const int offset = i * fft_step; | ||
|
|
||
| // apply Hanning window | ||
| for (int j = 0; j < fft_size; j++) { | ||
| if (offset + j < n_samples) { | ||
| fft_in[j] = hann[j] * samples[offset + j]; | ||
| } else { | ||
| fft_in[j] = 0.0; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| // FFT -> mag^2 | ||
| fft(fft_in, fft_out); | ||
|
|
||
| // Calculate modulus of complex numbers | ||
| for (int j = 0; j < fft_size; j++) { | ||
| fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); | ||
| } | ||
| for (int j = 1; j < fft_size / 2; j++) { | ||
| fft_out[j] += fft_out[fft_size - j]; | ||
|
|
||
| // The frequency spectrum produced by real input data is symmetrical around the Nyquist frequency. | ||
| // This is where the actual issue lies | ||
| for (int j = 0; j < fft_size / 2; j++) { | ||
| fft_out[j] = (fft_out[fft_size - j - 1] + fft_out[j + 1]) / 2; | ||
| } | ||
|
||
|
|
||
| if (speed_up) { | ||
| // scale down in the frequency domain results in a speed up in the time domain | ||
| // scale down in the frequency domain results in a speed-up in the time domain | ||
| for (int j = 0; j < n_fft; j++) { | ||
| fft_out[j] = 0.5 * (fft_out[2 * j] + fft_out[2 * j + 1]); | ||
| } | ||
|
|
@@ -2471,10 +2474,10 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float> | |
| int k = 0; | ||
| for (k = 0; k < n_fft - 3; k += 4) { | ||
| sum += | ||
| fft_out[k + 0] * filters.data[j*n_fft + k + 0] + | ||
| fft_out[k + 1] * filters.data[j*n_fft + k + 1] + | ||
| fft_out[k + 2] * filters.data[j*n_fft + k + 2] + | ||
| fft_out[k + 3] * filters.data[j*n_fft + k + 3]; | ||
| fft_out[k + 0] * filters.data[j * n_fft + k + 0] + | ||
| fft_out[k + 1] * filters.data[j * n_fft + k + 1] + | ||
| fft_out[k + 2] * filters.data[j * n_fft + k + 2] + | ||
| fft_out[k + 3] * filters.data[j * n_fft + k + 3]; | ||
| } | ||
|
|
||
| // handle n_fft remainder | ||
|
|
@@ -3634,7 +3637,7 @@ static void whisper_process_logits( | |
| WHISPER_ASSERT(n_logits == ctx.vocab.n_vocab); | ||
|
|
||
| // extract the logits for the last token | ||
| // we will be mutating and therefore we don't want to use the ctx.logits buffer directly | ||
| // we will be mutating, and therefore we don't want to use the ctx.logits buffer directly | ||
| auto & probs = decoder.probs; | ||
| auto & logits = decoder.logits; | ||
| auto & logprobs = decoder.logprobs; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is the benefit of moving these vectors inside the loop?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Firstly, moving `std::vector<float> fft_in(fft_size, 0.0);` inside the loop can take advantage of the built-in padding feature (see "Fastest way to reset every value of std::vector to 0") when assigning vectors, which theoretically should be faster than manually using a for loop. This section of code could theoretically be optimized further by removing the if statement. The reason for moving `std::vector<float> fft_out(2 * fft_size);` is that during debugging, I found that `fft_out` would continuously grow in length after the FFT computation, far exceeding the theoretical length of 800. Sorry... I've conducted further testing, and it appears that my initial reasons for moving them into the loop were not correct.
This comment was marked as off-topic.
Sorry, something went wrong.
Uh oh!
There was an error while loading. Please reload this page.