sync : latest changes from whisper.cpp

ggerganov · ggerganov · commit 45fc4fed0b9f · 2023-01-07T09:39:12.000+02:00
diff --git a/examples/whisper/main.cpp b/examples/whisper/main.cpp
@@ -176,90 +176,81 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
 
     const int n_segments = whisper_full_n_segments(ctx);
 
+    std::string speaker = "";
+
+    int64_t t0;
+    int64_t t1;
+
     // print the last n_new segments
     const int s0 = n_segments - n_new;
+
     if (s0 == 0) {
         printf("\n");
     }
 
     for (int i = s0; i < n_segments; i++) {
-        if (params.no_timestamps) {
-            if (params.print_colors) {
-                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                    if (params.print_special == false) {
-                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
-                        if (id >= whisper_token_eot(ctx)) {
-                            continue;
-                        }
-                    }
-
-                    const char * text = whisper_full_get_token_text(ctx, i, j);
-                    const float  p    = whisper_full_get_token_p   (ctx, i, j);
+        if (!params.no_timestamps || params.diarize) {
+            t0 = whisper_full_get_segment_t0(ctx, i);
+            t1 = whisper_full_get_segment_t1(ctx, i);
+        }
 
-                    const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
+        if (!params.no_timestamps) {
+            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
+        }
 
-                    printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
-                }
-            } else {
-                const char * text = whisper_full_get_segment_text(ctx, i);
-                printf("%s", text);
-            }
-            fflush(stdout);
-        } else {
-            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+        if (params.diarize && pcmf32s.size() == 2) {
+            const int64_t n_samples = pcmf32s[0].size();
 
-            std::string speaker;
+            const int64_t is0 = timestamp_to_sample(t0, n_samples);
+            const int64_t is1 = timestamp_to_sample(t1, n_samples);
 
-            if (params.diarize && pcmf32s.size() == 2) {
-                const int64_t n_samples = pcmf32s[0].size();
+            double energy0 = 0.0f;
+            double energy1 = 0.0f;
 
-                const int64_t is0 = timestamp_to_sample(t0, n_samples);
-                const int64_t is1 = timestamp_to_sample(t1, n_samples);
+            for (int64_t j = is0; j < is1; j++) {
+                energy0 += fabs(pcmf32s[0][j]);
+                energy1 += fabs(pcmf32s[1][j]);
+            }
 
-                double energy0 = 0.0f;
-                double energy1 = 0.0f;
+            if (energy0 > 1.1*energy1) {
+                speaker = "(speaker 0)";
+            } else if (energy1 > 1.1*energy0) {
+                speaker = "(speaker 1)";
+            } else {
+                speaker = "(speaker ?)";
+            }
 
-                for (int64_t j = is0; j < is1; j++) {
-                    energy0 += fabs(pcmf32s[0][j]);
-                    energy1 += fabs(pcmf32s[1][j]);
-                }
+            //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
+        }
 
-                if (energy0 > 1.1*energy1) {
-                    speaker = "(speaker 0)";
-                } else if (energy1 > 1.1*energy0) {
-                    speaker = "(speaker 1)";
-                } else {
-                    speaker = "(speaker ?)";
+        if (params.print_colors) {
+            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
+                if (params.print_special == false) {
+                    const whisper_token id = whisper_full_get_token_id(ctx, i, j);
+                    if (id >= whisper_token_eot(ctx)) {
+                        continue;
+                    }
                 }
 
-                //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
-            }
-
-            if (params.print_colors) {
-                printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
-                for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                    if (params.print_special == false) {
-                        const whisper_token id = whisper_full_get_token_id(ctx, i, j);
-                        if (id >= whisper_token_eot(ctx)) {
-                            continue;
-                        }
-                    }
+                const char * text = whisper_full_get_token_text(ctx, i, j);
+                const float  p    = whisper_full_get_token_p   (ctx, i, j);
 
-                    const char * text = whisper_full_get_token_text(ctx, i, j);
-                    const float  p    = whisper_full_get_token_p   (ctx, i, j);
+                const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
 
-                    const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
+                printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
+            }
+        } else {
+            const char * text = whisper_full_get_segment_text(ctx, i);
 
-                    printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
-                }
-                printf("\n");
-            } else {
-                const char * text = whisper_full_get_segment_text(ctx, i);
+            printf("%s%s", speaker.c_str(), text);
+        }
 
-                printf("[%s --> %s]  %s%s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), speaker.c_str(), text);
-            }
+        // with timestamps or speakers: print each segment on a new line
+        if (!params.no_timestamps || params.diarize) {
+            printf("\n");
         }
+
+        fflush(stdout);
     }
 }
 
@@ -557,7 +548,7 @@ int main(int argc, char ** argv) {
             }
 
             if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
-                fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
+                fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", argv[0], fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
                 return 8;
             }
 
diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp
@@ -412,6 +412,8 @@ struct whisper_context {
     std::vector<uint8_t>   buf_compute;
     std::vector<uint8_t>   buf_compute_layer;
 
+    ggml_type wtype; // weight type (FP32 or FP16)
+
     whisper_model model;
     whisper_vocab vocab;
 
@@ -435,9 +437,8 @@ struct whisper_context {
 };
 
 template<typename T>
-static void read_safe(std::ifstream& fin, T& dest)
-{
-  fin.read((char*)& dest, sizeof(T));
+static void read_safe(std::ifstream& fin, T& dest) {
+    fin.read((char*)& dest, sizeof(T));
 }
 
 // load the model from a ggml file
@@ -630,7 +631,9 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
 
     // for the big tensors, we have the option to store the data in 16-bit floats
     // in order to save memory and also to speed up the computation
-    const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+    wctx.wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
+    const ggml_type wtype = wctx.wtype;
 
     size_t ctx_size = 0;
 
@@ -651,7 +654,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
 
         // encoder
         {
-            // TODO: F16 .. maybe not?
             ctx_size += n_audio_ctx*n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_pe;
 
             ctx_size += 3*n_mels*n_audio_state*ggml_type_size(wtype);         // e_conv_1_w
@@ -666,7 +668,6 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
 
         // decoder
         {
-            // TODO: F16 .. maybe not?
             ctx_size += n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // d_pe;
 
             ctx_size += n_vocab*n_text_state*ggml_type_size(wtype); // d_te;
@@ -983,8 +984,8 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
             const int n_mem      = n_text_layer*n_text_ctx;
             const int n_elements = n_text_state*n_mem;
 
-            model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-            model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+            model.memory_k = ggml_new_tensor_1d(ctx, wtype, n_elements);
+            model.memory_v = ggml_new_tensor_1d(ctx, wtype, n_elements);
         }
 
         // key/value memory for the cross-attention layer
@@ -994,8 +995,8 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
             const int n_mem      = n_text_layer*n_audio_ctx;
             const int n_elements = n_text_state*n_mem;
 
-            model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-            model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+            model.memory_cross_k = ggml_new_tensor_1d(ctx, wtype, n_elements);
+            model.memory_cross_v = ggml_new_tensor_1d(ctx, wtype, n_elements);
         }
 
         const size_t memory_size =
@@ -1241,14 +1242,14 @@ static bool whisper_encode(
                 ggml_permute(ctxL,
                         ggml_cpy(ctxL,
                             Qcur,
-                            ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
+                            ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_head, n_ctx)),
                         0, 2, 1, 3);
 
             struct ggml_tensor * K =
                 ggml_permute(ctxL,
                         ggml_cpy(ctxL,
                             Kcur,
-                            ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
+                            ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_head, n_ctx)),
                         0, 2, 1, 3);
 
             struct ggml_tensor * V =
@@ -1258,7 +1259,7 @@ static bool whisper_encode(
                                 Vcur,
                                 n_state/n_head, n_head, n_ctx),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_ctx, n_state/n_head, n_head)
+                        ggml_new_tensor_3d(ctxL, wctx.wtype, n_ctx, n_state/n_head, n_head)
                         );
 
             struct ggml_tensor * KQV = ggml_flash_attn(ctxL, Q, K, V, false);
@@ -1274,7 +1275,7 @@ static bool whisper_encode(
                 ggml_permute(ctxL,
                         ggml_cpy(ctxL,
                             Kcur,
-                            ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
+                            ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_head, n_ctx)),
                         0, 2, 1, 3);
 
             // K * Q
@@ -1292,7 +1293,7 @@ static bool whisper_encode(
             //    ggml_permute(ctxL,
             //            ggml_cpy(ctxL,
             //                Vcur,
-            //                ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
+            //                ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_head, n_ctx)),
             //            1, 2, 0, 3);
 
             //struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max);
@@ -1304,7 +1305,7 @@ static bool whisper_encode(
                                 Vcur,
                                 n_state/n_head, n_head, n_ctx),
                             0, 2, 1, 3),
-                        ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_ctx, n_head)
+                        ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_ctx, n_head)
                         );
 
             struct ggml_tensor * KQV = ggml_mul_mat(ctxL, ggml_transpose(ctxL, V), KQ_soft_max);
@@ -1349,7 +1350,7 @@ static bool whisper_encode(
 
 #ifdef USE_FLASH_FF
             cur = ggml_flash_ff(ctxL,
-                    ggml_cpy(ctxL, cur, ggml_new_tensor_2d(ctxL, GGML_TYPE_F16, n_state, N)),
+                    ggml_cpy(ctxL, cur, ggml_new_tensor_2d(ctxL, wctx.wtype, n_state, N)),
                     layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
 #else
             // fully connected
@@ -2473,12 +2474,12 @@ int whisper_lang_auto_detect(
     }
 
     {
-        for (int i = 0; i < (int) probs_id.size(); i++) {
+        for (const auto & prob : probs_id) {
             if (lang_probs) {
-                lang_probs[probs_id[i].second] = probs_id[i].first;
+                lang_probs[prob.second] = prob.first;
             }
 
-            //printf("%s: lang %2d (%3s): %f\n", __func__, probs_id[i].second, whisper_lang_str(probs_id[i].second), probs_id[i].first);
+            //printf("%s: lang %2d (%3s): %f\n", __func__, prob.second, whisper_lang_str(prob.second), prob.first);
         }
     }
 
@@ -2581,6 +2582,8 @@ const char * whisper_print_system_info(void) {
     s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
     s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
     s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
+    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
+    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
 
     return s.c_str();
 }
@@ -3157,7 +3160,7 @@ int whisper_full_parallel(
 
         // separate key + value memory for each processor
         {
-            auto & ctx = model.ctx_mem;
+            auto & mctx = model.ctx_mem;
 
             const auto & hparams = model.hparams;
 
@@ -3170,8 +3173,8 @@ int whisper_full_parallel(
                 const int n_mem      = n_text_layer*n_text_ctx;
                 const int n_elements = n_text_state*n_mem;
 
-                model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-                model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+                model.memory_k = ggml_new_tensor_1d(mctx, ctx->wtype, n_elements);
+                model.memory_v = ggml_new_tensor_1d(mctx, ctx->wtype, n_elements);
             }
 
             // key/value memory for the cross-attention layer
@@ -3181,8 +3184,8 @@ int whisper_full_parallel(
                 const int n_mem      = n_text_layer*n_audio_ctx;
                 const int n_elements = n_text_state*n_mem;
 
-                model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-                model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
+                model.memory_cross_k = ggml_new_tensor_1d(mctx, ctx->wtype, n_elements);
+                model.memory_cross_v = ggml_new_tensor_1d(mctx, ctx->wtype, n_elements);
             }
         }
     }
@@ -3226,17 +3229,17 @@ int whisper_full_parallel(
     for (int i = 0; i < n_processors - 1; ++i) {
         auto & results_i = ctxs[i].result_all;
 
-        for (int j = 0; j < (int) results_i.size(); ++j) {
+        for (auto & result : results_i) {
             // correct the segment timestamp taking into account the offset
-            results_i[j].t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
-            results_i[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
+            result.t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
+            result.t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
 
             // make sure that segments are not overlapping
             if (!ctx->result_all.empty()) {
-                results_i[j].t0 = std::max(results_i[j].t0, ctx->result_all.back().t1);
+                result.t0 = std::max(result.t0, ctx->result_all.back().t1);
             }
 
-            ctx->result_all.push_back(std::move(results_i[j]));
+            ctx->result_all.push_back(std::move(result));
 
             // call the new_segment_callback for each segment
             if (params.new_segment_callback) {
@@ -3331,18 +3334,18 @@ static int64_t sample_to_timestamp(int i_sample) {
 static float voice_length(const std::string & text) {
     float res = 0.0f;
 
-    for (size_t i = 0; i < text.size(); ++i) {
-        if (text[i] == ' ') {
+    for (char c : text) {
+        if (c == ' ') {
             res += 0.01f;
-        } else if (text[i] == ',') {
+        } else if (c == ',') {
             res += 2.00f;
-        } else if (text[i] == '.') {
+        } else if (c == '.') {
             res += 3.00f;
-        } else if (text[i] == '!') {
+        } else if (c == '!') {
             res += 3.00f;
-        } else if (text[i] == '?') {
+        } else if (c == '?') {
             res += 3.00f;
-        } else if (text[i] >= '0' && text[i] <= '9') {
+        } else if (c >= '0' && c <= '9') {
             res += 3.00f;
         } else {
             res += 1.00f;
diff --git a/examples/whisper/whisper.h b/examples/whisper/whisper.h
@@ -148,7 +148,7 @@ extern "C" {
             struct whisper_context * ctx,
                         const char * text,
                      whisper_token * tokens,
-	                           int   n_max_tokens);
+                               int   n_max_tokens);
 
     // Largest language id (i.e. number of available languages - 1)
     WHISPER_API int whisper_lang_max_id();
diff --git a/src/ggml.c b/src/ggml.c