@@ -603,6 +603,8 @@ struct whisper_context {
603
603
// [EXPERIMENTAL] speed-up techniques
604
604
int32_t exp_n_audio_ctx; // 0 - use default
605
605
606
+ std::vector<float > audio_embd;
607
+
606
608
void use_buf (struct ggml_context * ctx, int i) {
607
609
#if defined(WHISPER_USE_SCRATCH)
608
610
size_t last_size = 0 ;
@@ -1723,17 +1725,35 @@ static bool whisper_encode(
1723
1725
}
1724
1726
1725
1727
// cur
1728
+ // {
1729
+ // printf("ne0 = %d\n", cur->ne[0]);
1730
+ // printf("ne1 = %d\n", cur->ne[1]);
1731
+ // for (int i = 0; i < 10; ++i) {
1732
+ // printf("%8.4f ", ((float *)(cur->data))[i]);
1733
+ // }
1734
+ // printf("... ");
1735
+ // for (int i = cur->ne[0] - 10; i < cur->ne[0]; ++i) {
1736
+ // printf("%8.4f ", ((float *)(cur->data))[i]);
1737
+ // }
1738
+ // printf("\n");
1739
+ // }
1740
+
1726
1741
{
1727
- // printf("ne0 = %d\n", cur->ne[0]);
1728
- // printf("ne1 = %d\n", cur->ne[1]);
1729
- // for (int i = 0; i < 10; ++i) {
1730
- // printf("%8.4f ", ((float *)(cur->data))[i]);
1731
- // }
1732
- // printf("... ");
1733
- // for (int i = cur->ne[0] - 10; i < cur->ne[0]; ++i) {
1734
- // printf("%8.4f ", ((float *)(cur->data))[i]);
1735
- // }
1736
- // printf("\n");
1742
+ // const int i0 = std::min(mel_offset, mel_inp.n_len);
1743
+ // const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
1744
+ const int i0 = 0 ;
1745
+ const int i1 = cur->ne [1 ];
1746
+
1747
+ // printf("i0 = %d, i1 = %d, (i1 - i0) = %d, embd size = %d\n", i0, i1, i1 - i0, cur->ne[0]);
1748
+
1749
+ wctx.audio_embd .clear ();
1750
+ wctx.audio_embd .resize (cur->ne [0 ], 0 .0f );
1751
+ for (int j = 0 ; j < cur->ne [0 ]; ++j) {
1752
+ for (int i = i0; i < i1; ++i) {
1753
+ wctx.audio_embd [j] += ((float *)(cur->data ))[(i - i0)*cur->ne [0 ] + j];
1754
+ }
1755
+ wctx.audio_embd [j] /= (i1 - i0);
1756
+ }
1737
1757
}
1738
1758
1739
1759
// pre-compute cross-attention memory
@@ -4838,6 +4858,28 @@ void whisper_full_cluster_segments(struct whisper_context * ctx) {
4838
4858
const int n_state = ctx->model .hparams .n_audio_state ;
4839
4859
const int n_layer = ctx->model .hparams .n_audio_layer ;
4840
4860
4861
+ #if 1
4862
+ // use the last layer of the encoder
4863
+ {
4864
+ std::vector<float > embd (n_segments*n_state);
4865
+
4866
+ for (int i = 0 ; i < n_segments; ++i) {
4867
+ const auto & segment_i = ctx->result_all [i];
4868
+ printf (" %s: segment %3d: t0 = %7d, t1 = %7d, text = %s\n " , __func__, i, (int ) segment_i.t0 , (int ) segment_i.t1 , segment_i.text .c_str ());
4869
+
4870
+ ctx->mel .n_len = segment_i.t1 ;
4871
+ whisper_encode (*ctx, segment_i.t0 , 7 , true );
4872
+
4873
+ for (int j = 0 ; j < n_state; ++j) {
4874
+ embd[i*n_state + j] = ctx->audio_embd [j];
4875
+ }
4876
+ }
4877
+
4878
+ const int n_features = std::min (4 , n_segments);
4879
+
4880
+ ggml_svd_reduce_dims (n_state, n_segments, embd.data (), n_features);
4881
+ #else
4882
+ // use cross kv cache of various layers
4841
4883
for (int il = 0 ; il < n_layer; ++il) {
4842
4884
std::vector<float > embd (n_segments*n_ctx*n_state);
4843
4885
@@ -4856,9 +4898,10 @@ void whisper_full_cluster_segments(struct whisper_context * ctx) {
4856
4898
}
4857
4899
}
4858
4900
4859
- const int n_features = 64 ;
4901
+ const int n_features = std::min ( 4 , n_segments) ;
4860
4902
4861
4903
ggml_svd_reduce_dims (n_ctx*n_state, n_segments, embd.data (), n_features);
4904
+ #endif
4862
4905
4863
4906
std::vector<std::vector<float >> features (n_segments);
4864
4907
@@ -4927,32 +4970,59 @@ void whisper_full_cluster_segments(struct whisper_context * ctx) {
4927
4970
for (int l = 0 ; l < n_clusters; ++l) {
4928
4971
// sum += std::pow(whisper_distance(features[j], centroids[k])/whisper_distance(features[j], centroids[l]), 2.0/(2.0 - 1.0));
4929
4972
4930
- // use the euclidean distance
4931
4973
double d0 = 0.0 ;
4932
- for (int m = 0 ; m < n_features; ++m) {
4933
- d0 += std::pow (features[j][m] - centroids[k][m], 2.0 );
4934
- }
4935
- d0 = std::sqrt (d0);
4936
-
4937
4974
double d1 = 0.0 ;
4938
- for (int m = 0 ; m < n_features; ++m) {
4939
- d1 += std::pow (features[j][m] - centroids[l][m], 2.0 );
4940
- }
4941
- d1 = std::sqrt (d1);
4942
4975
4943
- if (d1 == 0.0 ) {
4944
- sum += 1.0 ;
4945
- } else {
4946
- sum += std::pow (d0/d1, 2.0 /(1.10 - 1.0 ));
4976
+ // use the euclidean distance
4977
+ {
4978
+ for (int m = 0 ; m < n_features; ++m) {
4979
+ d0 += std::pow (features[j][m] - centroids[k][m], 2.0 );
4980
+ }
4981
+ d0 = std::sqrt (d0);
4982
+
4983
+ for (int m = 0 ; m < n_features; ++m) {
4984
+ d1 += std::pow (features[j][m] - centroids[l][m], 2.0 );
4985
+ }
4986
+ d1 = std::sqrt (d1);
4947
4987
}
4988
+
4989
+ // use the cosine distance
4990
+ // {
4991
+ // double dot = 0.0;
4992
+ // double norm0 = 0.0;
4993
+ // double norm1 = 0.0;
4994
+
4995
+ // for (int m = 0; m < n_features; ++m) {
4996
+ // dot += features[j][m]*centroids[k][m];
4997
+ // norm0 += std::pow(features[j][m], 2.0);
4998
+ // norm1 += std::pow(centroids[k][m], 2.0);
4999
+ // }
5000
+
5001
+ // d0 = 1.0 - dot/(std::sqrt(norm0)*std::sqrt(norm1));
5002
+
5003
+ // dot = 0.0;
5004
+ // norm0 = 0.0;
5005
+ // norm1 = 0.0;
5006
+
5007
+ // for (int m = 0; m < n_features; ++m) {
5008
+ // dot += features[j][m]*centroids[l][m];
5009
+ // norm0 += std::pow(features[j][m], 2.0);
5010
+ // norm1 += std::pow(centroids[l][m], 2.0);
5011
+ // }
5012
+
5013
+ // d1 = 1.0 - dot/(std::sqrt(norm0)*std::sqrt(norm1));
5014
+ // }
5015
+
5016
+ sum += std::pow (d0/d1, 2.0 /(1.15 - 1.0 ));
4948
5017
}
4949
5018
4950
- membership[j][k] = 1.0 /sum;
5019
+ membership[j][k] = sum == 0.0 ? 0.0 : 1.0 /sum;
4951
5020
}
4952
5021
}
4953
5022
4954
5023
// print the membership
4955
5024
if (i == niter - 1 ) {
5025
+ // {
4956
5026
for (int i = 0 ; i < n_segments; ++i) {
4957
5027
printf (" %s: membership %3d: " , __func__, i);
4958
5028
for (int j = 0 ; j < n_clusters; ++j) {
0 commit comments