@@ -169,7 +169,7 @@ static void process_logits(
                 break;
             }
             lock.unlock();
-            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
+            const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]);
             const double v = -results.log_softmax;
             local_nll += v;
             local_nll2 += v*v;
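These offsets multiply two `int` values before the result is added to a pointer. With a large vocabulary and many token positions, `i*n_vocab` can exceed `INT_MAX`, and signed 32-bit overflow is undefined behaviour; casting one operand to `size_t` first keeps the product in 64 bits on 64-bit builds. A standalone sketch of the failure mode (hypothetical values, not from the patch):

// Illustration of the overflow that size_t(i)*n_vocab avoids.
// 30000 * 152064 = 4,561,920,000, which fits in neither int nor unsigned,
// so a 32-bit multiply wraps (and the signed version is UB outright).
#include <cstddef>
#include <cstdio>

int main() {
    const int i       = 30000;   // token position into a long logit buffer
    const int n_vocab = 152064;  // vocabulary size of a large model

    const unsigned wrapped = (unsigned)i * (unsigned)n_vocab; // wraps modulo 2^32
    const size_t   widened = size_t(i) * n_vocab;             // full 64-bit product

    std::printf("32-bit product: %u\n",  wrapped); // 266952704  -> wrong offset
    std::printf("64-bit product: %zu\n", widened); // 4561920000 -> correct offset
    return 0;
}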
@@ -203,7 +203,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
                 break;
             }
             lock.unlock();
-            const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
+            const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
             local_nll += v;
             local_nll2 += v*v;
         }
@@ -281,7 +281,9 @@ static std::pair<double, float> log_softmax(int n_vocab, const float * logits, c
     kld.sum_kld  += sum;
     kld.sum_kld2 += sum*sum;
     ++kld.count;
-    if (imax == imax_base) ++kld.n_same_top;
+    if (imax == imax_base) {
+        ++kld.n_same_top;
+    }
 
     const float p_base = expf(-nll_base);
     const float p = expf(-nll);
@@ -323,7 +325,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
                 break;
             }
             lock.unlock();
-            std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+            std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
             kld_values[i]    = (float)v.first;
             p_diff_values[i] = v.second;
         }
@@ -383,9 +385,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
 
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     int count = 0;
     double nll = 0.0;
 
@@ -424,8 +427,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
                 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
             }
 
-            const auto batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            const auto * batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
 
             if (j == 0) {
                 tokens[batch_start] = token_org;
@@ -447,11 +450,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 
         //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
         for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
-
             // Calculate probability of next token, given the previous ones.
             const std::vector<float> tok_logits(
-                logits.begin() + (j + 0) * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
+                logits.begin() + size_t(j + 0) * n_vocab,
+                logits.begin() + size_t(j + 1) * n_vocab);
 
             const float prob = softmax(tok_logits)[tokens[start + j + 1]];
             logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
@@ -521,9 +523,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     const int n_chunk_max = tokens.size() / n_ctx;
 
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     int count = 0;
     double nll = 0.0;
     double nll2 = 0.0;
@@ -538,7 +541,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 
     std::vector<float> logits;
     if (num_batches > 1) {
-        logits.reserve((size_t)n_ctx * n_vocab);
+        logits.reserve(size_t(n_ctx) * n_vocab);
     }
 
     LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
@@ -620,7 +623,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 
             if (num_batches > 1 && n_outputs > 0) {
                 const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab);
             }
         }
 
@@ -661,7 +664,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         } else {
             double av = nll/count;
             double av2 = nll2/count - av*av;
-            if (av2 > 0) av2 = sqrt(av2/(count-1));
+            if (av2 > 0) {
+                av2 = sqrt(av2/(count-1));
+            }
             LOG("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
         }
     }
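For context, the block above folds the running sums `nll` and `nll2` into a mean negative log-likelihood and an uncertainty estimate for the intermediate report, and `exp(nll/count)` is the perplexity that gets printed. A standalone sketch of the same arithmetic (sample values made up, not from the patch), showing that `sqrt((nll2/count - av*av)/(count-1))` is the standard error of the mean:

// Reproduces the statistics printed above with hypothetical per-token NLLs.
#include <cmath>
#include <cstdio>

int main() {
    const double samples[] = {2.1, 1.9, 2.4, 2.0, 2.2}; // per-token negative log-likelihoods
    double nll = 0.0, nll2 = 0.0;
    int count = 0;
    for (double v : samples) { nll += v; nll2 += v*v; ++count; }

    const double av = nll/count;      // mean NLL; exp(av) is the perplexity
    double av2 = nll2/count - av*av;  // biased variance of the samples
    if (av2 > 0) {
        av2 = std::sqrt(av2/(count-1)); // standard error of the mean NLL
    }
    std::printf("ppl = %.4lf, mean nll = %.4lf +/- %.4lf\n", std::exp(av), av, av2);
    return 0;
}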
@@ -686,10 +691,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     return {tokens, ppl, logit_history, prob_history};
 }
 
-static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
+static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
     int prev_outputs = 0;
-    for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
-        const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+    for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
+        const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);
 
         llama_batch batch_view = {
             n_tokens,
@@ -713,7 +718,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
            n_outputs += batch_view.logits[i] != 0;
         }
 
-        memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float));
+        memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));
 
         prev_outputs += n_outputs;
     }
@@ -728,19 +733,23 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
     if (eval_results.size() != eval_pairs.size()) {
         eval_results.resize(eval_pairs.size());
     }
-    if (eval_pairs.empty()) return;
+    if (eval_pairs.empty()) {
+        return;
+    }
 
     size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
 
     std::atomic<int> counter(0);
     auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
         float local_logprobs[K_TOKEN_CHUNK];
         while (true) {
-            size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
-            if (first >= eval_results.size()) break;
-            size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
+            const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
+            if (first >= eval_results.size()) {
+                break;
+            }
+            const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
             for (size_t i = first; i < last; ++i) {
-                auto logits = batch_logits + eval_pairs[i].first * n_vocab;
+                const auto * logits = batch_logits + eval_pairs[i].first * n_vocab;
                 float max_logit = logits[0];
                 for (int j = 1; j < n_vocab; ++j) {
                     max_logit = std::max(max_logit, logits[j]);
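As background for this hunk: `compute_logprobs` distributes work through a shared atomic counter, and each worker claims the next `K_TOKEN_CHUNK` pairs with `fetch_add` until the list is exhausted. A self-contained sketch of that dispatch pattern (hypothetical names and a trivial payload standing in for the real log-prob computation):

// Chunked work dispatch via an atomic counter, as used in compute_logprobs().
#include <algorithm>
#include <atomic>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    constexpr size_t CHUNK = 4;       // stand-in for K_TOKEN_CHUNK
    std::vector<int> results(18, 0);  // stand-in for eval_results

    std::atomic<size_t> counter(0);
    auto compute = [&]() {
        while (true) {
            const size_t first = counter.fetch_add(CHUNK, std::memory_order_relaxed);
            if (first >= results.size()) {
                break;                // nothing left to claim
            }
            const size_t last = std::min(first + CHUNK, results.size());
            for (size_t i = first; i < last; ++i) {
                results[i] = int(i) + 1; // trivial payload instead of log_softmax work
            }
        }
    };

    std::vector<std::thread> workers;
    for (int t = 0; t < 3; ++t) {
        workers.emplace_back(compute);
    }
    for (auto & w : workers) {
        w.join();
    }

    for (int v : results) {
        std::printf("%d ", v);
    }
    std::printf("\n");
    return 0;
}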
@@ -877,18 +886,19 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
     double acc = 0.0f;
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     const int max_tasks_per_batch = 32;
     const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
     llama_batch batch = llama_batch_init(n_ctx, 0, 4);
 
     std::vector<float> tok_logits(n_vocab);
     // TODO: this could be made smaller; it's currently the worst-case size
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
 
     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;
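The `batch_logits` allocation has the same pitfall in a different spot: the element count `n_vocab*n_ctx` is computed in `int` before it is converted to the vector's `size_type`, so a big vocabulary times a long context can overflow before the constructor ever sees it. Casting one factor to `size_t` performs the multiplication in 64 bits. A small sketch with made-up sizes (not from the patch):

// Why the element count must be computed in size_t:
// 32768 * 152064 = 4,982,833,152, which does not fit in 32 bits.
#include <cstddef>
#include <cstdio>

int main() {
    const int n_ctx   = 32768;
    const int n_vocab = 152064;

    // Done in int, the product would overflow before reaching the constructor;
    // computing it in size_t gives the intended element count.
    const size_t n_elems = size_t(n_ctx) * n_vocab;
    std::printf("elements: %zu (%.1f GiB of float)\n",
                n_elems, n_elems * sizeof(float) / (1024.0*1024.0*1024.0));

    // std::vector<float> batch_logits(n_elems); // ~18.6 GiB -- too large to allocate in this demo
    return 0;
}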
@@ -975,7 +985,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         auto & hs_cur = hs_data[i];
 
         // get the logits of the last token of the common prefix
-        std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float));
+        std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float));
 
         const auto first_probs = softmax(tok_logits);
 
@@ -1158,18 +1168,19 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
     LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     const int max_tasks_per_batch = 128;
     const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
     llama_batch batch = llama_batch_init(n_ctx, 0, 2);
 
     std::vector<float> tok_logits(n_vocab);
     // TODO: this could be made smaller; it's currently the worst-case size
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
 
     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;
@@ -1509,17 +1520,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
     LOG("\ntask\tacc_norm\n");
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     const int max_tasks_per_batch = 32;
     const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
 
     std::vector<float> tok_logits(n_vocab);
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
 
     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;
@@ -1627,7 +1639,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         //LOG("\ncommon_prefix: %zu\n", cur_task.common_prefix);
 
         // get the logits of the last token of the common prefix
-        std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
+        std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float));
 
         const auto first_probs = softmax(tok_logits);
 
@@ -1709,7 +1721,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
                 __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
     }
 
-    int n_vocab, n_chunk;
+    int n_vocab;
+    int n_chunk;
     in.read((char *)&n_vocab, sizeof(n_vocab));
     in.read((char *)&n_chunk, sizeof(n_chunk));
     if (in.fail()) {
@@ -1720,7 +1733,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
     }
 
-    std::vector<llama_token> tokens(n_ctx * n_chunk);
+    std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
     if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
         LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
         return;
@@ -1737,7 +1750,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
     std::vector<float> logits;
     if (num_batches > 1) {
-        logits.reserve(n_ctx * n_vocab);
+        logits.reserve(size_t(n_ctx) * n_vocab);
     }
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@@ -1801,7 +1814,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
             if (num_batches > 1) {
                 const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
             }
         }
 
@@ -1822,7 +1835,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
         const int first = n_ctx/2;
         const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+        process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                 workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
         p_diff_ptr += n_ctx - 1 - first;
         kld_ptr += n_ctx - 1 - first;