@@ -10,6 +10,7 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <cinttypes>
 #include <fstream>
 #include <mutex>
 #include <random>
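
The `<cinttypes>` header added above provides the `PRId64` macro used by the updated log message in the final hunk. The substantive change throughout this patch is widening `n_vocab` from `int` to `int64_t`, so that row-offset arithmetic over large logits buffers happens in 64 bits. A minimal sketch of the failure mode this avoids, with hypothetical sizes not taken from the patch:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int n_vocab = 256000; // hypothetical large vocabulary
        const int i       = 16384;  // hypothetical row index into a logits buffer

        // In 32-bit arithmetic, 16384 * 256000 = 4194304000 exceeds INT_MAX,
        // so `i * n_vocab` overflows (undefined behaviour for signed int).
        // With one operand widened, the whole multiply is done in 64 bits:
        const int64_t offset = (int64_t) i * n_vocab;

        printf("offset = %" PRId64 "\n", offset); // prints 4194304000
        return 0;
    }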
@@ -103,7 +104,7 @@ static std::vector<float> softmax(const std::vector<float>& logits) {
     return probs;
 }
 
-static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
+static results_log_softmax log_softmax(int64_t n_vocab, const float * logits, int tok) {
     float max_logit = logits[0];
     for (int i = 1; i < n_vocab; ++i) {
         max_logit = std::max(max_logit, logits[i]);
@@ -122,7 +123,7 @@ static inline int nearest_int(float fval) {
     return (i & 0x007fffff) - 0x00400000;
 }
 
-static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
+static double log_softmax(int64_t n_vocab, const float * logits, uint16_t * log_prob, int tok) {
     float max_logit = logits[0];
     float min_logit = logits[0];
     for (int i = 1; i < n_vocab; ++i) {
@@ -153,7 +154,7 @@ static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
 }
 
 static void process_logits(
-    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
+    int64_t n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
     double & nll, double & nll2, float * logit_history, float * prob_history
 ) {
     std::mutex mutex;
@@ -187,7 +188,7 @@ static void process_logits(
     }
 }
 
-static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token,
+static void process_logits(std::ostream& out, int64_t n_vocab, const float * logits, const int * tokens, int n_token,
     std::vector<std::thread> & workers, std::vector<uint16_t> & log_probs, double & nll, double & nll2) {
     std::mutex mutex;
     const int nv = 2*((n_vocab + 1)/2) + 4;
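
For context on the unchanged `nv` computation above: `2*((n_vocab + 1)/2)` rounds the vocabulary size up to an even number of `uint16_t` slots, and the `+ 4` reserves four extra slots per row. For example, `n_vocab = 32001` gives `nv = 2*16001 + 4 = 32006`, while `n_vocab = 32000` gives `nv = 32004`. With `n_vocab` now `int64_t`, the intermediate arithmetic is 64-bit and the result is narrowed back to `int` for the per-row size.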
@@ -234,7 +235,7 @@ struct kl_divergence_result {
     size_t count = 0.0;
 };
 
-static std::pair<double, float> log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
+static std::pair<double, float> log_softmax(int64_t n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
     float max_logit = logits[0];
     int imax = 0;
     for (int i = 1; i < n_vocab; ++i) {
@@ -281,7 +282,9 @@ static std::pair<double, float> log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
     kld.sum_kld += sum;
     kld.sum_kld2 += sum*sum;
     ++kld.count;
-    if (imax == imax_base) ++kld.n_same_top;
+    if (imax == imax_base) {
+        ++kld.n_same_top;
+    }
 
     const float p_base = expf(-nll_base);
     const float p = expf(-nll);
@@ -295,7 +298,7 @@ static std::pair<double, float> log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
     return std::make_pair(sum, p_diff);
 }
 
-static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
+static void process_logits(int64_t n_vocab, const float * logits, const int * tokens, int n_token,
     std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
     float * kld_values, float * p_diff_values) {
     std::mutex mutex;
@@ -383,9 +386,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
     const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
 
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_batch = params.n_batch;
 
+    const int64_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     int count = 0;
     double nll = 0.0;
     double nll2 = 0.0;
@@ -521,9 +525,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
     const int n_chunk_max = tokens.size() / n_ctx;
 
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_batch = params.n_batch;
 
+    const int64_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     int count = 0;
     double nll = 0.0;
     double nll2 = 0.0;
@@ -723,7 +728,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
 
 #define K_TOKEN_CHUNK 4
 
-static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
+static void compute_logprobs(const float * batch_logits, int64_t n_vocab, std::vector<std::thread>& workers,
     const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
     if (eval_results.size() != eval_pairs.size()) {
         eval_results.resize(eval_pairs.size());
@@ -877,10 +882,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
     double acc = 0.0f;
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
+    const int64_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     const int max_tasks_per_batch = 32;
     const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
@@ -1158,10 +1164,11 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
     LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
+    const int64_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     const int max_tasks_per_batch = 128;
     const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
@@ -1509,10 +1516,11 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
 
     LOG("\ntask\tacc_norm\n");
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
+    const int64_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     const int max_tasks_per_batch = 32;
     const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
@@ -1709,15 +1717,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
             __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
     }
 
-    int n_vocab, n_chunk;
+    int64_t n_vocab;
+    int64_t n_chunk;
     in.read((char *)&n_vocab, sizeof(n_vocab));
     in.read((char *)&n_chunk, sizeof(n_chunk));
     if (in.fail()) {
         LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
         return;
     }
     if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
-        LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+        LOG_ERR("%s: inconsistent vocabulary (%" PRId64 " vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
     }
 
     std::vector<llama_token> tokens(n_ctx * n_chunk);
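
A consequence of the widened header fields worth noting: `in.read(..., sizeof(n_vocab))` now consumes 8 bytes per field, so the logits file must have been written with matching 64-bit values; a file written with the old 32-bit layout would be misread here. A minimal sketch of a compatible header read, assuming a hypothetical file name:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>
    #include <fstream>

    int main() {
        std::ifstream in("logits.bin", std::ios::binary); // hypothetical path
        int64_t n_vocab = 0;
        int64_t n_chunk = 0;
        in.read((char *) &n_vocab, sizeof(n_vocab)); // 8 bytes
        in.read((char *) &n_chunk, sizeof(n_chunk)); // 8 bytes
        if (in.fail()) {
            fprintf(stderr, "failed to read header\n");
            return 1;
        }
        // PRId64 from <cinttypes> is the portable printf format for int64_t.
        printf("n_vocab = %" PRId64 ", n_chunk = %" PRId64 "\n", n_vocab, n_chunk);
        return 0;
    }

Note also the last context line: with `n_chunk` declared `int64_t`, the product `n_ctx * n_chunk` used to size the `tokens` vector is promoted to 64-bit arithmetic as well.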