Skip to content

Commit 6459cab

Browse files
authored
perplexity.cpp : fix hellaswag handling of prepended spaces
1 parent e4324cb commit 6459cab

File tree

1 file changed

+15
-1
lines changed

1 file changed

+15
-1
lines changed

examples/perplexity/perplexity.cpp

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -462,7 +462,13 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
462462
for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
463463

464464
// Tokenize the query
465-
query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
465+
// SPM tokenizer: Do not tokenize the starting space in the ending since it is always added by the tokenizer
466+
if (is_spm) {
467+
query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx].substr(1,hs_data[task_idx].ending[ending_idx].size()-1), false);
468+
} else {
469+
query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
470+
}
471+
466472
query_size = query_embd.size();
467473

468474
// Stop if query won't fit the ctx window
@@ -505,6 +511,14 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
505511
// task_idx,ending_idx,whole_size,context_size, hs_data[task_idx].ending_logprob_count[ending_idx], hs_data[task_idx].ending_logprob[ending_idx] );
506512
}
507513

514+
// TODO: Temporary check for NaNs until Falcon MMQ is solved
515+
for (size_t j = 0; j < 4; j++) {
516+
if (std::isnan(hs_data[task_idx].ending_logprob[j])) {
517+
printf("NAN in task, %zu ending %zu\n",task_idx, j);
518+
return;
519+
}
520+
}
521+
508522
// Find the ending with maximum logprob
509523
size_t ending_logprob_max_idx = 0;
510524
double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];

0 commit comments

Comments
 (0)