
Commit d2d5254

perplexity binary support
1 parent 0f86ae9 commit d2d5254

File tree

7 files changed: +70 -26 lines changed


examples/common.cpp

Lines changed: 4 additions & 0 deletions
@@ -439,6 +439,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.logdir = argv[i];
+
+            if (params.logdir.back() != '/') {
+                params.logdir += "/";
+            }
         } else if (arg == "--perplexity") {
             params.perplexity = true;
         } else if (arg == "--hellaswag") {

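Normalizing the trailing slash once at parse time means every binary that writes a logfile can concatenate params.logdir with a filename directly; it also appears to be why the final mkdir in create_directory_with_parents (llama-util.h, below) becomes redundant, since a path ending in '/' has its last component created by the slash-by-slash loop. A minimal standalone sketch of the normalized behavior, using an illustrative directory name and timestamp (the real timestamp comes from get_sortable_timestamp()):

#include <cstdio>
#include <string>

int main() {
    std::string logdir = "logs";   // e.g. the value passed on the command line
    if (logdir.back() != '/') {
        logdir += "/";             // after this commit, done once in gpt_params_parse
    }
    // downstream code (main.cpp, perplexity.cpp) concatenates directly:
    const std::string logfile_path = logdir + "2023-01-01-000000" + ".yml"; // timestamp value is illustrative
    printf("%s\n", logfile_path.c_str());  // -> logs/2023-01-01-000000.yml
    return 0;
}
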
examples/main/main.cpp

Lines changed: 6 additions & 8 deletions
@@ -830,20 +830,18 @@ int main(int argc, char ** argv) {
     if (!params.logdir.empty()) {
         const std::string timestamp = get_sortable_timestamp();

-        std::string logdir = params.logdir;
-        const bool success = create_directory_with_parents(logdir);
+        const bool success = create_directory_with_parents(params.logdir);
         if (success) {
-            if (logdir.back() != '/') {
-                logdir += "/";
-            }

-            FILE * logfile = fopen((logdir + timestamp + ".yml").c_str(), "w");
+            FILE * logfile = fopen((params.logdir + timestamp + ".yml").c_str(), "w");
             fprintf(logfile, "binary: main\n");
             dump_non_result_info_yaml(logfile, params, timestamp, input_tokens);
-            llama_dump_result_info_yaml(logfile, ctx, output_ss.str().c_str(), output_tokens.data(), output_tokens.size());
+            llama_dump_result_info_yaml(
+                logfile, ctx, output_ss.str().c_str(), output_tokens.data(), output_tokens.size(), NULL, 0);
             fclose(logfile);
         } else {
-            fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", __func__, logdir.c_str());
+            fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+                __func__, params.logdir.c_str());
         }
     }
     if (ctx_guidance) { llama_free(ctx_guidance); }

examples/perplexity/perplexity.cpp

Lines changed: 33 additions & 4 deletions
@@ -1,10 +1,13 @@
 #include "common.h"
 #include "llama.h"
+#include "llama-util.h"
 #include "build-info.h"

 #include <cmath>
 #include <ctime>
 #include <sstream>
+#include <utility>
+#include <vector>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -26,12 +29,13 @@ std::vector<float> softmax(const std::vector<float>& logits) {
     return probs;
 }

-void perplexity(llama_context * ctx, const gpt_params & params) {
+std::pair<std::vector<llama_token>, std::vector<float>> perplexity(llama_context * ctx, const gpt_params & params) {
     // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
-    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<float> probs;

     const int n_chunk_max = tokens.size() / params.n_ctx;

@@ -68,7 +72,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {

         if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
-            return;
+            return std::make_pair(tokens, probs);
         }

         // restore the original token in case it was set to BOS
@@ -110,6 +114,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
                 logits.begin() + (j + 1) * n_vocab);

             const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+            probs.push_back(prob);

             nll += -std::log(prob);
             ++count;
@@ -119,6 +124,8 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
         fflush(stdout);
     }
     printf("\n");
+
+    return std::make_pair(tokens, probs);
 }

 void hellaswag_score(llama_context * ctx, const gpt_params & params) {
@@ -341,13 +348,35 @@ int main(int argc, char ** argv) {
             params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
     }

+    std::vector<llama_token> tokens;
+    std::vector<float> probs;
     if (params.hellaswag) {
         hellaswag_score(ctx, params);
     } else {
-        perplexity(ctx, params);
+        auto ret = perplexity(ctx, params);
+        tokens = ret.first;
+        probs = ret.second;
     }

     llama_print_timings(ctx);
+
+    if (!params.logdir.empty()) {
+        const std::string timestamp = get_sortable_timestamp();
+
+        const bool success = create_directory_with_parents(params.logdir);
+        if (success) {
+
+            FILE * logfile = fopen((params.logdir + timestamp + ".yml").c_str(), "w");
+            fprintf(logfile, "binary: perplexity\n");
+            dump_non_result_info_yaml(logfile, params, timestamp, tokens);
+            llama_dump_result_info_yaml(logfile, ctx, NULL, NULL, 0, probs.data(), probs.size());
+            fclose(logfile);
+        } else {
+            fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+                __func__, params.logdir.c_str());
+        }
+    }
+
     llama_free(ctx);
     llama_free_model(model);

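The probs vector returned by perplexity() holds the per-token probabilities that feed the running nll += -std::log(prob) accumulation above; logging them makes the final number reproducible offline. A minimal sketch of that reduction, using a hypothetical helper and stand-in values:

#include <cmath>
#include <cstdio>
#include <vector>

// Perplexity is the exponential of the mean negative log-likelihood over the scored tokens.
static double perplexity_from_probs(const std::vector<float> & probs) {
    double nll = 0.0;
    for (const float p : probs) {
        nll += -std::log(p);
    }
    return std::exp(nll / probs.size());
}

int main() {
    const std::vector<float> probs = {0.25f, 0.50f, 0.125f}; // stand-in values, not real model output
    printf("perplexity: %.4f\n", perplexity_from_probs(probs));
    return 0;
}
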
llama-util.h

Lines changed: 1 addition & 7 deletions
@@ -585,12 +585,6 @@ static bool create_directory_with_parents(const std::string & path) {
         pos_slash += 1;
     }

-    // finally, create the directory for the logs
-    const int ret = mkdir(path.c_str(), 0755);
-    if (ret != 0) {
-        return false;
-    }
-
     return true;
 }

@@ -612,7 +606,7 @@ static void dump_vector_int_yaml(FILE * stream, const char * prop_name, const st

 static void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data,
                                        const bool remove_first) {
-    std::string data_str(data);
+    std::string data_str(data == NULL ? "" : data);

     if (data_str.empty()) {
         fprintf(stream, "%s:\n", prop_name);

llama.cpp

Lines changed: 17 additions & 4 deletions
@@ -4399,8 +4399,10 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }

-void llama_dump_result_info_yaml(FILE * stream, const llama_context * ctx, const char * output_str,
-                                 const int * output_tokens, const int n_output_tokens) {
+void llama_dump_result_info_yaml(
+    FILE * stream, const llama_context * ctx, const char * output_str, const int * output_tokens,
+    const int n_output_tokens, const float * probs, const int n_probs) {
+
     fprintf(stream, "\n");
     fprintf(stream, "###########\n");
     fprintf(stream, "# Results #\n");
@@ -4422,8 +4424,19 @@ void llama_dump_result_info_yaml(FILE * stream, const llama_context * ctx, const
     fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample);
     dump_string_yaml_multiline(stream, "output", output_str, false);

-    const std::vector<int> output_token_vector(output_tokens, output_tokens + n_output_tokens);
-    dump_vector_int_yaml(stream, "output_tokens", output_token_vector);
+    if (output_tokens == NULL) {
+        fprintf(stream, "output_tokens:\n");
+    } else {
+        const std::vector<int> output_token_vector(output_tokens, output_tokens + n_output_tokens);
+        dump_vector_int_yaml(stream, "output_tokens", output_token_vector);
+    }
+
+    if (probs == NULL) {
+        fprintf(stream, "probs:\n");
+    } else {
+        const std::vector<float> prob_vector(probs, probs + n_probs);
+        dump_vector_float_yaml(stream, "probs", prob_vector);
+    }

     fprintf(stream, "t_eval_us: %ld # total microseconds spent generating tokens\n", ctx->t_eval_us);
     fprintf(stream, "t_load_us: %ld # total microseconds spent loading the model\n", ctx->t_load_us);

llama.h

Lines changed: 3 additions & 2 deletions
@@ -471,8 +471,9 @@ extern "C" {
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);

-    LLAMA_API void llama_dump_result_info_yaml(FILE * stream, const llama_context * ctx, const char * output_str,
-                                               const int * output_tokens, int n_output_tokens);
+    LLAMA_API void llama_dump_result_info_yaml(
+        FILE * stream, const llama_context * ctx, const char * output_str, const int * output_tokens,
+        int n_output_tokens, const float * probs, int n_probs);

 #ifdef __cplusplus
 }

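With the two extra parameters, one dump function serves both binaries: main logs generated output and its token ids but no probabilities, while perplexity logs probabilities but no output, matching the call sites above. A sketch of the two call patterns, wrapped in hypothetical helpers purely for illustration:

#include <cstdio>
#include <vector>

#include "llama.h"

// Hypothetical wrappers; the actual binaries call llama_dump_result_info_yaml directly.
static void log_main_results(FILE * logfile, const llama_context * ctx,
                             const char * output_str, const std::vector<int> & output_tokens) {
    // text generation: output string + token ids, NULL/0 for probabilities
    llama_dump_result_info_yaml(
        logfile, ctx, output_str, output_tokens.data(), output_tokens.size(), NULL, 0);
}

static void log_perplexity_results(FILE * logfile, const llama_context * ctx,
                                   const std::vector<float> & probs) {
    // perplexity: NULL/0 for output, per-token probabilities from the evaluation
    llama_dump_result_info_yaml(
        logfile, ctx, NULL, NULL, 0, probs.data(), probs.size());
}

Whichever pointer is NULL produces an empty YAML key (output_tokens: or probs:) on the llama.cpp side, so both binaries emit logfiles with the same schema.
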
run_with_preset.py

Lines changed: 6 additions & 1 deletion
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3

+import os
 import subprocess
 import sys

@@ -23,7 +24,11 @@

 props = {prop.replace("_", "-"): val for prop, val in props.items()}

-command_list = ["./main"]
+binary = props.pop("binary", "main")
+if os.path.exists(f"./{binary}"):
+    binary = f"./{binary}"
+
+command_list = [binary]

 for cli_arg in CLI_ARGS_MAIN:
     value = props.get(cli_arg, None)
