
Commit 29ec1a0

Add llama_beam_search_callback_fn_t and improve comments.
1 parent b1dbc44 commit 29ec1a0

3 files changed: +55 additions, -41 deletions

examples/beam_search/beam_search.cpp

Lines changed: 25 additions & 3 deletions
@@ -28,6 +28,22 @@
 #endif
 
 
+// Custom callback example, called each time the beam lengths increase:
+// * Show progress by printing ',' followed by the number of convergent beam tokens, if any.
+// * When all beams converge to a common prefix, those tokens are made available in tokens_view.
+// This is also called when the stop condition is met, in which case the beam with the
+// highest probability is chosen, and its remaining tokens are available in tokens_view.
+// Collect them into the std::vector<llama_token> pointed to by callback_state.
+void beam_search_callback(void* callback_state, llama_tokens_view const tokens_view) {
+    printf(",");  // Show progress
+    if (size_t const n = tokens_view.size) {
+        auto* response = static_cast<std::vector<llama_token>*>(callback_state);
+        response->resize(response->size() + n);
+        std::copy(tokens_view.tokens, tokens_view.tokens + n, response->end() - n);
+        printf("%zu", n);  // %zu is the portable printf format for size_t
+    }
+    fflush(stdout);
+}
 
 int main(int argc, char ** argv)
 {
@@ -115,9 +131,15 @@ int main(int argc, char ** argv)
     }
     n_past += tokens_list.size();
 
-    int const n_predict = 1024;
-    char const* response = llama_beam_search(ctx, params.n_beams, n_past, n_predict, params.n_threads);
-    printf("\nDone:\n\n%s%s\n", params.prompt.c_str(), response);
+    std::vector<llama_token> response;
+    int const n_predict = 256;
+    llama_beam_search(ctx, beam_search_callback, &response, params.n_beams, n_past, n_predict, params.n_threads);
+
+    printf("\n\n");
+    for (llama_token const token_id : response) {
+        printf("%s", llama_token_to_str(ctx, token_id));
+    }
+    printf("\n");
 #else
     //---------------------------------
     // Main prediction loop :
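
The example collects tokens into a vector; since the callback fully owns interpretation of each converged prefix, a caller could just as easily stream text as it arrives. Below is a minimal sketch of such a variant (hypothetical, not part of this commit), using only the llama_tokens_view struct and llama_token_to_str() visible elsewhere in this diff:

    // Hypothetical streaming variant: detokenize and print each converged
    // prefix immediately instead of collecting tokens into a vector.
    struct stream_state {
        llama_context * ctx;  // needed by llama_token_to_str() for detokenization
    };

    void beam_search_stream_callback(void* callback_state, llama_tokens_view const tokens_view) {
        auto* state = static_cast<stream_state*>(callback_state);
        for (size_t i = 0; i < tokens_view.size; ++i) {
            printf("%s", llama_token_to_str(state->ctx, tokens_view.tokens[i]));
        }
        fflush(stdout);  // make partial output visible immediately
    }

Passed as llama_beam_search(ctx, beam_search_stream_callback, &state, ...), this prints the response incrementally rather than after the search finishes.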

llama.cpp

Lines changed: 23 additions & 37 deletions
@@ -2962,8 +2962,8 @@ struct beam_search {
     int common_prefix_length;
     // true iff llama_eval() has been called with common prefix in current loop iteration.
     bool common_prefix_evaluated;
-    // Save token prefix common to all beams here
-    std::vector<llama_token> response;
+    // Save token prefix common to all beams. Cleared after each loop iteration.
+    std::vector<llama_token> common_prefix;
 
     beam_search(llama_context * ctx, int beam_width, int n_past, int n_predict, int n_threads)
         : ctx(ctx)
@@ -2990,11 +2990,11 @@ struct beam_search {
         return common_prefix_length;
     }
 
-    // Min-heaps are used to efficiently gather the top-k elements (k=beam_width).
+    // Min-heaps are used to efficiently collect the top-k elements (k=beam_width).
     // The repetitive patterns below reflect the 2 stages of heaps:
     // * Gather elements until the vector is full, then call std::make_heap() on it.
-    // * If the heap is full and a new element is found that should be included,
-    //   pop off the least element, replace it with the new, then push it into the heap.
+    // * If the heap is full and a new element is found that should be included, pop the
+    //   least element to the back(), replace it with the new, then push it into the heap.
     void fill_next_beams_by_top_probabilities(beam& b) {
         // Min-heaps use a greater-than comparator.
         auto const comp = [](beam const& a, beam const& b) { return a.p > b.p; };
@@ -3006,7 +3006,7 @@ struct beam_search {
         if (b.eos()) {
             // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
             if (next_beams.size() < static_cast<size_t>(beam_width)) {
-                next_beams.push_back(b);
+                next_beams.push_back(std::move(b));
                 if (next_beams.size() == static_cast<size_t>(beam_width)) {
                     std::make_heap(next_beams.begin(), next_beams.end(), comp);
                 }
@@ -3020,7 +3020,7 @@ struct beam_search {
         if (!b.tokens.empty()) {
             llama_eval(ctx, b.tokens.data(), b.tokens.size(), n_past, n_threads);
             if (!common_prefix_evaluated && common_prefix_length) {
-                b.shift_tokens(response, common_prefix_length);
+                b.shift_tokens(common_prefix, common_prefix_length);
                 n_past += common_prefix_length;
                 common_prefix_evaluated = true;
             }
@@ -3059,12 +3059,12 @@ struct beam_search {
     }
 
     // Loop:
-    // * while i < n_predict
-    // * until all of the beams have nreached end-of-sentence
+    // * while i < n_predict, OR
+    // * until all of the beams have reached end-of-sentence, OR
     // * until the highest probability beam is at end-of-sentence
     //   (since all other beam probabilities can only decrease)
-    void loop(std::function<void(std::vector<beam>&)> const callback) {
-        beams.push_back({{}, 1.0f});
+    void loop(llama_beam_search_callback_fn_t const callback, void* const callback_state) {
+        beams.push_back({{}, 1.0f});  // Start with one empty beam w/ probability = 1.0.
         auto const eos = [](beam const& beam) { return beam.eos(); };
         for (int i=0 ; i<n_predict && !std::all_of(beams.begin(),beams.end(),eos) && !eos(top_beam()) ; ++i) {
             common_prefix_evaluated = false;
@@ -3075,10 +3075,14 @@ struct beam_search {
             beams.swap(next_beams);
             renormalize_beam_probabilities(beams);
             std::for_each(next_beams.begin(), next_beams.end(), [](beam& beam) { beam.p = 0.0f; });
-            callback(beams);
+            llama_tokens_view const common_beam_prefix{common_prefix.data(), common_prefix.size()};
+            callback(callback_state, common_beam_prefix);
+            common_prefix.clear();
         }
         beam& top_b = top_beam();
-        top_b.shift_tokens(response, top_b.tokens.size());
+        top_b.shift_tokens(common_prefix, top_b.tokens.size());
+        llama_tokens_view const common_beam_prefix{common_prefix.data(), common_prefix.size()};
+        callback(callback_state, common_beam_prefix);
     }
 
     // As beams grow, the cumulative probabilities decrease.
@@ -3096,38 +3100,20 @@
     }
 };
 
-// Not thread-safe.
-const char* llama_beam_search(llama_context * ctx, int beam_width,
-                              int n_past, int const n_predict, int const n_threads) {
-    static std::string beam_search_response;
+void llama_beam_search(llama_context * ctx,
+                       llama_beam_search_callback_fn_t callback, void* callback_state,
+                       int beam_width, int n_past, int const n_predict, int const n_threads) {
     assert(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
 
     beam_search beam_search(ctx, beam_width, n_past, n_predict, n_threads);
 
-    beam_search.loop([&](std::vector<beam>& beams) {
-#if 1 // DEBUG: print current beams for this iteration
-        std::cout << "\n\nCurrent beams:\n";
-        for (size_t j=0 ; j < beams.size() ; ++j) {
-            std::cout << "beams["<<j<<"]: ";
-            out_beam(std::cout, ctx, beams[j]);
-            std::cout << std::endl;
-        }
-#else
-        std::cout << '.' << std::flush;  // Show progress
-#endif
-    });
-
-    // Save beam sentence to beam_search_response. Is there a better way?
-    std::ostringstream oss;
-    for (llama_token const token : beam_search.response) {
-        oss << llama_token_to_str(ctx, token);
-    }
-    beam_search_response = oss.str();
+    // callback(callback_state, common_beam_prefix) is called on each iteration, and when the
+    // stop condition is met with the remaining tokens from the beam with the highest probability.
+    beam_search.loop(callback, callback_state);
 
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     ctx->n_sample++;
-    return beam_search_response.c_str();
 }
 
 //
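
The two-stage min-heap pattern documented above in fill_next_beams_by_top_probabilities() is easier to see in isolation. Here is a simplified sketch (illustration only, not from this commit) that keeps the k largest floats in place of beams:

    #include <algorithm>
    #include <vector>

    // Keep the k largest values seen so far. With a greater-than comparator,
    // heap.front() is the smallest kept value (a min-heap).
    void top_k_insert(std::vector<float> & heap, size_t k, float candidate) {
        auto const comp = [](float a, float b) { return a > b; };
        if (heap.size() < k) {
            heap.push_back(candidate);  // Stage 1: fill the vector,
            if (heap.size() == k) {
                std::make_heap(heap.begin(), heap.end(), comp);  // then heapify once full.
            }
        } else if (candidate > heap.front()) {  // candidate beats the current minimum
            std::pop_heap(heap.begin(), heap.end(), comp);   // Stage 2: move the least element to back(),
            heap.back() = candidate;                         // replace it with the new element,
            std::push_heap(heap.begin(), heap.end(), comp);  // then push it back into the heap.
        }
    }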

llama.h

Lines changed: 7 additions & 1 deletion
@@ -443,13 +443,19 @@ extern "C" {
     /// @details Accepts the sampled token into the grammar
     LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
 
+    struct llama_tokens_view {
+        llama_token const* tokens;
+        size_t size;
+    };
+    typedef void (*llama_beam_search_callback_fn_t)(void* state, llama_tokens_view const);
+
     /// @details Deterministically returns entire sentence constructed by a beam search.
     /// @param ctx Pointer to the llama_context.
     /// @param beam_width The number of parallel beams to use.
     /// @param n_past The number of tokens already evaluated.
     /// @param n_predict The maximum number of tokens to predict.
     /// @param n_threads The maximum number of threads as passed to llama_eval().
-    LLAMA_API const char* llama_beam_search(struct llama_context * ctx, int beam_width, int n_past, int n_predict, int n_threads);
+    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void* callback_state, int beam_width, int n_past, int n_predict, int n_threads);
 
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
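
Putting the new declarations together, a condensed caller might look like the sketch below (assuming ctx, n_beams, n_past, and n_threads are set up as in the example program above; note that a capture-less lambda converts implicitly to the C function pointer type llama_beam_search_callback_fn_t):

    #include <vector>
    #include "llama.h"

    std::vector<llama_token> run_beam_search(llama_context * ctx, int n_beams,
                                             int n_past, int n_threads) {
        std::vector<llama_token> response;
        auto collect = [](void* state, llama_tokens_view const view) {
            // Append each converged prefix (and the final remainder) in order.
            auto* out = static_cast<std::vector<llama_token>*>(state);
            out->insert(out->end(), view.tokens, view.tokens + view.size);
        };
        llama_beam_search(ctx, collect, &response, n_beams, n_past, /*n_predict=*/256, n_threads);
        return response;
    }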
