Improve the beam_search callback pattern: the callback now receives a beams_state and returns a beam_search_control struct to control execution.

mattpulver · mattpulver · commit f20e584d9d99 · 2023-08-16T10:56:15.000-04:00
diff --git a/examples/beam_search/beam_search.cpp b/examples/beam_search/beam_search.cpp
@@ -28,21 +28,29 @@
 #endif
 
 
+// Function matching type llama_beam_search_callback_fn_t.
 // Custom callback example is called each time the beams lengths increase:
 //  * Show progress by printing ',' following by number of convergent beam tokens if any.
-//  * When all beams converge to a common prefix, they are made available in tokens_view.
-//    This is also called when the stop condition is met, in which case the beam with the
-//    highest probability is chosen, and its remaining tokens are available in tokens_view.
-//    Collect them into std::vector<llama_token> response which is pointed to by callback_state.
-void beam_search_callback(void* callback_state, llama_tokens_view const tokens_view) {
+//  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
+//    This is also called when the stop condition is met.
+//    Collect tokens into std::vector<llama_token> response which is pointed to by callback_state.
+beam_search_control beam_search_callback(void* callback_state, beams_state const beams_state) {
     printf(",");  // Show progress
-    if (size_t const n = tokens_view.size) {
+    if (size_t const n = beams_state.common_prefix_length) {
         auto* response = static_cast<std::vector<llama_token>*>(callback_state);
         response->resize(response->size() + n);
-        std::copy(tokens_view.tokens, tokens_view.tokens + n, response->end() - n);
+        assert(0u < beams_state.n_beams);
+        std::copy(beams_state.beams[0], beams_state.beams[0] + n, response->end() - n);
         printf("%lu", n);
     }
     fflush(stdout);
+#if 0 // DEBUG: print current beams for this iteration
+            std::cout << "\n\nCurrent beams:\n";
+            for (size_t j=0 ; j < beams.size() ; ++j) {
+                std::cout << "beams["<<j<<"]: " << ostream_beam{ctx,beams[j]} << std::endl;
+            }
+#endif
+    return { beams_state.n_beams, false };  // Continue beam search.
 }
 
 int main(int argc, char ** argv)
diff --git a/llama.cpp b/llama.cpp
@@ -2977,30 +2977,28 @@ struct beam_search {
     // true iff llama_eval() has been called with non-empty common prefix in current loop iteration.
     bool common_prefix_evaluated;
 
+    // Storage backing the beam_lengths and beams pointers exposed through beams_state.
+    std::vector<size_t> beam_lengths;
+    std::vector<llama_token const*> beam_ptrs;
+
     beam_search(llama_context * ctx, int beam_width, int n_past, int n_predict, int n_threads)
       : ctx(ctx)
       , beam_width(beam_width)
       , n_past(n_past)
       , n_predict(n_predict)
-      , n_threads(n_threads) {
+      , n_threads(n_threads)
+      , beam_lengths(beam_width)
+      , beam_ptrs(beam_width) {
         beams.reserve(beam_width);
         next_beams.reserve(beam_width);
     }
 
-    // Find common_prefix_length based on beams.
-    // Requires beams is not empty.
-    size_t find_common_prefix_length() {
-        size_t common_prefix_length = beams[0].tokens.size();
-        for (size_t i=1 ; i<beams.size() ; ++i) {
-            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
-            for (size_t j=0 ; j<common_prefix_length ; ++j) {
-                if (beams[0].tokens[j] != beams[i].tokens[j]) {
-                    common_prefix_length = j;
-                    break;
-                }
-            }
+    // Collapse beams to a single beam given by index.
+    void collapse_beams(size_t const beam_idx) {
+        if (0u < beam_idx) {
+            std::swap(beams[0], beams[beam_idx]);
         }
-        return common_prefix_length;
+        beams.resize(1);
     }
 
     // Min-heaps are used to efficiently collect the top-k elements (k=beam_width).
@@ -3071,49 +3069,78 @@ struct beam_search {
         }
     }
 
+    // Find common_prefix_length based on beams.
+    // Requires beams is not empty.
+    size_t find_common_prefix_length() {
+        size_t common_prefix_length = beams[0].tokens.size();
+        for (size_t i=1 ; i<beams.size() ; ++i) {
+            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
+            for (size_t j=0 ; j<common_prefix_length ; ++j) {
+                if (beams[0].tokens[j] != beams[i].tokens[j]) {
+                    common_prefix_length = j;
+                    break;
+                }
+            }
+        }
+        return common_prefix_length;
+    }
+
+    // Construct beams_state to send back to caller via the callback function.
+    // Side effect: set common_prefix_length = find_common_prefix_length();
+    beams_state get_beams_state(bool const last_call) {
+        for (size_t i=0 ; i<beams.size() ; ++i) {
+            beam_lengths[i] = beams[i].tokens.size();
+            beam_ptrs[i] = beams[i].tokens.data();
+        }
+        common_prefix_length = find_common_prefix_length();
+        return {beams.size(), beam_lengths.data(), beam_ptrs.data(), common_prefix_length, last_call};
+    }
+
     // Loop:
-    //  * while i < n_predict, OR
-    //  * until all of the beams have nreached end-of-sentence, OR
-    //  * until the highest probability beam is at end-of-sentence
+    //  * while i < n_predict, AND
+    //  * at least one beam has not yet reached end-of-sentence, AND
+    //  * none of the highest probability beams (plural in case of ties) is at end-of-sentence
     //    (since all other beam probabilities can only decrease)
     void loop(llama_beam_search_callback_fn_t const callback, void* const callback_state) {
         beams.push_back({{}, 1.0f});  // Start with one empty beam w/ probability = 1.0.
-        auto const eos = [](beam const& beam) { return beam.eos(); };
-        for (int i=0 ; i<n_predict && !std::all_of(beams.begin(),beams.end(),eos) && !eos(top_beam()) ; ++i) {
-            common_prefix_length = find_common_prefix_length();
-            llama_tokens_view const common_prefix{beams[0].tokens.data(), common_prefix_length};
-            callback(callback_state, common_prefix);
+        auto const not_eos = [](beam const& beam) { return !beam.eos(); };
+        for (int i=0 ; i<n_predict && std::any_of(beams.begin(),beams.end(),not_eos) &&
+                       !beams[top_beam_index()].eos() ; ++i) {
+            beam_search_control const control = callback(callback_state, get_beams_state(false));
+            if (control.collapse_to < beams.size()) {
+                // Caller has manually selected a specific beam. Collapse beams into it.
+                collapse_beams(control.collapse_to);
+            }
+            if (control.stop) {
+                break;
+            }
             common_prefix_evaluated = false;
             for (beam& beam : beams) {
                 fill_next_beams_by_top_probabilities(beam);
             }
             beams.swap(next_beams);
             renormalize_beam_probabilities(beams);
             std::for_each(next_beams.begin(), next_beams.end(), [](beam& beam) { beam.p = 0.0f; });
-#if 0 // DEBUG: print current beams for this iteration
-            std::cout << "\n\nCurrent beams:\n";
-            for (size_t j=0 ; j < beams.size() ; ++j) {
-                std::cout << "beams["<<j<<"]: " << ostream_beam{ctx,beams[j]} << std::endl;
-            }
-#endif
         }
-        beam& top_b = top_beam();
-        llama_tokens_view const top_beam_tokens{top_b.tokens.data(), top_b.tokens.size()};
-        callback(callback_state, top_beam_tokens);
+        collapse_beams(top_beam_index());
+        callback(callback_state, get_beams_state(true));
     }
 
     // As beams grow, the cumulative probabilities decrease.
     // Renormalize them to avoid floating point underflow.
     static void renormalize_beam_probabilities(std::vector<beam>& beams) {
         auto const sum_p = [](float sum, beam& beam) { return sum + beam.p; };
         float const inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
-        std::for_each(beams.begin(), beams.end(), [inv_sum](beam& beam) { beam.p *= inv_sum; });
+        std::for_each(beams.begin(), beams.end(), [=](beam& beam) { beam.p *= inv_sum; });
     }
 
-    // Return beam with highest probability.
-    beam& top_beam() {
-        auto const by_p = [](beam const& a, beam const& b) { return a.p < b.p; };
-        return *std::max_element(beams.begin(), beams.end(), by_p);
+    // Return index of highest ranking beam by (probability,eos()).
+    // In other words choose most probable beam. In case of ties, choose beam at end-of-sentence.
+    // Assumes beams is non-empty.
+    size_t top_beam_index() {
+        auto const by_p_and_eos = [](beam const& a, beam const& b) {
+            return a.p < b.p || (a.p == b.p && a.eos() < b.eos()); };
+        return std::max_element(beams.begin(), beams.end(), by_p_and_eos) - beams.begin();
     }
 };
 
@@ -3125,8 +3152,6 @@ void llama_beam_search(llama_context * ctx,
 
     beam_search beam_search(ctx, beam_width, n_past, n_predict, n_threads);
 
-    // callback(callback_state, common_beam_prefix) is called on each iteration, and when
-    // stop condition is met with remaining tokens from beam with the highest probability.
     beam_search.loop(callback, callback_state);
 
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
diff --git a/llama.h b/llama.h
@@ -460,14 +460,32 @@ extern "C" {
     /// @details Accepts the sampled token into the grammar
     LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
 
-    struct llama_tokens_view {
-        llama_token const* tokens;
-        size_t size;
+    // Passed to beam_search_callback function.
+    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // These pointers are valid only during the synchronous callback, so should not be saved.
+    struct beams_state {
+        size_t n_beams;                   // Number of elements in beam_lengths[] and beams[].
+        size_t const* beam_lengths;       // Length of each beam.
+        llama_token const* const* beams;  // Current tokens in each beam.
+        size_t common_prefix_length;      // Current max length of prefix tokens shared by all beams.
+        bool last_call;                   // True iff this is the last callback invocation.
+    };
+    // Must be returned by beam_search_callback function.
+    struct beam_search_control {
+        size_t collapse_to;  // Collapse to a beam index.  Ignored if n_beams <= collapse_to.
+        bool stop;           // Stop beam search.  Set to false to continue.
     };
-    typedef void (*llama_beam_search_callback_fn_t)(void* state, llama_tokens_view const);
+    // Type of pointer to the beam_search_callback function.
+    // void* callback_state is any custom data passed to llama_beam_search, that is subsequently
+    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+    typedef beam_search_control (*llama_beam_search_callback_fn_t)(void* callback_state, beams_state);
 
     /// @details Deterministically returns entire sentence constructed by a beam search.
     /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+    ///                 The returned beam_search_control can be used to control the beam_search execution.
+    /// @param callback_state A pointer that is passed back to callback and nothing more.
     /// @param beam_width The number of parallel beams to use.
     /// @param n_past The number of tokens already evaluated.
     /// @param n_predict The maximum number of tokens to predict.