Skip to content

Commit 7470edd

Browse files
committed
Drop struct llama_beam_search_control. Instead, callback sends/receives data to/from beam_search via llama_beams_state. EOS determination is now responsibility of callback.
1 parent a5a220b commit 7470edd

File tree

3 files changed

+47
-42
lines changed

3 files changed

+47
-42
lines changed

examples/beam_search/beam_search.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ struct ostream_beam_view {
3333
llama_beam_view beam_view;
3434
};
3535
std::ostream& operator<<(std::ostream& os, ostream_beam_view const& obv) {
36-
os << "p(" << obv.beam_view.p << ") eos(" << std::boolalpha << obv.beam_view.eos() << ") tokens(";
36+
os << "p(" << obv.beam_view.p << ") eos(" << std::boolalpha << obv.beam_view.eos << ") tokens(";
3737
for (size_t i=0 ; i<obv.beam_view.n_tokens ; ++i) {
3838
os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]);
3939
}
@@ -46,14 +46,25 @@ struct beam_search_callback_state {
4646
std::vector<llama_token>* response;
4747
};
4848

49+
// Return true iff the token sequence ends with the end-of-sentence token.
// The unnamed callback-state parameter is unused; it exists only to keep the
// signature uniform with other callback helpers.
bool is_at_eos(beam_search_callback_state, llama_token const* tokens, size_t const n_tokens) {
    if (n_tokens == 0) {
        return false;
    }
    return tokens[n_tokens - 1] == llama_token_eos();
}
52+
4953
// Function matching type llama_beam_search_callback_fn_t.
5054
// Custom callback example is called each time the beams lengths increase:
5155
// * Show progress by printing ',' following by number of convergent beam tokens if any.
5256
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
5357
// This is also called when the stop condition is met.
5458
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_state.
55-
llama_beam_search_control beam_search_callback(void* callback_state, llama_beams_state const beams_state) {
59+
void beam_search_callback(void* callback_state, llama_beams_state beams_state) {
5660
auto const state = *static_cast<beam_search_callback_state*>(callback_state);
61+
// Mark beams as EOS as needed.
62+
for (size_t i=0 ; i<beams_state.n_beams ; ++i) {
63+
llama_beam_view& beam_view = beams_state.beam_views[i];
64+
if (!beam_view.eos && is_at_eos(state, beam_view.tokens, beam_view.n_tokens)) {
65+
beam_view.eos = true;
66+
}
67+
}
5768
printf(","); // Show progress
5869
if (size_t const n = beams_state.common_prefix_length) {
5970
state.response->resize(state.response->size() + n);
@@ -69,10 +80,6 @@ llama_beam_search_control beam_search_callback(void* callback_state, llama_beams
6980
std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
7081
}
7182
#endif
72-
return llama_beam_search_control{
73-
beams_state.n_beams, // = collapse_to. Any index out of range means do not collapse beams.
74-
false // = stop. Don't stop beam search.
75-
};
7683
}
7784

7885
int main(int argc, char ** argv)

llama.cpp

Lines changed: 30 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
#include <algorithm>
4848
#include <initializer_list>
4949
#include <thread>
50+
#include <tuple>
5051
#include <atomic>
5152
#include <mutex>
5253
#include <sstream>
@@ -2893,13 +2894,17 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
28932894
struct llama_beam {
28942895
std::vector<llama_token> tokens;
28952896
float p; // Cumulative beam probability (renormalized relative to all beams)
2896-
// end-of-sentence
2897-
bool eos() const { return !tokens.empty() && tokens.back() == llama_token_eos(); }
2897+
bool eos; // Initialize end-of-sentence to false. Callback sets this to true.
2898+
// Sort beams by probability. In case of ties, prefer beams at eos.
2899+
bool operator<(llama_beam const& rhs) const {
2900+
return std::make_tuple(p, eos) < std::make_tuple(rhs.p, rhs.eos);
2901+
}
28982902
// Shift off first n tokens and discard them.
28992903
void shift_tokens(size_t const n) {
29002904
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
29012905
tokens.resize(tokens.size() - n);
29022906
}
2907+
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eos}; }
29032908
};
29042909

29052910
// A struct for calculating logit-related info.
@@ -2961,7 +2966,7 @@ struct beam_search {
29612966
// true iff llama_eval() has been called with non-empty common prefix in current loop iteration.
29622967
bool common_prefix_evaluated;
29632968

2964-
// Temporary memory used by llama_beams_state to pass back via callback.
2969+
// Used to communicate to/from callback on beams state.
29652970
std::vector<llama_beam_view> beam_views;
29662971

29672972
beam_search(llama_context * ctx, size_t beam_width, int n_past, int n_predict, int n_threads)
@@ -2996,7 +3001,7 @@ struct beam_search {
29963001
// with the common token prefix, so shift it off this beam.
29973002
beam.shift_tokens(common_prefix_length);
29983003
}
2999-
if (beam.eos()) {
3004+
if (beam.eos) {
30003005
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
30013006
if (next_beams.size() < beam_width) {
30023007
next_beams.push_back(std::move(beam));
@@ -3071,7 +3076,7 @@ struct beam_search {
30713076
// Side effect: set common_prefix_length = find_common_prefix_length();
30723077
llama_beams_state get_beams_state(bool const last_call) {
30733078
for (size_t i=0 ; i<beams.size() ; ++i) {
3074-
beam_views[i] = llama_beam_view{beams[i].tokens.data(), beams[i].tokens.size(), beams[i].p};
3079+
beam_views[i] = beams[i].view();
30753080
}
30763081
common_prefix_length = find_common_prefix_length();
30773082
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
@@ -3080,28 +3085,24 @@ struct beam_search {
30803085
// Loop:
30813086
// * while i < n_predict, AND
30823087
// * any of the beams have not yet reached end-of-sentence, AND
3083-
// * the highest probability beams (plural in case of ties) are not at end-of-sentence
3088+
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
30843089
// (since all other beam probabilities can only decrease)
30853090
void loop(llama_beam_search_callback_fn_t const callback, void* const callback_state) {
3086-
beams.push_back({{}, 1.0f}); // Start with one empty beam w/ probability = 1.0.
3087-
auto const not_eos = [](llama_beam const& beam) { return !beam.eos(); };
3091+
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eos.
3092+
auto const not_eos = [](llama_beam const& beam) { return !beam.eos; };
30883093
for (int i=0 ; i<n_predict && std::any_of(beams.begin(),beams.end(),not_eos) &&
3089-
!beams[top_beam_index()].eos() ; ++i) {
3090-
llama_beam_search_control const control = callback(callback_state, get_beams_state(false));
3091-
if (control.collapse_to < beams.size()) {
3092-
// Caller has manually selected a specific beam. Collapse beams into it.
3093-
collapse_beams(control.collapse_to);
3094-
}
3095-
if (control.stop) {
3096-
break;
3097-
}
3098-
common_prefix_evaluated = false;
3094+
!beams[top_beam_index()].eos ; ++i) {
3095+
callback(callback_state, get_beams_state(false));
3096+
update_beams_from_beam_views(); // Update values (p,eos) that callback may have changed.
3097+
common_prefix_evaluated = false; // Any common prefix has not yet been llama_eval()ed.
3098+
// Zero-out next_beam probabilities to place them last in following min-heap.
3099+
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam& beam) { beam.p = 0.0f; });
30993100
for (llama_beam& beam : beams) {
31003101
fill_next_beams_by_top_probabilities(beam);
31013102
}
3103+
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
31023104
beams.swap(next_beams);
31033105
renormalize_beam_probabilities(beams);
3104-
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam& beam) { beam.p = 0.0f; });
31053106
}
31063107
collapse_beams(top_beam_index());
31073108
callback(callback_state, get_beams_state(true));
@@ -3115,13 +3116,17 @@ struct beam_search {
31153116
std::for_each(beams.begin(), beams.end(), [=](llama_beam& beam) { beam.p *= inv_sum; });
31163117
}
31173118

3118-
// Return index of highest ranking beam by (probability,eos()).
3119-
// In other words choose most probable beam. In case of ties, choose beam at end-of-sentence.
3120-
// Assumes beams is non-empty.
3119+
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
31213120
size_t top_beam_index() {
3122-
auto const by_p_and_eos = [](llama_beam const& a, llama_beam const& b) {
3123-
return a.p < b.p || (a.p == b.p && a.eos() < b.eos()); };
3124-
return std::max_element(beams.begin(), beams.end(), by_p_and_eos) - beams.begin();
3121+
return std::max_element(beams.begin(), beams.end()) - beams.begin();
3122+
}
3123+
3124+
// Copy (p,eos) for each beam which may have been changed by the callback.
3125+
void update_beams_from_beam_views() {
3126+
for (size_t i=0 ; i<beams.size() ; ++i) {
3127+
beams[i].p = beam_views[i].p;
3128+
beams[i].eos = beam_views[i].eos;
3129+
}
31253130
}
31263131
};
31273132

llama.h

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -460,34 +460,27 @@ extern "C" {
460460
/// @details Accepts the sampled token into the grammar
461461
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
462462

463-
// Lightweight view of a beam
464463
struct llama_beam_view {
465464
llama_token const* tokens;
466465
size_t n_tokens;
467-
float p; // Cumulative beam probability (renormalized relative to all beams)
468-
// end-of-sentence
469-
bool eos() const { return n_tokens && tokens[n_tokens-1u] == llama_token_eos(); }
466+
float p; // Cumulative beam probability (renormalized relative to all beams)
467+
bool eos; // Callback should set this to true when a beam is at end-of-sentence.
470468
};
471469

472470
// Passed to beam_search_callback function.
473471
// Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
474472
// (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
475473
// These pointers are valid only during the synchronous callback, so should not be saved.
476474
struct llama_beams_state {
477-
llama_beam_view* beam_views; // View of each beam.
475+
llama_beam_view* beam_views;
478476
size_t n_beams; // Number of elements in beam_views[].
479477
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
480478
bool last_call; // True iff this is the last callback invocation.
481479
};
482-
// Must be returned by beam_search_callback function.
483-
struct llama_beam_search_control {
484-
size_t collapse_to; // Collapse to a beam index. Ignored if n_beams <= collapse_to.
485-
bool stop; // Stop beam search. Set to false to continue.
486-
};
487480
// Type of pointer to the beam_search_callback function.
488481
// void* callback_state is any custom data passed to llama_beam_search, that is subsequently
489482
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
490-
typedef llama_beam_search_control (*llama_beam_search_callback_fn_t)(void* callback_state, llama_beams_state);
483+
typedef void (*llama_beam_search_callback_fn_t)(void* callback_state, llama_beams_state);
491484

492485
/// @details Deterministically returns entire sentence constructed by a beam search.
493486
/// @param ctx Pointer to the llama_context.

0 commit comments

Comments
 (0)