Drop unnecessary use of std::vector<llama_token> common_prefix.

mattpulver · mattpulver · commit ef7850d88735 · 2023-08-16T10:56:15.000-04:00
diff --git a/llama.cpp b/llama.cpp
@@ -2898,25 +2898,24 @@ struct beam {
     float p;  // Cumulative beam probability (renormalized with each token)
     // end-of-sentence
     bool eos() const { return !tokens.empty() && tokens.back() == llama_token_eos(); }
-    // Shift off first n tokens to the end of dest.
-    void shift_tokens(std::vector<llama_token>& dest, int const n) {
-        dest.resize(dest.size() + n);
-        std::copy(tokens.begin(), tokens.begin() + n, dest.end() - n);
-        shift_tokens(n);
-    }
     // Shift off first n tokens and discard them.
-    void shift_tokens(int const n) {
+    void shift_tokens(size_t const n) {
         std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
         tokens.resize(tokens.size() - n);
     }
 };
 
-void out_beam(std::ostream& os, llama_context* ctx, beam const& beam) {
-    os << "p(" << beam.p << ") eos(" << std::boolalpha << beam.eos() << ") tokens(";
-    for (llama_token const token_id : beam.tokens) {
-        os << llama_token_to_str(ctx, token_id);
+// Debugging aid: pairs a context with a beam so operator<< can print the beam's probability, eos flag and tokens.
+struct ostream_beam {
+    llama_context* ctx;
+    beam& b;
+};
+std::ostream& operator<<(std::ostream& os, ostream_beam const& osb) {
+    os << "p(" << osb.b.p << ") eos(" << std::boolalpha << osb.b.eos() << ") tokens(";
+    for (llama_token const token_id : osb.b.tokens) {
+        os << llama_token_to_str(osb.ctx, token_id);
     }
-    os << ')';
+    return os << ')';
 }
 
 // A struct for calculating logit-related info.
@@ -2974,11 +2973,9 @@ struct beam_search {
     std::vector<beam> next_beams;
 
     // Re-calculated on each loop iteration
-    int common_prefix_length;
-    // true iff llama_eval() has been called with common prefix in current loop iteration.
+    size_t common_prefix_length;
+    // true iff llama_eval() has been called with a non-empty common prefix in the current loop iteration.
     bool common_prefix_evaluated;
-    // Save token prefix common to all beams. Cleared after each loop iteration.
-    std::vector<llama_token> common_prefix;
 
     beam_search(llama_context * ctx, int beam_width, int n_past, int n_predict, int n_threads)
       : ctx(ctx)
@@ -2992,13 +2989,14 @@ struct beam_search {
 
     // Find common_prefix_length based on beams.
     // Requires beams is not empty.
-    int find_common_prefix_length() {
-        int common_prefix_length = int(beams[0].tokens.size());
-        for (int i=1 ; i<int(beams.size()) ; ++i) {
-            int const j_max = std::min(common_prefix_length, int(beams[i].tokens.size()));
-            for (int j=0 ; j<j_max ; ++j) {
+    size_t find_common_prefix_length() {
+        size_t common_prefix_length = beams[0].tokens.size();
+        for (size_t i=1 ; i<beams.size() ; ++i) {
+            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
+            for (size_t j=0 ; j<common_prefix_length ; ++j) {
                 if (beams[0].tokens[j] != beams[i].tokens[j]) {
-                    return j;
+                    common_prefix_length = j;
+                    break;
                 }
             }
         }
@@ -3035,7 +3033,7 @@ struct beam_search {
             if (!b.tokens.empty()) {
                 llama_eval(ctx, b.tokens.data(), b.tokens.size(), n_past, n_threads);
                 if (!common_prefix_evaluated && common_prefix_length) {
-                    b.shift_tokens(common_prefix, common_prefix_length);
+                    b.shift_tokens(common_prefix_length);
                     n_past += common_prefix_length;
                     common_prefix_evaluated = true;
                 }
@@ -3082,22 +3080,26 @@ struct beam_search {
         beams.push_back({{}, 1.0f});  // Start with one empty beam w/ probability = 1.0.
         auto const eos = [](beam const& beam) { return beam.eos(); };
         for (int i=0 ; i<n_predict && !std::all_of(beams.begin(),beams.end(),eos) && !eos(top_beam()) ; ++i) {
-            common_prefix_evaluated = false;
             common_prefix_length = find_common_prefix_length();
+            llama_tokens_view const common_prefix{beams[0].tokens.data(), common_prefix_length};
+            callback(callback_state, common_prefix);
+            common_prefix_evaluated = false;
             for (beam& beam : beams) {
                 fill_next_beams_by_top_probabilities(beam);
             }
             beams.swap(next_beams);
             renormalize_beam_probabilities(beams);
             std::for_each(next_beams.begin(), next_beams.end(), [](beam& beam) { beam.p = 0.0f; });
-            llama_tokens_view const common_beam_prefix{common_prefix.data(), common_prefix.size()};
-            callback(callback_state, common_beam_prefix);
-            common_prefix.clear();
+#if 0 // DEBUG: print current beams for this iteration
+            std::cout << "\n\nCurrent beams:\n";
+            for (size_t j=0 ; j < beams.size() ; ++j) {
+                std::cout << "beams["<<j<<"]: " << ostream_beam{ctx,beams[j]} << std::endl;
+            }
+#endif
         }
         beam& top_b = top_beam();
-        top_b.shift_tokens(common_prefix, top_b.tokens.size());
-        llama_tokens_view const common_beam_prefix{common_prefix.data(), common_prefix.size()};
-        callback(callback_state, common_beam_prefix);
+        llama_tokens_view const top_beam_tokens{top_b.tokens.data(), top_b.tokens.size()};
+        callback(callback_state, top_beam_tokens);
     }
 
     // As beams grow, the cumulative probabilities decrease.