@@ -2878,20 +2878,18 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 }
 
 struct beam {
-    llama_context* ctx;
     std::vector<llama_token> tokens;
     float p; // Cumulative beam probability (renormalized with each token)
     // end-of-sentence
     bool eos() const { return !tokens.empty() && tokens.back() == llama_token_eos(); }
 };
 
-std::ostream& operator<<(std::ostream& os, beam const & b) {
-    os << "ctx(" << static_cast<void *>(b.ctx) << ") p(" << b.p
-       << ") eos(" << std::boolalpha << b.eos() << ") tokens(";
-    for (auto const token_id : b.tokens) {
-        os << llama_token_to_str(b.ctx, token_id);
+void out_beam(std::ostream& os, llama_context* ctx, beam const & b) {
+    os << "p(" << b.p << ") eos(" << std::boolalpha << b.eos() << ") tokens(";
+    for (llama_token const token_id : b.tokens) {
+        os << llama_token_to_str(ctx, token_id);
     }
-    return os << ')';
+    os << ')';
 }
 
 // A struct for calculating logit-related info.
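Since beam no longer stores its llama_context, callers pass the context at print time instead. A minimal usage sketch, not part of this commit, assuming a valid llama_context* ctx is in scope:

    beam b{{/* token ids */}, 1.0f};
    out_beam(std::cout, ctx, b); // prints p(...), eos(...) and the decoded tokens
    std::cout << std::endl;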
@@ -2938,8 +2936,8 @@ struct logit_info {
     }
 };
 
-void fill_next_beams_by_top_probabilities(std::vector<beam>& next_beams, beam const & b,
-        int const beam_width, int const n_past, int const n_threads) {
+void fill_next_beams_by_top_probabilities(llama_context* ctx, std::vector<beam>& next_beams,
+        beam const & b, int const beam_width, int const n_past, int const n_threads) {
     auto const comp = [](beam const & a, beam const & b) { return a.p > b.p; };
     if (b.eos()) {
         // b is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
@@ -2956,9 +2954,9 @@ void fill_next_beams_by_top_probabilities(std::vector<beam>& next_beams, beam co
     } else {
         // b is not at end-of-sentence, so branch with next top_k tokens.
         if (!b.tokens.empty()) {
-            llama_eval(b.ctx, b.tokens.data(), b.tokens.size(), n_past, n_threads);
+            llama_eval(ctx, b.tokens.data(), b.tokens.size(), n_past, n_threads);
         }
-        logit_info li(b.ctx);
+        logit_info li(ctx);
         std::vector<llama_token_data> next_tokens = li.top_k(beam_width);
         int i=0;
         if (next_beams.size() < static_cast<size_t>(beam_width)) {
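comp above orders beams by descending p; combined with the standard heap algorithms it behaves as a min-heap whose front is the least probable kept beam. A self-contained sketch of that top-k pattern (illustrative only; push_top_k is a hypothetical helper, and whether the surrounding code uses the heap algorithms exactly this way is not visible in this hunk):

    #include <algorithm>
    #include <vector>

    void push_top_k(std::vector<beam>& heap, beam cand, size_t k) {
        auto const comp = [](beam const & a, beam const & b) { return a.p > b.p; }; // min-heap by p
        if (heap.size() < k) {
            heap.push_back(std::move(cand));
            std::push_heap(heap.begin(), heap.end(), comp);
        } else if (cand.p > heap.front().p) {               // beats the weakest kept beam
            std::pop_heap(heap.begin(), heap.end(), comp);  // weakest moves to the back
            heap.back() = std::move(cand);                  // replace it
            std::push_heap(heap.begin(), heap.end(), comp); // restore heap order
        }
    }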
@@ -3001,15 +2999,14 @@ beam const& top_beam(std::vector<beam> const& beams) {
 // fill_next_beams_by_top_probabilities() by randomly selecting from all next_beams.
 // Not thread-safe.
 const char* llama_beam_search(llama_context * ctx, int const beam_width,
-        int const n_past, int const n_predict, int const n_threads) {
+        int n_past, int const n_predict, int const n_threads) {
     static std::string beam_search_response;
     assert(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
 
     std::vector<beam> beams;
     beams.reserve(beam_width);
-    beams.push_back({ctx, {}, 1.0});
-    // Init next_beams with unique next token_id each.
+    beams.push_back({{}, 1.0});
     std::vector<beam> next_beams;
     next_beams.reserve(beam_width);
     // Loop while there are any beams that have not yet reached end-of-sentence.
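renormalize_beam_probabilities(), called each iteration of the loop below, is unchanged by this commit and so absent from the diff. A plausible sketch, assuming it simply rescales the cumulative probabilities to sum to 1:

    // Assumed implementation; the real body is outside this diff.
    void renormalize_beam_probabilities(std::vector<beam>& beams) {
        float sum = 0.0f;
        for (beam const & b : beams) { sum += b.p; }
        for (beam & b : beams) { b.p /= sum; } // probabilities now sum to 1
    }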
@@ -3019,35 +3016,28 @@ const char* llama_beam_search(llama_context * ctx, int const beam_width,
     for (int i=0; i<n_predict && !eos(top_beam(beams)) &&
                   !std::all_of(beams.begin(), beams.end(), eos); ++i) {
         for (beam& b : beams) {
-            fill_next_beams_by_top_probabilities(next_beams, b, beam_width, n_past, n_threads);
+            fill_next_beams_by_top_probabilities(ctx, next_beams, b, beam_width, n_past, n_threads);
         }
         beams.swap(next_beams);
         next_beams.clear();
         renormalize_beam_probabilities(beams);
 #if 1 // DEBUG: print current beams for this iteration
         std::cout << "\n\nCurrent beams:\n";
         for (size_t j=0; j < beams.size(); ++j) {
-            std::cout << "beams[" << j << "]: " << beams[j] << std::endl;
+            std::cout << "beams[" << j << "]: ";
+            out_beam(std::cout, ctx, beams[j]);
+            std::cout << std::endl;
         }
 #else
         std::cout << '.' << std::flush; // Show progress
 #endif
     }
-#if 1 // DEBUG: print final beam results
-    for (size_t i=0; i<beams.size(); ++i) {
-        std::cout << "\nbeams[" << i << "] with p(" << beams[i].p << "): ";
-        for (llama_token const token : beams[i].tokens) {
-            std::cout << llama_token_to_str(beams[i].ctx, token);
-        }
-        std::cout << std::endl;
-    }
-#endif
 
     beam const& top_b = top_beam(beams);
     // Save beam sentence to beam_search_response. Is there a better way?
     std::ostringstream oss;
     for (llama_token const token : top_b.tokens) {
-        oss << llama_token_to_str(top_b.ctx, token);
+        oss << llama_token_to_str(ctx, token);
     }
     beam_search_response = oss.str();
 
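With ctx threaded through explicitly, callers touch the context only at the top-level entry point. A hedged usage sketch (parameter values and the prior prompt evaluation are assumptions, not shown in this commit); because the result points into the static beam_search_response, it stays valid only until the next call:

    // Hypothetical caller; assumes the prompt was already evaluated into ctx,
    // leaving n_past tokens in the KV cache.
    int const beam_width = 2;  // beams kept per step (illustrative)
    int const n_past     = 32; // tokens already evaluated
    int const n_predict  = 64; // maximum new tokens to generate
    int const n_threads  = 4;
    const char * response = llama_beam_search(ctx, beam_width, n_past, n_predict, n_threads);
    printf("%s\n", response); // invalidated by the next llama_beam_search() call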