Skip to content

Commit 3359308

Browse files · committed
Rename beam_width to n_beams for consistency with existing convention.
1 parent f75fab4 commit 3359308

File tree

3 files changed

+24
-22
lines changed

3 files changed

+24
-22
lines changed

examples/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ struct gpt_params {
2929
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
3030
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
3131
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
32-
int32_t n_beams = 0; // Used in mem allocation if > 0 and by llama_beam_search().
32+
int32_t n_beams = 0; // if non-zero then use beam search of given width.
3333
float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
3434
float rope_freq_base = 10000.0f; // RoPE base frequency
3535
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor

llama.cpp

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2954,7 +2954,7 @@ struct logit_info {
29542954

29552955
struct beam_search {
29562956
llama_context * ctx;
2957-
size_t beam_width;
2957+
size_t n_beams;
29582958
int n_past;
29592959
int n_predict;
29602960
int n_threads;
@@ -2969,15 +2969,15 @@ struct beam_search {
29692969
// Used to communicate to/from callback on beams state.
29702970
std::vector<llama_beam_view> beam_views;
29712971

2972-
beam_search(llama_context * ctx, size_t beam_width, int n_past, int n_predict, int n_threads)
2972+
beam_search(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
29732973
: ctx(ctx)
2974-
, beam_width(beam_width)
2974+
, n_beams(n_beams)
29752975
, n_past(n_past)
29762976
, n_predict(n_predict)
29772977
, n_threads(n_threads)
2978-
, beam_views(beam_width) {
2979-
beams.reserve(beam_width);
2980-
next_beams.reserve(beam_width);
2978+
, beam_views(n_beams) {
2979+
beams.reserve(n_beams);
2980+
next_beams.reserve(n_beams);
29812981
}
29822982

29832983
// Collapse beams to a single beam given by index.
@@ -2988,7 +2988,7 @@ struct beam_search {
29882988
beams.resize(1);
29892989
}
29902990

2991-
// Min-heaps are used to efficiently collect the top-k elements (k=beam_width).
2991+
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
29922992
// The repetative patterns below reflect the 2 stages of heaps:
29932993
// * Gather elements until the vector is full, then call std::make_heap() on it.
29942994
// * If the heap is full and a new element is found that should be included, pop the
@@ -3003,9 +3003,9 @@ struct beam_search {
30033003
}
30043004
if (beam.eos) {
30053005
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
3006-
if (next_beams.size() < beam_width) {
3006+
if (next_beams.size() < n_beams) {
30073007
next_beams.push_back(std::move(beam));
3008-
if (next_beams.size() == beam_width) {
3008+
if (next_beams.size() == n_beams) {
30093009
std::make_heap(next_beams.begin(), next_beams.end(), comp);
30103010
}
30113011
} else if (next_beams.front().p < beam.p) {
@@ -3024,10 +3024,10 @@ struct beam_search {
30243024
}
30253025
}
30263026
logit_info logit_info(ctx);
3027-
std::vector<llama_token_data> next_tokens = logit_info.top_k(beam_width);
3027+
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
30283028
size_t i=0;
3029-
if (next_beams.size() < beam_width) {
3030-
for (; next_beams.size() < beam_width ; ++i) {
3029+
if (next_beams.size() < n_beams) {
3030+
for (; next_beams.size() < n_beams ; ++i) {
30313031
llama_beam next_beam = beam;
30323032
next_beam.tokens.push_back(next_tokens[i].id);
30333033
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
@@ -3043,7 +3043,7 @@ struct beam_search {
30433043
std::push_heap(next_beams.begin(), next_beams.end(), comp);
30443044
}
30453045
}
3046-
for (; i < beam_width ; ++i) {
3046+
for (; i < n_beams ; ++i) {
30473047
float const next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
30483048
if (next_beams.front().p < next_p) {
30493049
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
@@ -3076,7 +3076,9 @@ struct beam_search {
30763076
// Side effect: set common_prefix_length = find_common_prefix_length();
30773077
llama_beams_state get_beams_state(bool const last_call) {
30783078
for (size_t i=0 ; i<beams.size() ; ++i) {
3079-
beam_views[i] = beams[i].view();
3079+
//beam_views[i] = beams[i].view();
3080+
auto view = beams.at(i).view();
3081+
beam_views.at(i) = view; // capacity 0
30803082
}
30813083
common_prefix_length = find_common_prefix_length();
30823084
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
@@ -3132,11 +3134,11 @@ struct beam_search {
31323134

31333135
void llama_beam_search(llama_context * ctx,
31343136
llama_beam_search_callback_fn_t callback, void* callback_state,
3135-
size_t beam_width, int n_past, int n_predict, int n_threads) {
3137+
size_t n_beams, int n_past, int n_predict, int n_threads) {
31363138
assert(ctx);
31373139
const int64_t t_start_sample_us = ggml_time_us();
31383140

3139-
beam_search beam_search(ctx, beam_width, n_past, n_predict, n_threads);
3141+
beam_search beam_search(ctx, n_beams, n_past, n_predict, n_threads);
31403142

31413143
beam_search.loop(callback, callback_state);
31423144

llama.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -487,11 +487,11 @@ extern "C" {
487487
/// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
488488
/// The return beam_search_control can be used to control the beam_search execution.
489489
/// @param callback_state A pointer that is simply passed back to callback.
490-
/// @param beam_width The number of parallel beams to use.
491-
/// @param n_past The number of tokens already evaluated.
492-
/// @param n_predict The maximum number of tokens to predict.
493-
/// @param n_threads The maximum number of threads as passed to llama_eval().
494-
LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void* callback_state, size_t beam_width, int n_past, int n_predict, int n_threads);
490+
/// @param n_beams Number of beams to use.
491+
/// @param n_past Number of tokens already evaluated.
492+
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
493+
/// @param n_threads Number of threads as passed to llama_eval().
494+
LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void* callback_state, size_t n_beams, int n_past, int n_predict, int n_threads);
495495

496496
// Performance information
497497
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);

0 commit comments

Comments (0)