Skip to content

Commit 4e702e6

Browse files
committed
Cleanup: Change beam_width from type int to size_t.
1 parent e156b30 commit 4e702e6

File tree

3 files changed

+17
-16
lines changed

3 files changed

+17
-16
lines changed

examples/beam_search/beam_search.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,9 @@ int main(int argc, char ** argv)
140140
n_past += tokens_list.size();
141141

142142
std::vector<llama_token> response;
143+
size_t const beam_width = static_cast<size_t>(params.n_beams);
143144
int const n_predict = 256;
144-
llama_beam_search(ctx, beam_search_callback, &response, params.n_beams, n_past, n_predict, params.n_threads);
145+
llama_beam_search(ctx, beam_search_callback, &response, beam_width, n_past, n_predict, params.n_threads);
145146

146147
printf("\n\n");
147148
for (llama_token const token_id : response) {

llama.cpp

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2919,21 +2919,21 @@ struct logit_info {
29192919
, max_l(*std::max_element(logits, logits + n_vocab))
29202920
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
29212921
{ }
2922-
llama_token_data get_token_data(int const token_id) const {
2922+
llama_token_data get_token_data(llama_token const token_id) const {
29232923
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
29242924
return {token_id, logits[token_id], p};
29252925
}
29262926
// Return top k token_data by logit.
2927-
std::vector<llama_token_data> top_k(int k) {
2927+
std::vector<llama_token_data> top_k(size_t k) {
29282928
std::vector<llama_token_data> min_heap; // min-heap by logit
2929-
k = std::min(k, n_vocab);
2930-
min_heap.reserve(k);
2931-
for (int token_id=0 ; token_id<k ; ++token_id) {
2929+
llama_token const k_min = std::min(static_cast<llama_token>(k), n_vocab);
2930+
min_heap.reserve(k_min);
2931+
for (llama_token token_id=0 ; token_id<k_min ; ++token_id) {
29322932
min_heap.push_back(get_token_data(token_id));
29332933
}
29342934
auto comp = [](llama_token_data const& a, llama_token_data const& b) { return a.logit > b.logit; };
29352935
std::make_heap(min_heap.begin(), min_heap.end(), comp);
2936-
for (int token_id=k ; token_id<n_vocab ; ++token_id) {
2936+
for (llama_token token_id=k_min ; token_id<n_vocab ; ++token_id) {
29372937
if (min_heap.front().logit < logits[token_id]) {
29382938
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
29392939
min_heap.back().id = token_id;
@@ -2950,7 +2950,7 @@ struct logit_info {
29502950

29512951
struct beam_search {
29522952
llama_context * ctx;
2953-
int beam_width;
2953+
size_t beam_width;
29542954
int n_past;
29552955
int n_predict;
29562956
int n_threads;
@@ -2966,7 +2966,7 @@ struct beam_search {
29662966
std::vector<size_t> beam_lengths;
29672967
std::vector<llama_token const*> beam_ptrs;
29682968

2969-
beam_search(llama_context * ctx, int beam_width, int n_past, int n_predict, int n_threads)
2969+
beam_search(llama_context * ctx, size_t beam_width, int n_past, int n_predict, int n_threads)
29702970
: ctx(ctx)
29712971
, beam_width(beam_width)
29722972
, n_past(n_past)
@@ -3001,9 +3001,9 @@ struct beam_search {
30013001
}
30023002
if (b.eos()) {
30033003
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
3004-
if (next_beams.size() < static_cast<size_t>(beam_width)) {
3004+
if (next_beams.size() < beam_width) {
30053005
next_beams.push_back(std::move(b));
3006-
if (next_beams.size() == static_cast<size_t>(beam_width)) {
3006+
if (next_beams.size() == beam_width) {
30073007
std::make_heap(next_beams.begin(), next_beams.end(), comp);
30083008
}
30093009
} else if (next_beams.front().p < b.p) {
@@ -3023,9 +3023,9 @@ struct beam_search {
30233023
}
30243024
logit_info logit_info(ctx);
30253025
std::vector<llama_token_data> next_tokens = logit_info.top_k(beam_width);
3026-
int i=0;
3027-
if (next_beams.size() < static_cast<size_t>(beam_width)) {
3028-
for (; next_beams.size() < static_cast<size_t>(beam_width) ; ++i) {
3026+
size_t i=0;
3027+
if (next_beams.size() < beam_width) {
3028+
for (; next_beams.size() < beam_width ; ++i) {
30293029
beam next_beam = b;
30303030
next_beam.tokens.push_back(next_tokens[i].id);
30313031
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
@@ -3131,7 +3131,7 @@ struct beam_search {
31313131

31323132
void llama_beam_search(llama_context * ctx,
31333133
llama_beam_search_callback_fn_t callback, void* callback_state,
3134-
int beam_width, int n_past, int const n_predict, int const n_threads) {
3134+
size_t beam_width, int n_past, int n_predict, int n_threads) {
31353135
assert(ctx);
31363136
const int64_t t_start_sample_us = ggml_time_us();
31373137

llama.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,7 @@ extern "C" {
473473
/// @param n_past The number of tokens already evaluated.
474474
/// @param n_predict The maximum number of tokens to predict.
475475
/// @param n_threads The maximum number of threads as passed to llama_eval().
476-
LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void* callback_state, int beam_width, int n_past, int n_predict, int n_threads);
476+
LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void* callback_state, size_t beam_width, int n_past, int n_predict, int n_threads);
477477

478478
// Performance information
479479
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);

0 commit comments

Comments (0)