sampling: separate rng per sampling context

JohannesGaessler · JohannesGaessler · commit 123eaf054fc5 · 2024-04-23T13:21:13.000+02:00
diff --git a/common/sampling.cpp b/common/sampling.cpp
@@ -1,4 +1,6 @@
+#define LLAMA_API_INTERNAL
 #include "sampling.h"
+#include <random>
 
 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
     struct llama_sampling_context * result = new llama_sampling_context();
@@ -33,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
     result->prev.resize(params.n_prev);
 
+    llama_sampling_set_rng_seed(result, LLAMA_DEFAULT_SEED);
+
     return result;
 }
 
@@ -62,6 +66,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
     ctx->cur.clear();
 }
 
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) { // (re)seeds ctx->rng; touches nothing else in ctx
+    if (seed == LLAMA_DEFAULT_SEED) { // sentinel: caller did not request a specific seed
+        seed = std::random_device{}(); // not time(NULL): 1 s resolution gave contexts created in the same second identical streams
+    }
+    ctx->rng.seed(seed);
+}
+
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
     if (dst->grammar) {
         llama_grammar_free(dst->grammar);
@@ -203,7 +214,7 @@ static llama_token llama_sampling_sample_impl(
 
             sampler_queue(ctx_main, params, cur_p, min_keep);
 
-            id = llama_sample_token(ctx_main, &cur_p);
+            id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
 
             //{
             //    const int n_top = 10;
diff --git a/common/sampling.h b/common/sampling.h
@@ -4,9 +4,10 @@
 
 #include "grammar-parser.h"
 
+#include <random>
 #include <string>
-#include <vector>
 #include <unordered_map>
+#include <vector>
 
 // sampler types
 enum class llama_sampler_type : char {
@@ -79,6 +80,8 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token>      prev;
     std::vector<llama_token_data> cur;
+
+    std::mt19937 rng;
 };
 
 #include "common.h"
@@ -93,6 +96,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
 // - reset grammar
 void llama_sampling_reset(llama_sampling_context * ctx);
 
+// Set the sampler seed
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+
 // Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
 
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
@@ -30,7 +30,6 @@ int main(int argc, char ** argv){
 
     // load the model
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    llama_set_rng_seed(ctx, params.seed);
     GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
 
     // tokenize the prompt
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
@@ -38,7 +38,6 @@ int main(int argc, char ** argv){
 
     // load the model
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    llama_set_rng_seed(ctx, params.seed);
     GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
 
     // tokenize the prompt
@@ -108,6 +107,7 @@ int main(int argc, char ** argv){
     bool has_eos = false;
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    llama_sampling_set_rng_seed(ctx_sampling, params.seed);
 
     std::vector<llama_token> draft;
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
@@ -240,7 +240,6 @@ int main(int argc, char ** argv) {
                 return 1;
             }
             session_tokens.resize(n_token_count_out);
-            llama_set_rng_seed(ctx, params.seed);
             LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
         }
     }
@@ -521,6 +520,7 @@ int main(int argc, char ** argv) {
     }
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    llama_sampling_set_rng_seed(ctx_sampling, params.seed);
 
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -1028,7 +1028,7 @@ struct server_context {
                 send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
-            llama_set_rng_seed(ctx, slot.params.seed);
+            llama_sampling_set_rng_seed(slot.ctx_sampling, slot.params.seed);
         }
 
         slot.command = SLOT_COMMAND_LOAD_PROMPT;
diff --git a/llama.cpp b/llama.cpp
@@ -13478,7 +13478,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
     return result;
 }
 
-llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
     GGML_ASSERT(ctx);
 
     const int64_t t_start_sample_us = ggml_time_us();
@@ -13491,7 +13491,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     }
 
     std::discrete_distribution<> dist(probs.begin(), probs.end());
-    auto & rng = ctx->rng;
     int idx = dist(rng);
 
     llama_token result = candidates->data[idx].id;
@@ -13501,6 +13500,10 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     return result;
 }
 
+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+    return llama_sample_token_with_rng(ctx, candidates, ctx->rng); // forwards to the _with_rng variant, supplying the context's own rng
+}
+
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
     const int64_t t_start_sample_us = ggml_time_us();
 
diff --git a/llama.h b/llama.h
@@ -987,7 +987,7 @@ extern "C" {
             struct llama_context * ctx,
           llama_token_data_array * candidates);
 
-    /// @details Randomly selects a token from the candidates based on their probabilities.
+    /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
     LLAMA_API llama_token llama_sample_token(
             struct llama_context * ctx,
           llama_token_data_array * candidates);
@@ -1074,8 +1074,9 @@ extern "C" {
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL
 
-#include <vector>
+#include <random>
 #include <string>
+#include <vector>
 
 struct ggml_tensor;
 
@@ -1112,6 +1113,10 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const std::string & src,
         llama_partial_utf8   partial_start);
 
+// Randomly selects a token from the candidates based on their probabilities using the given std::mt19937.
+// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
+
 #endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H

Original file line number	Diff line number	Diff line change
`@@ -240,7 +240,6 @@ int main(int argc, char ** argv) {`
`240`	`240`	`return 1;`
`241`	`241`	`}`
`242`	`242`	`session_tokens.resize(n_token_count_out);`
`243`		`- llama_set_rng_seed(ctx, params.seed);`
`244`	`243`	`LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());`
`245`	`244`	`}`
`246`	`245`	`}`
`@@ -521,6 +520,7 @@ int main(int argc, char ** argv) {`
`521`	`520`	`}`
`522`	`521`
`523`	`522`	`struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);`
	`523`	`+ llama_sampling_set_rng_seed(ctx_sampling, params.seed);`
`524`	`524`
`525`	`525`	`while ((n_remain != 0 && !is_antiprompt) \|\| params.interactive) {`
`526`	`526`	`// predict`
Original file line number	Diff line number	Diff line change
`@@ -1028,7 +1028,7 @@ struct server_context {`
`1028`	`1028`	`send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);`
`1029`	`1029`	`return false;`
`1030`	`1030`	`}`
`1031`		`- llama_set_rng_seed(ctx, slot.params.seed);`
	`1031`	`+ llama_sampling_set_rng_seed(slot.ctx_sampling, slot.params.seed);`
`1032`	`1032`	`}`
`1033`	`1033`
`1034`	`1034`	`slot.command = SLOT_COMMAND_LOAD_PROMPT;`