Skip to content

Commit 29f712e

Browse files
committed
llama : update sampling API
ggml-ci
1 parent 4d2873a commit 29f712e

File tree

15 files changed

+510
-584
lines changed

15 files changed

+510
-584
lines changed

common/sampling.cpp

Lines changed: 117 additions & 184 deletions
Large diffs are not rendered by default.

common/sampling.h

Lines changed: 15 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
#include <string>
66
#include <vector>
7-
#include <stdexcept>
87

98
// sampler types
109
enum class llama_sampler_type : char {
@@ -59,119 +58,16 @@ typedef struct gpt_sampling_params {
5958
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
6059
} gpt_sampling_params;
6160

62-
// the ring buffer works similarly to std::deque, but with a fixed capacity
63-
template<typename T>
64-
struct ring_buffer {
65-
ring_buffer() {}
66-
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
67-
68-
T & front() {
69-
if (sz == 0) {
70-
throw std::runtime_error("ring buffer is empty");
71-
}
72-
return data[first];
73-
}
74-
75-
const T & front() const {
76-
if (sz == 0) {
77-
throw std::runtime_error("ring buffer is empty");
78-
}
79-
return data[first];
80-
}
81-
82-
T & back() {
83-
if (sz == 0) {
84-
throw std::runtime_error("ring buffer is empty");
85-
}
86-
return data[pos];
87-
}
88-
89-
const T & back() const {
90-
if (sz == 0) {
91-
throw std::runtime_error("ring buffer is empty");
92-
}
93-
return data[pos];
94-
}
95-
96-
void push_back(const T & value) {
97-
if (sz == capacity) {
98-
// advance the start when buffer is full
99-
first = (first + 1) % capacity;
100-
} else {
101-
sz++;
102-
}
103-
data[pos] = value;
104-
pos = (pos + 1) % capacity;
105-
}
106-
107-
T pop_front() {
108-
if (sz == 0) {
109-
throw std::runtime_error("ring buffer is empty");
110-
}
111-
T value = data[first];
112-
first = (first + 1) % capacity;
113-
sz--;
114-
return value;
115-
}
116-
117-
T & operator[](size_t i) {
118-
if (i >= sz) {
119-
throw std::runtime_error("ring buffer: index out of bounds");
120-
}
121-
return data[(first + i) % capacity];
122-
}
123-
124-
const T & operator[](size_t i) const {
125-
if (i >= sz) {
126-
throw std::runtime_error("ring buffer: index out of bounds");
127-
}
128-
return data[(first + i) % capacity];
129-
}
130-
131-
std::vector<T> to_vector() const {
132-
std::vector<T> result;
133-
result.reserve(sz);
134-
for (size_t i = 0; i < sz; i++) {
135-
result.push_back(data[(first + i) % capacity]);
136-
}
137-
return result;
138-
}
139-
140-
void clear() {
141-
// here only reset the status of the buffer
142-
sz = 0;
143-
first = 0;
144-
pos = 0;
145-
}
146-
147-
bool empty() const {
148-
return sz == 0;
149-
}
150-
151-
size_t size() const {
152-
return sz;
153-
}
154-
155-
size_t capacity = 0;
156-
size_t sz = 0;
157-
size_t first = 0;
158-
size_t pos = 0;
159-
std::vector<T> data;
160-
};
161-
16261
// general sampler context
16362
// TODO: move to llama.h
16463
struct llama_sampling_context {
16564
// parameters that will be used for sampling
16665
gpt_sampling_params params;
16766

168-
// mirostat sampler state
169-
float mirostat_mu;
170-
17167
llama_sampling * smpl;
17268

173-
ring_buffer<llama_token> prev;
17469
std::vector<llama_token_data> cur;
70+
std::vector<llama_token_data> org;
17571

17672
size_t n_valid; // Number of correct top tokens with correct probabilities.
17773
};
@@ -189,10 +85,10 @@ void llama_sampling_reset(llama_sampling_context * ctx);
18985
// Copy the sampler context
19086
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
19187

192-
// Get the last sampled token
88+
// Get the last accepted token
19389
llama_token llama_sampling_last(llama_sampling_context * ctx);
19490

195-
// Get a string representation of the last sampled tokens
91+
// Get a string representation of the last accepted tokens
19692
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
19793

19894
// Print sampling parameters into a string
@@ -206,6 +102,13 @@ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
206102
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
207103
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
208104

105+
// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
106+
llama_token_data_array llama_sampling_prepare(
107+
struct llama_sampling_context * ctx_sampling,
108+
struct llama_context * ctx_main,
109+
struct llama_context * ctx_cfg,
110+
int idx = 0);
111+
209112
// this is a common sampling function used across the examples for convenience
210113
// it can serve as a starting point for implementing your own sampling function
211114
// Note: When using multiple sequences, it is the caller's responsibility to call
@@ -223,20 +126,15 @@ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::strin
223126
// - token: sampled token
224127
// - candidates: vector of candidate tokens
225128
//
226-
llama_token llama_sampling_sample(
227-
struct llama_sampling_context * ctx_sampling,
228-
struct llama_context * ctx_main,
229-
struct llama_context * ctx_cfg,
230-
int idx = -1);
129+
//llama_token llama_sampling_sample(
130+
// struct llama_sampling_context * ctx_sampling,
131+
// struct llama_token_data_array * cur_p);
231132

232-
// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
233-
llama_token_data_array llama_sampling_prepare(
133+
llama_token llama_sampling_sample(
234134
struct llama_sampling_context * ctx_sampling,
235135
struct llama_context * ctx_main,
236136
struct llama_context * ctx_cfg,
237-
int idx = 0,
238-
bool apply_grammar = true,
239-
std::vector<float> * original_logits = nullptr);
137+
int idx = 0);
240138

241139
void llama_sampling_accept(
242140
struct llama_sampling_context * ctx_sampling,

examples/batched.swift/Sources/main.swift

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,12 @@ defer {
5050
llama_free(context)
5151
}
5252

53-
let smpl = llama_sampling_init(model, llama_sampling_default_params())
53+
var sparams = llama_sampling_params()
54+
sparams.top_k = 40
55+
sparams.top_p = 0.9
56+
sparams.temp = 0.4
57+
58+
let smpl = llama_sampling_init(model, sparams)
5459
guard smpl != nil else {
5560
print("Failed to initialize sampling")
5661
exit(1)
@@ -146,13 +151,9 @@ while n_cur <= n_len {
146151
sorted: false
147152
)
148153

149-
let top_k: Int32 = 40
150-
let top_p: Float = 0.9
151-
let temp: Float = 0.4
152-
153-
llama_sampling_top_k(smpl, &candidates_p, top_k, 1)
154-
llama_sampling_top_p(smpl, &candidates_p, top_p, 1)
155-
llama_sampling_temp(smpl, &candidates_p, temp)
154+
llama_sampling_top_k(smpl, &candidates_p)
155+
llama_sampling_top_p(smpl, &candidates_p)
156+
llama_sampling_temp (smpl, &candidates_p)
156157

157158
let new_token_id = llama_sampling_sample(smpl, &candidates_p)
158159

examples/batched/batched.cpp

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,13 @@ int main(int argc, char ** argv) {
6464
ctx_params.n_batch = std::max(n_predict, n_parallel);
6565

6666
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
67-
llama_sampling * smpl = llama_sampling_init(model, llama_sampling_default_params());
67+
68+
auto sparams = llama_sampling_default_params();
69+
sparams.top_k = 40;
70+
sparams.top_p = 0.9f;
71+
sparams.temp = 0.4f;
72+
73+
llama_sampling * smpl = llama_sampling_init(model, sparams);
6874

6975
if (ctx == NULL) {
7076
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@@ -177,13 +183,9 @@ int main(int argc, char ** argv) {
177183

178184
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
179185

180-
const int top_k = 40;
181-
const float top_p = 0.9f;
182-
const float temp = 0.4f;
183-
184-
llama_sampling_top_k(smpl, &candidates_p, top_k, 1);
185-
llama_sampling_top_p(smpl, &candidates_p, top_p, 1);
186-
llama_sampling_temp (smpl, &candidates_p, temp);
186+
llama_sampling_top_k(smpl, &candidates_p);
187+
llama_sampling_top_p(smpl, &candidates_p);
188+
llama_sampling_temp (smpl, &candidates_p);
187189

188190
const llama_token new_token_id = llama_sampling_sample(smpl, &candidates_p);
189191

examples/server/server.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2360,9 +2360,10 @@ struct server_context {
23602360
const size_t n_valid = slot.ctx_sampling->n_valid;
23612361

23622362
// Make sure at least n_probs top tokens are at the front of the vector:
2363-
if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
2364-
llama_sampling_top_k(slot.ctx_sampling->smpl, &cur_p, n_probs, 0);
2365-
}
2363+
// TODO: decide to how to handle this after the refactoring
2364+
//if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
2365+
// llama_sampling_top_k(slot.ctx_sampling->smpl, &cur_p, n_probs, 0);
2366+
//}
23662367

23672368
if (slot.sparams.temp == 0.0f) {
23682369
// With greedy sampling the probabilities have possibly not been calculated.

examples/speculative/speculative.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,6 @@ int main(int argc, char ** argv) {
181181
// draft sequence data
182182
std::vector<seq_draft> drafts(n_seq_dft);
183183

184-
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
185184
if (params.sparams.temp == 0) {
186185
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
187186
}
@@ -231,7 +230,8 @@ int main(int argc, char ** argv) {
231230
if (params.sparams.temp > 0) {
232231
// stochastic verification
233232

234-
llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL);
233+
llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
234+
llama_sampling_grammar(ctx_sampling->smpl, &dist_tgt);
235235
llama_sampling_softmax(ctx_sampling->smpl, &dist_tgt);
236236

237237
float p_tgt = 0.0f;

0 commit comments

Comments (0)