ggml-org
diff --git a/‎CMakeLists.txt
Lines changed: 1 addition & 1 deletion b/‎CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/‎Makefile
Lines changed: 1 addition & 1 deletion b/‎Makefile
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/common.cpp
Lines changed: 28 additions & 0 deletions b/‎examples/common.cpp
Lines changed: 28 additions & 0 deletions
diff --git a/‎examples/main/main.cpp
Lines changed: 54 additions & 8 deletions b/‎examples/main/main.cpp
Lines changed: 54 additions & 8 deletions
@@ -76,7 +76,7 @@ option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
 # Compile flags
 #
 
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 
@@ -35,7 +35,7 @@ endif
 
 # keep standard at C11 and C++11
 CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++20 -fPIC
 LDFLAGS  =
 
 # warnings
 
@@ -114,6 +114,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.temp = std::stof(argv[i]);
+        } else if (arg == "--tfs") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.tfs_z = std::stof(argv[i]);
+        } else if (arg == "--typical") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.typical_p = std::stof(argv[i]);
         } else if (arg == "--repeat_last_n") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -126,6 +138,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.repeat_penalty = std::stof(argv[i]);
+        } else if (arg == "--alpha_frequency") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.alpha_frequency = std::stof(argv[i]);
+        } else if (arg == "--alpha_presence") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.alpha_presence = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch_size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -242,6 +266,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
     fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
     fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", (double)params.top_p);
+    fprintf(stderr, "  --tfs N               tail free sampling (default: %.1f)\n", (double)params.tfs_z);
+    fprintf(stderr, "  --typical N           locally typical sampling (default: %.1f)\n", (double)params.typical_p);
+    fprintf(stderr, "  --alpha_presence N    repeat alpha presence (default: %d)\n", params.alpha_presence);
+    fprintf(stderr, "  --alpha_frequency N   repeat alpha frequency (default: %.1f)\n", (double)params.alpha_frequency);
     fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
     fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
     fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
 
@@ -276,8 +276,8 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
         }
     }
-    fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
-        params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, alpha_presence = %f, alpha_frequency = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f\n",
+        params.repeat_last_n, params.repeat_penalty, params.alpha_presence, params.alpha_frequency, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp);
     fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     fprintf(stderr, "\n\n");
 
@@ -387,10 +387,15 @@ int main(int argc, char ** argv) {
 
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
             // out of user input, sample next token
-            const int32_t top_k          = params.top_k;
-            const float   top_p          = params.top_p;
             const float   temp           = params.temp;
+            const int32_t top_k          = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+            const float   top_p          = params.top_p;
+            const float   tfs_z          = params.tfs_z;
+            const float   typical_p      = params.typical_p;
+            const int32_t repeat_last_n  = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
             const float   repeat_penalty = params.repeat_penalty;
+            const float   alpha_presence = params.alpha_presence;
+            const float   alpha_frequency = params.alpha_frequency;
 
             // optionally save the session on first sample (for faster prompt loading next time)
             if (!path_session.empty() && need_to_save_session) {
@@ -402,14 +407,55 @@ int main(int argc, char ** argv) {
 
             {
                 auto logits = llama_get_logits(ctx);
+                auto n_vocab = llama_n_vocab(ctx);
 
                 if (params.ignore_eos) {
-                    logits[llama_token_eos()] = 0;
+                    logits[llama_token_eos()] = -INFINITY;
+                }
+
+                std::vector<llama_token_data> candidates;
+                candidates.reserve(n_vocab);
+                for (size_t i = 0; i < n_vocab; i++) {
+                    candidates.emplace_back(i, logits[i], 0.0f);
                 }
 
-                id = llama_sample_top_p_top_k(ctx,
-                        last_n_tokens.data() + n_ctx - params.repeat_last_n,
-                        params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
+                llama_token_data_array candidates_p = { candidates.data(), candidates.size() };
+
+                // Apply penalties
+                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
+                llama_sample_repetition_penalty(&candidates_p,
+                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+                    last_n_repeat, repeat_penalty);
+                llama_sample_frequency_and_presence_penalties(&candidates_p,
+                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+                    last_n_repeat, alpha_frequency, alpha_presence);
+
+
+#if 1
+                if (temp <= 0) {
+                    // Greedy sampling
+                    id = llama_sample_token_greedy(ctx, &candidates_p);
+                } else {
+                    // Temperature sampling
+                    llama_sample_top_k(&candidates_p, top_k);
+                    llama_sample_tail_free(&candidates_p, tfs_z);
+                    llama_sample_typical(&candidates_p, typical_p);
+                    llama_sample_top_p(&candidates_p, top_p);
+
+                    llama_sample_temperature(&candidates_p, temp);
+                    // printf("`%d`", candidates_p.size);
+                    id = llama_sample_token(ctx, &candidates_p);
+                }
+#else
+                const float tau = 5.0f;
+                static float mu = 2.0f * tau;
+                static int k = 40;
+                const float eta = 0.1f;
+                const int m = 100;
+                const float N = n_vocab;
+                id = llama_sample_mirostat(ctx, &candidates_p, tau, eta, m, N, &k, &mu);
+                // id = llama_sample_mirostat_v2(ctx, &candidates_p, tau, eta, &mu);
+#endif
 
                 last_n_tokens.erase(last_n_tokens.begin());
                 last_n_tokens.push_back(id);
Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})`
`76`	`76`	`# Compile flags`
`77`	`77`	`#`
`78`	`78`
`79`		`-set(CMAKE_CXX_STANDARD 11)`
	`79`	`+set(CMAKE_CXX_STANDARD 20)`
`80`	`80`	`set(CMAKE_CXX_STANDARD_REQUIRED true)`
`81`	`81`	`set(CMAKE_C_STANDARD 11)`
`82`	`82`	`set(CMAKE_C_STANDARD_REQUIRED true)`