Commit c5734f1

cont : drop "penalty prompt" support (#3727)
ggml-ci
1 parent: e08100c

6 files changed, +4 -61 lines


common/sampling.cpp

Lines changed: 1 addition & 1 deletion
@@ -354,7 +354,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };
 
     // apply penalties
-    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
+    const auto & penalty_tokens = prev;
     const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
     if (penalty_tokens_used_size) {
         const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
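
With the penalty prompt gone, repetition penalties are always computed from the sampler's own history (`prev`). A minimal sketch of that idea, with illustrative names rather than the repository's API:

```cpp
#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

using token_t = int32_t; // stand-in for llama_token

// Count how often each token occurs in the last `last_n` entries of the
// generation history; frequency/presence/repeat penalties are derived from
// counts like these.
std::unordered_map<token_t, int> penalty_counts(const std::vector<token_t> & prev, int last_n) {
    const int used = std::max(0, std::min((int) prev.size(), last_n));
    std::unordered_map<token_t, int> counts;
    for (size_t i = prev.size() - used; i < prev.size(); ++i) {
        counts[prev[i]]++;
    }
    return counts;
}
```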

common/sampling.h

Lines changed: 0 additions & 3 deletions
@@ -56,9 +56,6 @@ typedef struct gpt_sampling_params {
     float cfg_scale = 1.f; // how strong is guidance
 
     std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
-    std::vector<llama_token> penalty_prompt_tokens;
-    bool use_penalty_prompt_tokens = false;
 } gpt_sampling_params;
 
 // general sampler context

examples/server/README.md

Lines changed: 1 addition & 5 deletions
@@ -424,8 +424,6 @@ node index.js
 
 `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
 
-`penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`.
-
 `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
 
 `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
@@ -672,7 +670,6 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
             "stopping_word": ""
         },
         "penalize_nl": true,
-        "penalty_prompt_tokens": [],
         "presence_penalty": 0.0,
         "prompt": "Say hello to llama.cpp",
         "repeat_last_n": 64,
@@ -696,8 +693,7 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
         "tfs_z": 1.0,
         "top_k": 40,
         "top_p": 0.949999988079071,
-        "typical_p": 1.0,
-        "use_penalty_prompt_tokens": false
+        "typical_p": 1.0
     }
 ]
 ```
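
For API consumers the visible change is that a `/completion` request can no longer steer the penalty context independently of the prompt. A hedged sketch of a request body built with nlohmann::json (the JSON library the server example uses); the values are illustrative:

```cpp
#include <nlohmann/json.hpp>

int main() {
    // POST /completion body: "penalty_prompt" is no longer parsed by the server,
    // so repetition penalties always apply to the recently generated tokens.
    nlohmann::json body = {
        {"prompt",            "Say hello to llama.cpp"},
        {"repeat_last_n",     64},
        {"repeat_penalty",    1.1},
        {"frequency_penalty", 0.0},
        {"presence_penalty",  0.0},
        {"mirostat",          0}
    };
    (void) body;
    return 0;
}
```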

examples/server/server.cpp

Lines changed: 0 additions & 52 deletions
@@ -986,51 +986,6 @@ struct server_context {
             }
         }
 
-        // penalize user-provided tokens
-        {
-            slot.sparams.penalty_prompt_tokens.clear();
-            slot.sparams.use_penalty_prompt_tokens = false;
-
-            const auto & penalty_prompt = data.find("penalty_prompt");
-
-            if (penalty_prompt != data.end()) {
-                if (penalty_prompt->is_string()) {
-                    const auto penalty_prompt_string = penalty_prompt->get<std::string>();
-                    slot.sparams.penalty_prompt_tokens = llama_tokenize(model, penalty_prompt_string, false);
-
-                    if (slot.params.n_predict > 0) {
-                        slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict);
-                    }
-                    slot.sparams.use_penalty_prompt_tokens = true;
-
-                    LOG_VERBOSE("penalty_prompt_tokens", {
-                        {"id_slot", slot.id},
-                        {"tokens", slot.sparams.penalty_prompt_tokens},
-                    });
-                }
-                else if (penalty_prompt->is_array()) {
-                    const auto n_tokens = penalty_prompt->size();
-                    slot.sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot.params.n_predict));
-
-                    const int n_vocab = llama_n_vocab(model);
-                    for (const auto & penalty_token : *penalty_prompt) {
-                        if (penalty_token.is_number_integer()) {
-                            const auto tok = penalty_token.get<llama_token>();
-                            if (tok >= 0 && tok < n_vocab) {
-                                slot.sparams.penalty_prompt_tokens.push_back(tok);
-                            }
-                        }
-                    }
-                    slot.sparams.use_penalty_prompt_tokens = true;
-
-                    LOG_VERBOSE("penalty_prompt_tokens", {
-                        {"id_slot", slot.id},
-                        {"tokens", slot.sparams.penalty_prompt_tokens},
-                    });
-                }
-            }
-        }
-
         {
             slot.sparams.logit_bias.clear();
 
@@ -1201,11 +1156,6 @@ struct server_context {
         slot.generated_text += token_str;
         slot.has_next_token = true;
 
-        if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) {
-            // we can change penalty_prompt_tokens because it is always created from scratch each request
-            slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
-        }
-
         // check if there is incomplete UTF-8 character at the end
         bool incomplete = false;
         for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) {
@@ -1346,8 +1296,6 @@ struct server_context {
            {"repeat_penalty", slot.sparams.penalty_repeat},
            {"presence_penalty", slot.sparams.penalty_present},
            {"frequency_penalty", slot.sparams.penalty_freq},
-           {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
-           {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
            {"mirostat", slot.sparams.mirostat},
            {"mirostat_tau", slot.sparams.mirostat_tau},
            {"mirostat_eta", slot.sparams.mirostat_eta},
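
Nothing replaces the per-slot bookkeeping deleted above: once callers cannot supply penalty tokens, mirroring each sampled token into `penalty_prompt_tokens` is redundant, since the sampler keeps its own history. Conceptual sketch only (hypothetical helper name, not the repository's API):

```cpp
#include <cstdint>
#include <vector>

using llama_token = int32_t; // matches the typedef in llama.h

// The sampled token only needs to enter the sampler's history; the server no
// longer duplicates it into a per-slot penalty vector.
void on_token_sampled(std::vector<llama_token> & prev, llama_token tok) {
    if (tok != -1) { // same guard the removed code used
        prev.push_back(tok);
    }
}
```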

include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -386,6 +386,7 @@ extern "C" {
         bool ignore_eos; // ignore the end-of-sequence token
 
         const char * grammar;
+        const char * grammar_root;
 
         int32_t n_logit_bias;
         const llama_logit_bias * logit_bias;

src/llama.cpp

Lines changed: 1 addition & 0 deletions
@@ -16514,6 +16514,7 @@ struct llama_sampling_params llama_sampling_default_params() {
         /*.penalize_nl =*/ false,
         /*.ignore_eos =*/ false,
         /*.grammar =*/ nullptr,
+        /*.grammar_root =*/ nullptr,
         /*.n_logit_bias =*/ 0,
         /*.logit_bias =*/ nullptr,
     };
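
Separately from the penalty-prompt removal, the commit threads a new `grammar_root` field through the public sampling parameters (see the include/llama.h hunk above). A hedged usage sketch, assuming `llama_sampling_default_params()` is declared in llama.h as its definition here suggests; the grammar text and rule name are made up:

```cpp
#include "llama.h"

void configure_grammar_sampling() {
    struct llama_sampling_params sparams = llama_sampling_default_params();

    sparams.grammar      = "root ::= \"yes\" | \"no\""; // illustrative GBNF text
    sparams.grammar_root = "root";                      // new field; defaults to nullptr
}
```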

0 commit comments
