Commit 65e5f6d

Fix OpenAI server sampling w.r.t. temp and seed (#4668)
The default values for tfs_z and typical_p were being set to zero, which caused the token candidates array to get shrunk down to one element, thus preventing any sampling. Note this only applies to OpenAI API compatible HTTP server requests.

The solution is to use the default values that OpenAI documents, as well as ensuring we use the llama.cpp defaults for the rest. I've tested that this change still ensures deterministic output by default. If a "temperature" greater than 0 is explicitly passed, then output is unique each time. If "seed" is specified in addition to "temperature", then the output becomes deterministic once more.

See Mozilla-Ocho/llamafile#117
See Mozilla-Ocho/llamafile@9e4bf29
1 parent ea5497d commit 65e5f6d
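
A note on why a 0.0 default is so destructive: truncation samplers such as tail-free sampling (tfs_z) and locally-typical sampling (typical_p) keep the most probable tokens until a configured threshold is covered, never fewer than one. The snippet below is an illustration only, not llama.cpp's sampler code; truncate_by_mass is a made-up helper that mimics a top-p-style cutoff. With a threshold of 0.0 the first token already satisfies the cutoff, so the candidate pool collapses to a single token and generation becomes effectively greedy regardless of the requested "temperature".

// Illustration only: a simplified top-p-style cutoff, not llama.cpp's sampler.
// Shows how a probability-mass threshold of 0.0 keeps exactly one candidate.
#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// Keep the most probable tokens until their cumulative probability reaches p,
// but never fewer than min_keep tokens; return how many survive.
static size_t truncate_by_mass(std::vector<float> probs, float p, size_t min_keep = 1) {
    std::sort(probs.begin(), probs.end(), std::greater<float>());
    float cum = 0.0f;
    for (size_t i = 0; i < probs.size(); ++i) {
        cum += probs[i];
        if (cum >= p && i + 1 >= min_keep) {
            return i + 1;
        }
    }
    return probs.size();
}

int main() {
    std::vector<float> probs = {0.40f, 0.25f, 0.20f, 0.10f, 0.05f};
    printf("cutoff 0.90 keeps %zu candidates\n", truncate_by_mass(probs, 0.90f)); // 4
    printf("cutoff 0.00 keeps %zu candidates\n", truncate_by_mass(probs, 0.00f)); // 1, i.e. effectively greedy
    return 0;
}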


examples/server/server.cpp

Lines changed: 19 additions & 12 deletions
@@ -441,7 +441,6 @@ struct llama_client_slot
         }

         images.clear();
-        // llama_set_rng_seed(ctx, params.seed); in batched the seed matter???????
     }

     bool has_budget(gpt_params &global_params) {
@@ -921,6 +920,7 @@ struct llama_server_context
             llama_sampling_free(slot->ctx_sampling);
         }
         slot->ctx_sampling = llama_sampling_init(slot->sparams);
+        llama_set_rng_seed(ctx, slot->params.seed);
         slot->command = LOAD_PROMPT;

         all_slots_are_idle = false;
@@ -1215,7 +1215,7 @@ struct llama_server_context
             {"n_ctx",             slot.n_ctx},
             {"model",             params.model_alias},
             {"seed",              slot.params.seed},
-            {"temp",              slot.sparams.temp},
+            {"temperature",       slot.sparams.temp},
             {"top_k",             slot.sparams.top_k},
             {"top_p",             slot.sparams.top_p},
             {"min_p",             slot.sparams.min_p},
@@ -2437,26 +2437,33 @@ json oaicompat_completion_params_parse(
     llama_params["__oaicompat"] = true;

     // Map OpenAI parameters to llama.cpp parameters
+    //
+    // For parameters that are defined by the OpenAI documentation (e.g.
+    // temperature), we explicitly specify OpenAI's intended default; we
+    // need to do that because sometimes OpenAI disagrees with llama.cpp
+    //
+    // https://platform.openai.com/docs/api-reference/chat/create
+    llama_sampling_params default_sparams;
     llama_params["model"]             = json_value(body, "model", std::string("uknown"));
     llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
     llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
-    llama_params["temperature"]       = json_value(body, "temperature", 0.8);
-    llama_params["top_k"]             = json_value(body, "top_k", 40);
-    llama_params["top_p"]             = json_value(body, "top_p", 0.95);
+    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
+    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
+    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
     llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
     llama_params["logit_bias"]        = json_value(body, "logit_bias", json::object());
     llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
     llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"]              = json_value(body, "seed", 0);
+    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
     llama_params["stream"]            = json_value(body, "stream", false);
-    llama_params["mirostat"]          = json_value(body, "mirostat", false);
-    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", 0.0);
-    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", 0.0);
-    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", false);
-    llama_params["typical_p"]         = json_value(body, "typical_p", 0.0);
+    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
+    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
+    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
+    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
+    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
     llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", 0);
     llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
-    llama_params["tfs_z"]             = json_value(body, "tfs_z", 0.0);
+    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);

     if (body.count("grammar") != 0) {
         llama_params["grammar"] = json_value(body, "grammar", json::object());
