
Commit 7acec5c

ngxson authored and mglambda committed
server : clean up built-in template detection (ggml-org#11026)
* server : clean up built-in template detection
* fix compilation
* add chat template test
* fix condition
1 parent 0eee251 commit 7acec5c

File tree

6 files changed: +44, -27 lines


common/common.cpp

Lines changed: 12 additions & 0 deletions
@@ -1614,6 +1614,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //

+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    static const char * template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+    if (res > 0) {
+        std::vector<char> model_template(res + 1, 0);
+        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size() - 1);
+    }
+    return "";
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
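Note: the new helper follows the usual two-pass pattern for llama_model_meta_val_str(): a first call with a NULL buffer reports the required string length, and a second call fills an allocated buffer. A minimal caller sketch, assuming a model already loaded elsewhere (only common_get_builtin_chat_template() itself comes from this commit):

// Sketch (not part of the commit): printing a model's built-in chat template
// via the new helper. Assumes `model` was loaded beforehand, e.g. with
// llama_load_model_from_file().
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <string>

static void print_builtin_chat_template(const struct llama_model * model) {
    const std::string tmpl = common_get_builtin_chat_template(model);
    if (tmpl.empty()) {
        // no tokenizer.chat_template metadata in this model
        printf("model has no built-in chat template\n");
    } else {
        printf("built-in chat template (%zu bytes):\n%s\n", tmpl.size(), tmpl.c_str());
    }
}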

common/common.h

Lines changed: 3 additions & 0 deletions
@@ -571,6 +571,9 @@ struct common_chat_msg {
     std::string content;
 };

+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);


examples/server/server.cpp

Lines changed: 9 additions & 14 deletions
@@ -1724,17 +1724,10 @@ struct server_context {
         return true;
     }

-    bool validate_model_chat_template() const {
-        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
-        std::string template_key = "tokenizer.chat_template";
-        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        if (res >= 0) {
-            llama_chat_message chat[] = {{"user", "test"}};
-            std::string tmpl = std::string(model_template.data(), model_template.size());
-            int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
-            return chat_res > 0;
-        }
-        return false;
+    bool validate_builtin_chat_template() const {
+        llama_chat_message chat[] = {{"user", "test"}};
+        int32_t chat_res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
+        return chat_res > 0;
     }

     void init() {
@@ -3583,7 +3576,7 @@ int main(int argc, char ** argv) {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots", ctx_server.params_base.n_parallel },
             { "model_path", ctx_server.params_base.model },
-            { "chat_template", llama_get_chat_template(ctx_server.model) },
+            { "chat_template", common_get_builtin_chat_template(ctx_server.model) },
            { "build_info", build_info },
         };

@@ -4223,14 +4216,16 @@

     // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
     if (params.chat_template.empty()) {
-        if (!ctx_server.validate_model_chat_template()) {
+        if (!ctx_server.validate_builtin_chat_template()) {
             LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
             params.chat_template = "chatml";
         }
     }

     // print sample chat example to make it clear which template is used
-    LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str());
+    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
+        params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
+        common_chat_format_example(ctx_server.model, params.chat_template).c_str());

     ctx_server.queue_tasks.on_new_task(std::bind(
         &server_context::process_single_task, &ctx_server, std::placeholders::_1));
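Note: the simplified validate_builtin_chat_template() relies on a llama_chat_apply_template() convention: passing nullptr as the template selects the model's embedded tokenizer.chat_template, and a positive return value is the length of the formatted prompt. A rough sketch of formatting a real prompt the same way, with buffer sizing; the helper name is illustrative and the messages mirror the chat-template test added below:

// Sketch (not part of the commit): formatting a prompt with the model's
// built-in template via the same llama_chat_apply_template() call the
// validator uses. `model` is assumed to be an already-loaded llama_model *.
#include "llama.h"

#include <cstdint>
#include <string>
#include <vector>

static std::string format_with_builtin_template(const struct llama_model * model) {
    llama_chat_message chat[] = {
        {"system", "Book"},
        {"user",   "What is the best book"},
    };
    std::vector<char> buf(1024);
    // nullptr template -> use the model's tokenizer.chat_template
    int32_t res = llama_chat_apply_template(model, nullptr, chat, 2, true, buf.data(), buf.size());
    if (res < 0) {
        return ""; // built-in template missing or not supported
    }
    if ((size_t) res > buf.size()) {
        buf.resize(res); // return value is the required length; retry with a larger buffer
        res = llama_chat_apply_template(model, nullptr, chat, 2, true, buf.data(), buf.size());
    }
    return std::string(buf.data(), res);
}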

examples/server/tests/unit/test_chat_completion.py

Lines changed: 17 additions & 0 deletions
@@ -100,6 +100,23 @@ def test_chat_completion_with_openai_library():
     assert match_regex("(Suddenly)+", res.choices[0].message.content)


+def test_chat_template():
+    global server
+    server.chat_template = "llama3"
+    server.debug = True  # to get the "__verbose" object in the response
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": 8,
+        "messages": [
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+        ]
+    })
+    assert res.status_code == 200
+    assert "__verbose" in res.body
+    assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+
 @pytest.mark.parametrize("response_format,n_predicted,re_content", [
     ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
     ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),

examples/server/tests/utils.py

Lines changed: 3 additions & 0 deletions
@@ -74,6 +74,7 @@ class ServerProcess:
     draft_min: int | None = None
     draft_max: int | None = None
     no_webui: bool | None = None
+    chat_template: str | None = None

     # session variables
     process: subprocess.Popen | None = None
@@ -164,6 +165,8 @@ def start(self, timeout_seconds: int = 10) -> None:
            server_args.extend(["--draft-min", self.draft_min])
         if self.no_webui:
             server_args.append("--no-webui")
+        if self.chat_template:
+            server_args.extend(["--chat-template", self.chat_template])

         args = [str(arg) for arg in [server_path, *server_args]]
         print(f"bench: starting server with: {' '.join(args)}")

examples/server/utils.hpp

Lines changed: 0 additions & 13 deletions
@@ -382,19 +382,6 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
     return formatted_chat;
 }

-static std::string llama_get_chat_template(const struct llama_model * model) {
-    std::string template_key = "tokenizer.chat_template";
-    // call with NULL buffer to get the total size of the string
-    int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
-    if (res < 2) {
-        return "";
-    } else {
-        std::vector<char> model_template(res + 1, 0);
-        llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
-        return std::string(model_template.data(), model_template.size() - 1);
-    }
-}
-
 //
 // base64 utils (TODO: move to common in the future)
 //
