Skip to content

Commit b200d49

Browse files
ochafik authored and ggerganov committed
tool-call: fix llama 3.x and functionary 3.2, play nice w/ pydantic_ai package, update readme (ggml-org#11539)
* An empty tool_call_id is better than none! * sync: minja (tool call name optional google/minja#36) * Force-disable parallel_tool_calls if template doesn't support it * More debug logs * Llama 3.x tools: accept / trigger on more varied spaced outputs * Fix empty content for functionary v3.2 tool call * Add proper tool call docs to server README * readme: function calling *is* supported now * Apply suggestions from code review Co-authored-by: Georgi Gerganov <[email protected]> --------- Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 0f92342 commit b200d49

File tree

5 files changed

+130
-16
lines changed

5 files changed

+130
-16
lines changed

common/chat-template.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,10 +283,12 @@ class chat_template {
283283
message["role"] = "user";
284284
auto obj = json {
285285
{"tool_response", {
286-
{"tool", message.at("name")},
287286
{"content", message.at("content")},
288287
}},
289288
};
289+
if (message.contains("name")) {
290+
obj["tool_response"]["name"] = message.at("name");
291+
}
290292
if (message.contains("tool_call_id")) {
291293
obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
292294
}

common/chat.cpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -384,14 +384,19 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
384384
tool_rules.push_back(
385385
builder.add_rule(
386386
name + "-call",
387-
"\"{\" ( \"\\\"type\\\": \\\"function\\\", \" | space ) "
387+
"\"{\" space "
388+
"( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? "
388389
"\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
389390
builder.add_schema(name + "-args", parameters) +
390391
" \"}\""));
391392
data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
392393
});
393394
data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
395+
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
396+
data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true});
394397
data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
398+
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
399+
data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true});
395400
if (!builtin_tools.empty()) {
396401
data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
397402
}
@@ -586,9 +591,17 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in
586591
}
587592
}
588593
// TODO: tighten & simplify.
589-
auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
590-
res.content = content;
591-
return res;
594+
try {
595+
auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
596+
res.content = content + res.content;
597+
return res;
598+
} catch (const std::exception & e) {
599+
LOG_ERR("Failed to parse functionary v3.2 input: %s\n", e.what());
600+
common_chat_msg res;
601+
res.role = "assistant";
602+
res.content = input;
603+
return res;
604+
}
592605
}
593606

594607
static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {

examples/server/README.md

Lines changed: 103 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ The project is under active development, and we are [looking for feedback and co
126126
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
127127
| `--grammar-file FNAME` | file to read grammar from |
128128
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
129-
| `--jinja` | Enable experimental Jinja templating engine (needed for tool use) |
129+
| `--jinja` | Enable experimental Jinja templating engine (required for tool use) |
130130

131131
**Example-specific params**
132132

@@ -1069,7 +1069,7 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
10691069

10701070
*Options:*
10711071

1072-
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
1072+
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). llama.cpp `/completion`-specific features such as `mirostat` are also supported.
10731073

10741074
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type": "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
10751075

@@ -1117,17 +1117,111 @@ curl http://localhost:8080/v1/chat/completions \
11171117
}'
11181118
```
11191119

1120-
... and even tool usage (needs `--jinja` flag):
1120+
*Tool call support*
11211121

1122-
```shell
1123-
llama-server --jinja -hfr lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF -hff Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf -fa
1122+
[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggerganov/llama.cpp/pull/9639):
1123+
1124+
- Requires `--jinja` flag
1125+
- Native tool call formats supported:
1126+
- Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
1127+
- Functionary v3.1 / v3.2
1128+
- Hermes 2/3, Qwen 2.5
1129+
- Mistral Nemo
1130+
- Firefunction v2
1131+
- DeepSeek R1 (WIP / seems reluctant to call any tools?)
1132+
1133+
<details>
1134+
<summary>Show some common templates and which format handler they use</summary>
1135+
1136+
| Template | Format |
1137+
|----------|--------|
1138+
| CohereForAI-c4ai-command-r-plus-default.jinja | generic tool calls |
1139+
| CohereForAI-c4ai-command-r-plus-rag.jinja | generic tool calls |
1140+
| CohereForAI-c4ai-command-r-plus-tool_use.jinja | generic tool calls |
1141+
| MiniMaxAI-MiniMax-Text-01.jinja | generic tool calls |
1142+
| NexaAIDev-Octopus-v2.jinja | generic tool calls |
1143+
| NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | generic tool calls |
1144+
| NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | hermes 2 pro tool calls |
1145+
| NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | generic tool calls |
1146+
| NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | hermes 2 pro tool calls |
1147+
| NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | generic tool calls |
1148+
| NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | hermes 2 pro tool calls |
1149+
| OrionStarAI-Orion-14B-Chat.jinja | generic tool calls |
1150+
| Qwen-QwQ-32B-Preview.jinja | hermes 2 pro tool calls |
1151+
| Qwen-Qwen2-7B-Instruct.jinja | generic tool calls |
1152+
| Qwen-Qwen2-VL-7B-Instruct.jinja | generic tool calls |
1153+
| Qwen-Qwen2.5-7B-Instruct.jinja | hermes 2 pro tool calls |
1154+
| Qwen-Qwen2.5-Math-7B-Instruct.jinja | hermes 2 pro tool calls |
1155+
| TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | generic tool calls |
1156+
| abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | generic tool calls |
1157+
| bofenghuang-vigogne-2-70b-chat.jinja | generic tool calls |
1158+
| databricks-dbrx-instruct.jinja | generic tool calls |
1159+
| deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | generic tool calls |
1160+
| deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | deepseek r1 tool calls |
1161+
| deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | deepseek r1 tool calls |
1162+
| deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | deepseek r1 tool calls |
1163+
| deepseek-ai-DeepSeek-V2.5.jinja | deepseek r1 tool calls |
1164+
| deepseek-ai-deepseek-coder-33b-instruct.jinja | generic tool calls |
1165+
| google-gemma-2-2b-it.jinja | generic tool calls |
1166+
| google-gemma-7b-it.jinja | generic tool calls |
1167+
| indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | generic tool calls |
1168+
| mattshumer-Reflection-Llama-3.1-70B.jinja | generic tool calls |
1169+
| meetkai-functionary-medium-v3.2.jinja | functionary v3.2 tool calls |
1170+
| meta-llama-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
1171+
| meta-llama-Llama-3.2-3B-Instruct.jinja | llama 3.x tool calls |
1172+
| meta-llama-Llama-3.3-70B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
1173+
| meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | llama 3.x tool calls (w/ builtin tools) |
1174+
| microsoft-Phi-3-medium-4k-instruct.jinja | generic tool calls |
1175+
| microsoft-Phi-3-mini-4k-instruct.jinja | generic tool calls |
1176+
| microsoft-Phi-3-small-8k-instruct.jinja | generic tool calls |
1177+
| microsoft-Phi-3.5-mini-instruct.jinja | generic tool calls |
1178+
| microsoft-Phi-3.5-vision-instruct.jinja | generic tool calls |
1179+
| mistralai-Mistral-7B-Instruct-v0.2.jinja | generic tool calls |
1180+
| mistralai-Mistral-Large-Instruct-2407.jinja | mistral nemo tool calls |
1181+
| mistralai-Mistral-Large-Instruct-2411.jinja | generic tool calls |
1182+
| mistralai-Mistral-Nemo-Instruct-2407.jinja | mistral nemo tool calls |
1183+
| mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | generic tool calls |
1184+
| mlabonne-AlphaMonarch-7B.jinja | generic tool calls |
1185+
| nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | llama 3.x tool calls (w/ builtin tools) |
1186+
| openchat-openchat-3.5-0106.jinja | generic tool calls |
1187+
| teknium-OpenHermes-2.5-Mistral-7B.jinja | generic tool calls |
1188+
1189+
This table can be generated with:
11241190

1125-
# https://huggingface.co/meetkai/functionary-medium-v3.2
1126-
llama-server --jinja -hfr bartowski/functionary-medium-v3.2-GGUF -hff functionary-medium-v3.2-IQ4_XS.gguf -fa
1191+
```bash
1192+
./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
```
1193+
1194+
</details>
11271195

1128-
# https://huggingface.co/meetkai/functionary-medium-v3.1
1129-
llama-server --jinja -hfr meetkai/functionary-medium-v3.1-GGUF -hff functionary-medium-llama-3.1.Q4_0.gguf -fa
1196+
- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs).
1197+
- Use `--chat-template-file` to override the template when appropriate (see examples below)
1198+
- Generic support may consume more tokens and be less efficient than a model's native format.
11301199
1200+
- Run with:
1201+
1202+
```shell
1203+
# Native support:
1204+
llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
1205+
llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M
1206+
llama-server --jinja -fa -hf bartowski/Llama-3.2-3B-Instruct-GGUF:Q6_K
1207+
llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
1208+
llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
1209+
--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B )
1210+
1211+
# Native support requires the right template for these GGUFs:
1212+
llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
1213+
--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use )
1214+
llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
1215+
--chat-template-file <( python scripts/get_chat_template.py fireworks-ai/firellama-3-firefunction-v2 )
1216+
1217+
# Generic format support
1218+
llama-server --jinja -fa -hf bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M
1219+
llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q4_K_M
1220+
```
1221+
1222+
- Test in CLI:
1223+
1224+
```bash
11311225
curl http://localhost:8080/v1/chat/completions -d '{
11321226
"model": "gpt-3.5-turbo",
11331227
"tools": [

examples/server/server.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ struct server_task {
345345
auto it = data.find("chat_format");
346346
if (it != data.end()) {
347347
params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
348-
LOG_DBG("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
348+
LOG_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
349349
} else {
350350
params.oaicompat_chat_format = defaults.oaicompat_chat_format;
351351
}
@@ -697,6 +697,7 @@ struct server_task_result_cmpl_final : server_task_result {
697697
std::string finish_reason = "length";
698698
common_chat_msg message;
699699
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
700+
LOG_DBG("Parsing chat message: %s\n", content.c_str());
700701
message = common_chat_parse(content, oaicompat_chat_format);
701702
finish_reason = message.tool_calls.empty() ? "stop" : "tool_calls";
702703
} else {
@@ -713,7 +714,7 @@ struct server_task_result_cmpl_final : server_task_result {
713714
{"name", tc.name},
714715
{"arguments", tc.arguments},
715716
}},
716-
{"id", tc.id.empty() ? json() : json(tc.id)},
717+
{"id", tc.id},
717718
});
718719
}
719720
}

examples/server/utils.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,10 @@ static json oaicompat_completion_params_parse(
641641
inputs.tools = tools;
642642
inputs.tool_choice = tool_choice;
643643
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
644+
if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
645+
LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
646+
inputs.parallel_tool_calls = false;
647+
}
644648
inputs.stream = stream;
645649
// TODO: support mixing schema w/ tools beyond generic format.
646650
inputs.json_schema = json_value(llama_params, "json_schema", json());

0 commit comments

Comments
 (0)