@@ -2643,6 +2643,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
         const std::vector<llama_chat_msg> & msgs,
         bool add_ass) {
     int alloc_size = 0;
+    bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
     for (auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
@@ -2655,10 +2656,26 @@ std::string llama_chat_apply_template(const struct llama_model * model,
     // run the first time to get the total output length
     int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
+    // error: chat template is not supported
+    if (res < 0) {
+        if (ptr_tmpl != nullptr) {
+            // if the custom "tmpl" is not supported, we throw an error
+            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+            throw std::runtime_error("this custom template is not supported");
+        } else {
+            // If the built-in template is not supported, we default to chatml
+            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+            fallback = true;
+        }
+    }
+
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        res = llama_chat_apply_template(
+            fallback ? nullptr : model,
+            fallback ? "chatml" : ptr_tmpl,
+            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
 
     std::string formatted_chat(buf.data(), res);
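
For reference, a minimal caller-side sketch of how this wrapper is used after the change. The helper name build_prompt and the message contents are illustrative only, and it assumes (as in the surrounding wrapper code not shown in this hunk) that an empty tmpl string maps to a null ptr_tmpl, i.e. "use the model's built-in template":

#include "common.h"   // declares llama_chat_msg and this wrapper in llama.cpp's common library

#include <stdexcept>
#include <string>
#include <vector>

// Illustrative helper, not part of the patch.
static std::string build_prompt(const struct llama_model * model, const std::string & custom_tmpl) {
    std::vector<llama_chat_msg> msgs = {
        {"system", "You are a helpful assistant."},
        {"user",   "Hello!"},
    };
    try {
        // With an empty custom_tmpl the model's built-in template is used; if that
        // template is unsupported, the wrapper now silently falls back to chatml
        // instead of failing.
        return llama_chat_apply_template(model, custom_tmpl, msgs, /* add_ass */ true);
    } catch (const std::runtime_error & err) {
        // Reached only when a non-empty custom_tmpl was passed and rejected:
        // the wrapper throws rather than guessing a format for caller-supplied input.
        return "";
    }
}

The asymmetry is deliberate: an unsupported custom template is a caller error worth surfacing, while a built-in template the library cannot parse should not make chat unusable, hence the chatml fallback tracked by the fallback flag for the second (resized) call.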