@@ -684,14 +684,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-        const char * lora_adapter = argv[i];
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -1654,6 +1664,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "server",     "-sps,  --slot-prompt-similarity SIMILARITY",
         "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+    options.push_back({ "server",     "       --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled" });
 
 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -2091,17 +2102,22 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
     }
 
-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-        if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
     if (params.ignore_eos) {
@@ -2140,6 +2156,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     return iparams;
 }
 
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
+}
+
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
     auto mparams = llama_model_default_params();
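The point of splitting loading (`llama_lora_adapter_init`, done once per adapter) from applying (`llama_lora_adapters_apply`, repeatable) is that a caller such as the server can re-apply the set with new scales at runtime. A minimal usage sketch under that assumption — `set_adapter_scale` is a hypothetical helper, not part of this change:

```cpp
#include <vector>

// Hypothetical helper: change one loaded adapter's scale and re-apply
// the whole set. Since llama_lora_adapters_apply clears all adapters
// and then skips any with scale == 0.0f, setting a scale to 0.0f
// detaches that adapter without unloading it.
static void set_adapter_scale(struct llama_context * ctx,
                              std::vector<llama_lora_adapter_container> & adapters,
                              size_t idx, float scale) {
    if (idx >= adapters.size()) {
        return;
    }
    adapters[idx].scale = scale;
    llama_lora_adapters_apply(ctx, adapters);
}
```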
@@ -3162,19 +3187,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
 
     fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, "  - %s\n", la.path.c_str());
         }
-        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, "  - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);