@@ -372,8 +372,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.multiline_input = true;
         } else if (arg == "--simple-io") {
             params.simple_io = true;
-        } else if (arg == "--hot-plug") {
-            params.hot_plug = true;
+        } else if (arg == "-cb" || arg == "--cont-batching") {
+            params.cont_batching = true;
         } else if (arg == "--color") {
             params.use_color = true;
         } else if (arg == "--mlock") {
@@ -675,7 +675,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
     printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
-    printf("  --hot-plug            enable hot-plugging of new sequences for decoding (default: disabled)\n");
+    printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     if (llama_mlock_supported()) {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
@@ -1270,7 +1270,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
     fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
-    fprintf(stream, "hot_plug: %s # default: false\n", params.hot_plug ? "true" : "false");
+    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
     fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
 
     const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);