@@ -382,7 +382,7 @@ struct llama_server_context
             {
                 n_eval = params.n_batch;
             }
-            if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads, params.n_threads))
+            if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads, params.pp_threads))
             {
                 LOG_ERROR("failed to eval", {
                     {"n_eval", n_eval},
@@ -648,6 +648,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "  -h, --help                show this help message and exit\n");
     fprintf(stdout, "  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     fprintf(stdout, "  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, "  -ppt N, --pp-threads N\n");
+    fprintf(stdout, "                            number of threads to use during prompt processing (default: %d)\n", params.pp_threads);
     fprintf(stdout, "  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stdout, "  -gqa N, --gqa N           grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
     fprintf(stdout, "  -eps N, --rms-norm-eps N  rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
@@ -818,6 +820,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_threads = std::stoi(argv[i]);
        }
+        else if (arg == "-ppt" || arg == "--pp-threads")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            params.pp_threads = std::stoi(argv[i]);
+        }
        else if (arg == "-b" || arg == "--batch-size")
        {
            if (++i >= argc)
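With this parser branch in place, the split can be exercised from the command line, e.g. ./server -m models/7B/ggml-model-q4_0.bin -t 8 -ppt 16 (model path hypothetical) to ingest the prompt with 16 threads while generating tokens with 8. Note that std::stoi throws on a non-numeric argument rather than setting invalid_param; that matches the existing -t handling above, so the new flag introduces no new failure mode.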
@@ -1178,6 +1189,7 @@ int main(int argc, char **argv)
                             {"commit", BUILD_COMMIT}});
     LOG_INFO("system info", {
                                 {"n_threads", params.n_threads},
+                                {"pp_threads", params.pp_threads},
                                 {"total_threads", std::thread::hardware_concurrency()},
                                 {"system_info", llama_print_system_info()},
                             });
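One piece the diff relies on but does not show is the pp_threads member itself on gpt_params. A minimal sketch of what the corresponding common.h change presumably looks like, assuming the default mirrors n_threads so behaviour is unchanged unless -ppt is passed (the default value shown is an assumption, not taken from this commit):

// in common.h (sketch, not part of this diff)
struct gpt_params {
    int32_t n_threads  = get_num_physical_cores(); // generation threads (-t)
    int32_t pp_threads = get_num_physical_cores(); // prompt-processing threads (-ppt); assumed default
    // ... remaining fields unchanged ...
};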