@@ -601,48 +601,48 @@ struct llama_server_context
 static void server_print_usage(const char *argv0, const gpt_params &params,
                                const server_params &sparams)
 {
-    fprintf(stderr, "usage: %s [options]\n", argv0);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "usage: %s [options]\n", argv0);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, "  -h, --help            show this help message and exit\n");
+    fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
     if (llama_mlock_supported())
     {
-        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported())
     {
-        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
-    fprintf(stderr, "                        number of layers to store in VRAM\n");
-    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stdout, "                        number of layers to store in VRAM\n");
+    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
 #endif
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -a ALIAS, --alias ALIAS\n");
-    fprintf(stderr, "                        set an alias for the model, will be added as `model` field in completion response\n");
-    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stderr, "  --host                ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
-    fprintf(stderr, "  --port PORT           port to listen (default (default: %d)\n", sparams.port);
-    fprintf(stderr, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
-    fprintf(stderr, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
-    fprintf(stderr, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
-    fprintf(stderr, "\n");
+    fprintf(stdout, "  -m FNAME, --model FNAME\n");
+    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "  -a ALIAS, --alias ALIAS\n");
+    fprintf(stdout, "                        set an alias for the model, will be added as `model` field in completion response\n");
+    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stdout, "  --host                ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
+    fprintf(stdout, "  --port PORT           port to listen (default (default: %d)\n", sparams.port);
+    fprintf(stdout, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
+    fprintf(stdout, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    fprintf(stdout, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
+    fprintf(stdout, "\n");
 }

 static void server_params_parse(int argc, char **argv, server_params &sparams,
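The hunk makes one mechanical change: every line of server_print_usage now writes to stdout instead of stderr. Below is a minimal standalone sketch of the CLI convention this follows; it is illustrative only, not code from this commit, and print_usage is a hypothetical helper. Help the user explicitly requests with -h/--help goes to stdout, where it can be piped or paged, while usage printed because of a bad invocation stays on stderr next to the error message.

    #include <cstdio>
    #include <cstring>

    // Hypothetical helper: takes the destination stream so the same text can
    // serve both the help path (stdout) and the error path (stderr).
    static void print_usage(FILE *out, const char *argv0)
    {
        fprintf(out, "usage: %s [options]\n", argv0);
        fprintf(out, "  -h, --help            show this help message and exit\n");
    }

    int main(int argc, char **argv)
    {
        for (int i = 1; i < argc; i++)
        {
            if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
            {
                print_usage(stdout, argv[0]); // requested output -> stdout, exit 0
                return 0;
            }
        }
        if (argc > 1)
        {
            fprintf(stderr, "error: unknown option: %s\n", argv[1]);
            print_usage(stderr, argv[0]);     // diagnostic output -> stderr, exit nonzero
            return 1;
        }
        return 0;
    }

With usage on stdout, invocations such as ./server --help | less or ./server --help | grep rope see the help text instead of it bypassing the pipe on stderr.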