
Commit a0c5113

Change help from stderr to stdout
1 parent 161c2c6 commit a0c5113
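
Note: with this change, help that is explicitly requested (e.g. `-h`/`--help`) is written to stdout, so it can be piped or paged like normal program output, while stderr remains available for diagnostics.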

File tree

1 file changed (+34 -34 lines)


examples/server/server.cpp

Lines changed: 34 additions & 34 deletions
@@ -601,48 +601,48 @@ struct llama_server_context
 static void server_print_usage(const char *argv0, const gpt_params &params,
                                const server_params &sparams)
 {
-    fprintf(stderr, "usage: %s [options]\n", argv0);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "usage: %s [options]\n", argv0);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, "  -h, --help            show this help message and exit\n");
+    fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
     if (llama_mlock_supported())
     {
-        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported())
     {
-        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
-    fprintf(stderr, "                        number of layers to store in VRAM\n");
-    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stdout, "                        number of layers to store in VRAM\n");
+    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
 #endif
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -a ALIAS, --alias ALIAS\n");
-    fprintf(stderr, "                        set an alias for the model, will be added as `model` field in completion response\n");
-    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stderr, "  --host                ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
-    fprintf(stderr, "  --port PORT           port to listen (default (default: %d)\n", sparams.port);
-    fprintf(stderr, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
-    fprintf(stderr, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
-    fprintf(stderr, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
-    fprintf(stderr, "\n");
+    fprintf(stdout, "  -m FNAME, --model FNAME\n");
+    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "  -a ALIAS, --alias ALIAS\n");
+    fprintf(stdout, "                        set an alias for the model, will be added as `model` field in completion response\n");
+    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stdout, "  --host                ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
+    fprintf(stdout, "  --port PORT           port to listen (default (default: %d)\n", sparams.port);
+    fprintf(stdout, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
+    fprintf(stdout, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    fprintf(stdout, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
+    fprintf(stdout, "\n");
 }
 
 static void server_params_parse(int argc, char **argv, server_params &sparams,
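
The commit itself simply switches every fprintf in server_print_usage from stderr to stdout. A common generalization of this convention, shown below only as a minimal self-contained sketch (print_usage and the argument loop here are hypothetical, not the actual server_params_parse logic), is to parameterize the output stream: help that the user asked for goes to stdout with exit status 0, while usage printed in response to a bad argument goes to stderr with a nonzero status.

#include <cstdio>
#include <cstring>

// Hypothetical helper: write usage text to the given stream, in the
// spirit of server_print_usage after this commit.
static void print_usage(FILE *out, const char *argv0)
{
    fprintf(out, "usage: %s [options]\n", argv0);
    fprintf(out, "  -h, --help  show this help message and exit\n");
}

int main(int argc, char **argv)
{
    for (int i = 1; i < argc; i++)
    {
        if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
        {
            // Help was explicitly requested: it is the program's normal
            // output, so it goes to stdout and the process exits 0.
            print_usage(stdout, argv[0]);
            return 0;
        }
        // Unrecognized option: this is an error, so the usage text goes
        // to stderr and the process exits with a failure status.
        fprintf(stderr, "error: unknown argument: %s\n", argv[i]);
        print_usage(stderr, argv[0]);
        return 1;
    }
    return 0;
}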
