Commit 4f06592

Add gqa parameter support to the server (#2351)
* Add gqa parameter support to the server
* Change help from stderr to stdout
1 parent 70d26ac commit 4f06592
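
As a quick usage sketch (the binary name and model path below are illustrative placeholders, not part of this commit), a LLaMAv2 70B model would be served with the new flag as:

    ./server -m models/70B/ggml-model-q4_0.bin -gqa 8

where 8 is the value the new help text recommends for LLaMAv2 70B. And because the help text now prints to stdout instead of stderr, an invocation like ./server --help | less pipes cleanly.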

File tree

1 file changed (+47, -35 lines)


examples/server/server.cpp

Lines changed: 47 additions & 35 deletions
@@ -601,47 +601,48 @@ struct llama_server_context
 static void server_print_usage(const char *argv0, const gpt_params &params,
                                const server_params &sparams)
 {
-    fprintf(stderr, "usage: %s [options]\n", argv0);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "usage: %s [options]\n", argv0);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, "  -h, --help            show this help message and exit\n");
+    fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
     if (llama_mlock_supported())
     {
-        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported())
     {
-        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
-    fprintf(stderr, "                        number of layers to store in VRAM\n");
-    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stdout, "                        number of layers to store in VRAM\n");
+    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
 #endif
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -a ALIAS, --alias ALIAS\n");
-    fprintf(stderr, "                        set an alias for the model, will be added as `model` field in completion response\n");
-    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stderr, "  --host                ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
-    fprintf(stderr, "  --port PORT           port to listen (default (default: %d)\n", sparams.port);
-    fprintf(stderr, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
-    fprintf(stderr, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
-    fprintf(stderr, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
-    fprintf(stderr, "\n");
+    fprintf(stdout, "  -m FNAME, --model FNAME\n");
+    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "  -a ALIAS, --alias ALIAS\n");
+    fprintf(stdout, "                        set an alias for the model, will be added as `model` field in completion response\n");
+    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stdout, "  --host                ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
+    fprintf(stdout, "  --port PORT           port to listen (default (default: %d)\n", sparams.port);
+    fprintf(stdout, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
+    fprintf(stdout, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    fprintf(stdout, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
+    fprintf(stdout, "\n");
 }
 
 static void server_params_parse(int argc, char **argv, server_params &sparams,
@@ -724,17 +725,28 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
             params.n_ctx = std::stoi(argv[i]);
         }
+        else if (arg == "-gqa" || arg == "--gqa")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            params.n_gqa = std::stoi(argv[i]);
+        }
         else if (arg == "--rope-freq-base")
         {
-            if (++i >= argc) {
+            if (++i >= argc)
+            {
                 invalid_param = true;
                 break;
             }
             params.rope_freq_base = std::stof(argv[i]);
         }
         else if (arg == "--rope-freq-scale")
         {
-            if (++i >= argc) {
+            if (++i >= argc)
+            {
                 invalid_param = true;
                 break;
            }
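
The new -gqa branch follows the same pattern as the neighboring options: advance the argument index, guard against a missing value, then convert with std::stoi into params.n_gqa. A minimal standalone C++ sketch of that pattern (a hypothetical demo program, not the server's actual code; the default value here is assumed):

    #include <cstdio>
    #include <string>

    int main(int argc, char **argv)
    {
        int  n_gqa         = 1;     // placeholder default; the real default lives in gpt_params
        bool invalid_param = false;

        for (int i = 1; i < argc; i++)
        {
            std::string arg = argv[i];
            if (arg == "-gqa" || arg == "--gqa")
            {
                if (++i >= argc)    // flag was the last token, so its value is missing
                {
                    invalid_param = true;
                    break;
                }
                n_gqa = std::stoi(argv[i]);  // throws std::invalid_argument on non-numeric input
            }
        }

        if (invalid_param)
        {
            fprintf(stdout, "error: missing value for -gqa\n");
            return 1;
        }
        fprintf(stdout, "n_gqa = %d\n", n_gqa);
        return 0;
    }

Note that the ++i >= argc guard only catches a missing value; a non-numeric one such as "--gqa eight" makes std::stoi throw rather than set invalid_param.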
