@@ -673,12 +673,11 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "                        number of layers to store in VRAM\n");
     fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
     fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
-    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n");
-    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n");
-    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n");
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif
     fprintf(stdout, "  -m FNAME, --model FNAME\n");
     fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
@@ -886,12 +885,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
 #endif // GGML_USE_CUBLAS
         }
-        else if (arg == "--mul-mat-q" || arg == "-mmq")
+        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
         {
 #ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = true;
+            params.mul_mat_q = false;
 #else
-            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
 #endif // GGML_USE_CUBLAS
         }
         else if (arg == "--main-gpu" || arg == "-mg")
0 commit comments