
Commit f80a3fe

CUDA: use mul_mat_q kernels by default
1 parent: 5e9ff54

File tree

4 files changed: +16 -17 lines changed


examples/common.cpp

Lines changed: 8 additions & 8 deletions
@@ -400,11 +400,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
-        } else if (arg == "--mul-mat-q" || arg == "-mmq") {
+        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
 #ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = true;
+            params.mul_mat_q = false;
 #else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--low-vram" || arg == "-lv") {
 #ifdef GGML_USE_CUBLAS
@@ -614,11 +614,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, " number of layers to store in VRAM\n");
     fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
-    fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
-    fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
-    fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
-    fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
+    fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, " -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
 #endif
     fprintf(stdout, " --mtest compute maximum memory usage\n");
     fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");

examples/common.h

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ struct gpt_params {
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

     bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
-    bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels
+    bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
     bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
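The practical effect of this one-line default flip: code that builds a gpt_params and never touches the field now opts into the mul_mat_q kernels, and the new -nommq / --no-mul-mat-q flag is the only way back to cuBLAS. A small sketch, using a stand-in struct (gpt_params_stub) rather than the real header:

// Sketch: with the new default, an untouched params struct opts into mul_mat_q.
// gpt_params_stub is a stand-in; only the relevant fields are shown.
#include <cassert>

struct gpt_params_stub {
    bool low_vram  = false; // if true, reduce VRAM usage at the cost of performance
    bool mul_mat_q = true;  // if true, use mul_mat_q kernels instead of cuBLAS
};

int main() {
    gpt_params_stub params;      // defaults, no CLI flags
    assert(params.mul_mat_q);    // custom CUDA kernels are now the default
    params.mul_mat_q = false;    // what "-nommq" / "--no-mul-mat-q" does
    assert(!params.mul_mat_q);
    return 0;
}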

examples/server/server.cpp

Lines changed: 6 additions & 7 deletions
@@ -673,12 +673,11 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, " number of layers to store in VRAM\n");
     fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
     fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
-    fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
-    fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
-    fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
+    fprintf(stdout, " -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
 #endif
     fprintf(stdout, " -m FNAME, --model FNAME\n");
     fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
@@ -886,12 +885,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
 #endif // GGML_USE_CUBLAS
         }
-        else if (arg == "--mul-mat-q" || arg == "-mmq")
+        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
         {
 #ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = true;
+            params.mul_mat_q = false;
 #else
-            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
 #endif // GGML_USE_CUBLAS
         }
         else if (arg == "--main-gpu" || arg == "-mg")

ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
@@ -286,7 +286,7 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = false;
+static bool g_mul_mat_q = true;

 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
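This global is what the CLI flag ultimately toggles: during matrix multiplication, ggml-cuda consults it to decide between the custom quantized mul_mat_q kernels and the dequantize-then-cuBLAS path. The sketch below shows only the shape of that dispatch; the stub function names are placeholders, and the real code layers additional conditions (tensor types, hardware support) on top of this check.

// Sketch of the dispatch that g_mul_mat_q controls. The stub names below
// are placeholders, not the real ggml-cuda entry points.
#include <cstdio>

static bool g_mul_mat_q = true; // new default: prefer the custom quantized kernels

static void mul_mat_q_stub()      { std::puts("custom mul_mat_q CUDA kernel"); }
static void mul_mat_cublas_stub() { std::puts("dequantize + cuBLAS GEMM"); }

static void dispatch_mul_mat(bool src_is_quantized) {
    if (src_is_quantized && g_mul_mat_q) {
        mul_mat_q_stub();      // lower VRAM use, now taken by default
    } else {
        mul_mat_cublas_stub(); // taken when -nommq is passed (or for f16/f32 inputs)
    }
}

int main() {
    dispatch_mul_mat(/*src_is_quantized=*/true);  // -> custom kernel
    g_mul_mat_q = false;                          // effect of --no-mul-mat-q
    dispatch_mul_mat(/*src_is_quantized=*/true);  // -> cuBLAS path
    return 0;
}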

0 commit comments
