Skip to content

Commit 9ecde9f

Browse files
CUDA: quantized KV support for FA vec
1 parent 9b82476 commit 9ecde9f

File tree

11 files changed

+826
-142
lines changed

11 files changed

+826
-142
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
105105
"llama: max. batch size for using peer access")
106106
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
107107
option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
108+
option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for FlashAttention" OFF)
108109

109110
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
110111
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
@@ -426,6 +427,9 @@ if (LLAMA_CUDA)
426427
if (LLAMA_CUDA_NO_PEER_COPY)
427428
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
428429
endif()
430+
if (LLAMA_CUDA_FA_ALL_QUANTS)
431+
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
432+
endif()
429433

430434
if (LLAMA_STATIC)
431435
if (WIN32)

Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
490490
endif # LLAMA_CUDA_NO_PEER_COPY
491491
ifdef LLAMA_CUDA_CCBIN
492492
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
493-
endif
493+
endif # LLAMA_CUDA_CCBIN
494+
ifdef LLAMA_CUDA_FA_ALL_QUANTS
495+
MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
496+
endif # LLAMA_CUDA_FA_ALL_QUANTS
494497

495498
ifdef JETSON_EOL_MODULE_DETECT
496499
define NVCC_COMPILE

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,7 @@ Building the program with BLAS support may lead to some performance improvements
475475
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
476476
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
477477
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
478+
| LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type combinations for the FlashAttention CUDA kernels. Allows more fine-grained control over KV cache size, but compilation takes much longer. |
478479

479480
- #### hipBLAS
480481

0 commit comments

Comments
 (0)