Skip to content

Commit 672244a

Browse files
CUDA: quantized KV support for FA vec
1 parent 10b1e45 commit 672244a

File tree

11 files changed

+826
-142
lines changed

11 files changed

+826
-142
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
106106
"llama: max. batch size for using peer access")
107107
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
108108
option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
109+
option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for FlashAttention" OFF)
109110

110111
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
111112
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
@@ -427,6 +428,9 @@ if (LLAMA_CUDA)
427428
if (LLAMA_CUDA_NO_PEER_COPY)
428429
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
429430
endif()
431+
if (LLAMA_CUDA_FA_ALL_QUANTS)
432+
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
433+
endif()
430434

431435
if (LLAMA_STATIC)
432436
if (WIN32)

Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -493,7 +493,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
493493
endif # LLAMA_CUDA_NO_PEER_COPY
494494
ifdef LLAMA_CUDA_CCBIN
495495
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
496-
endif
496+
endif # LLAMA_CUDA_CCBIN
497+
ifdef LLAMA_CUDA_FA_ALL_QUANTS
498+
MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
499+
endif # LLAMA_CUDA_FA_ALL_QUANTS
497500

498501
ifdef JETSON_EOL_MODULE_DETECT
499502
define NVCC_COMPILE

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,7 @@ Building the program with BLAS support may lead to some performance improvements
481481
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
482482
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
483483
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
484+
| LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type combinations for the FlashAttention CUDA kernels. Provides more fine-grained control over KV cache size, but compilation takes much longer. |
484485

485486
- #### hipBLAS
486487

0 commit comments

Comments (0)