Skip to content

Commit c3ea58a

Browse files
CUDA: remove DMMV, consolidate F16 mult mat vec (#10318)
1 parent 467576b commit c3ea58a

File tree

10 files changed

+246
-1000
lines changed

10 files changed

+246
-1000
lines changed

Makefile

Lines changed: 0 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -635,10 +635,6 @@ else ifndef CUDA_POWER_ARCH
635635
MK_NVCCFLAGS += -arch=native
636636
endif # CUDA_DOCKER_ARCH
637637

638-
ifdef GGML_CUDA_FORCE_DMMV
639-
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
640-
endif # GGML_CUDA_FORCE_DMMV
641-
642638
ifdef GGML_CUDA_FORCE_MMQ
643639
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
644640
endif # GGML_CUDA_FORCE_MMQ
@@ -647,20 +643,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS
647643
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
648644
endif # GGML_CUDA_FORCE_CUBLAS
649645

650-
ifdef GGML_CUDA_DMMV_X
651-
MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
652-
else
653-
MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
654-
endif # GGML_CUDA_DMMV_X
655-
656-
ifdef GGML_CUDA_MMV_Y
657-
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
658-
else ifdef GGML_CUDA_DMMV_Y
659-
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility
660-
else
661-
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
662-
endif # GGML_CUDA_MMV_Y
663-
664646
ifdef GGML_CUDA_F16
665647
MK_NVCCFLAGS += -DGGML_CUDA_F16
666648
endif # GGML_CUDA_F16
@@ -669,12 +651,6 @@ ifdef GGML_CUDA_DMMV_F16
669651
MK_NVCCFLAGS += -DGGML_CUDA_F16
670652
endif # GGML_CUDA_DMMV_F16
671653

672-
ifdef GGML_CUDA_KQUANTS_ITER
673-
MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
674-
else
675-
MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
676-
endif
677-
678654
ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
679655
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
680656
else
@@ -783,10 +759,6 @@ ifdef GGML_HIPBLAS
783759
AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
784760
endif
785761

786-
GGML_CUDA_DMMV_X ?= 32
787-
GGML_CUDA_MMV_Y ?= 1
788-
GGML_CUDA_KQUANTS_ITER ?= 2
789-
790762
MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
791763

792764
ifdef GGML_HIP_UMA
@@ -800,13 +772,6 @@ endif # GGML_HIP_UMA
800772
HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
801773

802774
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
803-
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
804-
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
805-
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
806-
807-
ifdef GGML_CUDA_FORCE_DMMV
808-
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
809-
endif # GGML_CUDA_FORCE_DMMV
810775

811776
ifdef GGML_CUDA_FORCE_MMQ
812777
HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
@@ -869,10 +834,6 @@ ifdef GGML_MUSA
869834

870835
MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
871836

872-
ifdef GGML_CUDA_FORCE_DMMV
873-
MUSAFLAGS += -DGGML_CUDA_FORCE_DMMV
874-
endif # GGML_CUDA_FORCE_DMMV
875-
876837
ifdef GGML_CUDA_FORCE_MMQ
877838
MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
878839
endif # GGML_CUDA_FORCE_MMQ
@@ -881,18 +842,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS
881842
MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
882843
endif # GGML_CUDA_FORCE_CUBLAS
883844

884-
ifdef GGML_CUDA_DMMV_X
885-
MUSAFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
886-
else
887-
MUSAFLAGS += -DGGML_CUDA_DMMV_X=32
888-
endif # GGML_CUDA_DMMV_X
889-
890-
ifdef GGML_CUDA_MMV_Y
891-
MUSAFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
892-
else
893-
MUSAFLAGS += -DGGML_CUDA_MMV_Y=1
894-
endif # GGML_CUDA_MMV_Y
895-
896845
ifdef GGML_CUDA_F16
897846
MUSAFLAGS += -DGGML_CUDA_F16
898847
endif # GGML_CUDA_F16
@@ -901,12 +850,6 @@ ifdef GGML_CUDA_DMMV_F16
901850
MUSAFLAGS += -DGGML_CUDA_F16
902851
endif # GGML_CUDA_DMMV_F16
903852

904-
ifdef GGML_CUDA_KQUANTS_ITER
905-
MUSAFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
906-
else
907-
MUSAFLAGS += -DK_QUANTS_PER_ITERATION=2
908-
endif
909-
910853
ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
911854
MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
912855
else

docs/build.md

Lines changed: 0 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -186,13 +186,9 @@ The following compilation options are also available to tweak performance:
186186
187187
| Option | Legal values | Default | Description |
188188
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
189-
| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
190-
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
191-
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
192189
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
193190
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
194191
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
195-
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
196192
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
197193
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
198194
@@ -268,13 +264,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
268264
269265
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
270266
If your GPU is not officially supported you can use the environment variable `HSA_OVERRIDE_GFX_VERSION` set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
271-
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
272-
273-
| Option | Legal values | Default | Description |
274-
|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
275-
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
276-
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
277-
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
278267
279268
### Vulkan
280269

ggml/CMakeLists.txt

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -128,14 +128,9 @@ option(GGML_LLAMAFILE "ggml: use LLAMAFILE"
128128

129129
option(GGML_CUDA "ggml: use CUDA" OFF)
130130
option(GGML_MUSA "ggml: use MUSA" OFF)
131-
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
132131
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
133132
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
134-
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
135-
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
136133
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
137-
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
138-
"ggml: iters./thread per block for Q2_K/Q6_K")
139134
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
140135
"ggml: max. batch size for using peer access")
141136
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)

ggml/src/ggml-cuda/CMakeLists.txt

Lines changed: 0 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -54,21 +54,12 @@ if (CUDAToolkit_FOUND)
5454
target_link_libraries(ggml-cuda PRIVATE ggml-base)
5555
target_include_directories(ggml-cuda PRIVATE . ..)
5656

57-
# TODO: change the definitions to this target only
58-
59-
add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
60-
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
61-
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
6257
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
6358

6459
if (GGML_CUDA_GRAPHS)
6560
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
6661
endif()
6762

68-
if (GGML_CUDA_FORCE_DMMV)
69-
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
70-
endif()
71-
7263
if (GGML_CUDA_FORCE_MMQ)
7364
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
7465
endif()
@@ -81,10 +72,6 @@ if (CUDAToolkit_FOUND)
8172
add_compile_definitions(GGML_CUDA_NO_VMM)
8273
endif()
8374

84-
if (DEFINED GGML_CUDA_DMMV_Y)
85-
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility
86-
endif()
87-
8875
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
8976
add_compile_definitions(GGML_CUDA_F16)
9077
endif()

0 commit comments

Comments (0)