Skip to content

Commit eda5a60

Browse files
committed
CUDA: build archs as virtual for GGML_NATIVE=OFF
To speed up compilation and reduce binary size. Link: ggml-org/llama.cpp#13135. Author: Johannes Gaessler.
1 parent 00c3682 commit eda5a60

File tree

1 file changed

+28
-7
lines changed

1 file changed

+28
-7
lines changed

ggml/src/CMakeLists.txt

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -320,17 +320,38 @@ if (GGML_CUDA)
320320
message(STATUS "CUDA found")
321321

322322
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
323-
# 52 == lowest CUDA 12 standard
324-
# 60 == FP16 CUDA intrinsics
325-
# 61 == integer CUDA intrinsics
326-
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
323+
# native == GPUs available at build time
324+
# 50 == Maxwell, lowest CUDA 12 standard
325+
# 60 == P100, FP16 CUDA intrinsics
326+
# 61 == Pascal, __dp4a instruction (per-byte integer dot product)
327+
# 70 == V100, FP16 tensor cores
328+
# 75 == Turing, int8 tensor cores
329+
# 80 == Ampere, asynchronous data loading, faster tensor core instructions
330+
# 86 == RTX 3000, needs CUDA v11.1
331+
# 89 == RTX 4000, needs CUDA v11.8
332+
#
333+
# XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
334+
# XX-real == compile CUDA code as device code for this specific architecture
335+
# no suffix == compile as both PTX and device code
336+
#
337+
# The default behavior for a non-native build is to build virtual architectures as needed to cover all features needed
338+
# for best performance and to also build real architectures for the most commonly used GPUs.
327339
# Pick the default CUDA architecture list. This chain sits inside the
# `if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)` guard, so an explicit user
# setting always takes precedence over everything below.
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
    # Native build: target only the GPUs available at build time.
    set(CMAKE_CUDA_ARCHITECTURES "native")
elseif (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
    # FP16 builds start at 60 (FP16 CUDA intrinsics) rather than 50.
    if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
        set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
    else()
        # 89-real needs CUDA v11.8, so it is omitted for older toolkits.
        set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
    endif()
else()
    # Default non-native build: virtual (PTX) architectures covering the
    # feature levels, plus real architectures for the most common GPUs.
    # This must be a plain else(): an `elseif()` with an empty condition is
    # never true, which would make these defaults unreachable.
    if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
        set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
    else()
        # 89-real needs CUDA v11.8, so it is omitted for older toolkits.
        set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
    endif()
    #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
335356
endif()
336357
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

0 commit comments

Comments
 (0)