Skip to content

Commit ffd159d

Browse files
ye-luo authored and shiltian committed
[OpenMP] cmake option LIBOMPTARGET_NVPTX_MAX_SM for nvptx device RTL
It allows customizing MAX_SM for non-flagship GPUs and reduces graphics memory usage. In addition, so far the size is hard-coded up to __CUDA_ARCH__ 700, which is already a hassle for 800. Introduce MAX_SM for 800 and protect against future architectures. Reviewed By: JonChesterfield. Differential Revision: https://reviews.llvm.org/D88185
1 parent ada1e2f commit ffd159d

File tree

2 files changed

+21
-5
lines changed

2 files changed

+21
-5
lines changed

openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,11 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
8282
set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm})
8383
endforeach()
8484

85+
# Override default MAX_SM in src/target_impl.h if requested
86+
if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM)
87+
set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}")
88+
endif()
89+
8590
# Activate RTL message dumps if requested by the user.
8691
set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
8792
"Activate NVPTX device RTL debug messages.")
@@ -96,7 +101,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
96101
list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory}
97102
-I${devicertl_nvptx_directory}/src)
98103
cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
99-
OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
104+
OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG} ${MAX_SM_DEFINITION})
100105

101106
# Install device RTL under the lib destination folder.
102107
install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}")
@@ -159,7 +164,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
159164
get_filename_component(outfile ${src} NAME)
160165

161166
add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
162-
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch}
167+
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION}
163168
-c ${infile} -o ${outfile}-sm_${sm}.bc
164169
DEPENDS ${infile}
165170
IMPLICIT_DEPENDS CXX ${infile}

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,16 +47,27 @@
4747

4848
// Maximum number of omp state objects per SM allocated statically in global
4949
// memory.
50-
#if __CUDA_ARCH__ >= 700
50+
#if __CUDA_ARCH__ >= 600
5151
#define OMP_STATE_COUNT 32
52+
#else
53+
#define OMP_STATE_COUNT 16
54+
#endif
55+
56+
#if !defined(MAX_SM)
57+
#if __CUDA_ARCH__ >= 900
58+
#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
59+
#elif __CUDA_ARCH__ >= 800
60+
// GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs
61+
// GA102 design has a maximum of 84 SMs
62+
#define MAX_SM 108
63+
#elif __CUDA_ARCH__ >= 700
5264
#define MAX_SM 84
5365
#elif __CUDA_ARCH__ >= 600
54-
#define OMP_STATE_COUNT 32
5566
#define MAX_SM 56
5667
#else
57-
#define OMP_STATE_COUNT 16
5868
#define MAX_SM 16
5969
#endif
70+
#endif
6071

6172
#define OMP_ACTIVE_PARALLEL_LEVEL 128
6273

0 commit comments

Comments
 (0)