[OpenMP] cmake option LIBOMPTARGET_NVPTX_MAX_SM for nvptx device RTL

ye-luo · shiltian · commit ffd159d8e919 · 2020-09-24T12:39:59.000-04:00
This allows customizing MAX_SM for non-flagship GPUs, which reduces graphics memory usage. In addition, the size has so far been hard-coded only up to __CUDA_ARCH__ 700, which is already a hassle for 800. Introduce a MAX_SM value for 800 and guard against future architectures with an explicit error. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D88185
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -82,6 +82,11 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
     set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm})
   endforeach()
 
+  # Override default MAX_SM in src/target_impl.h if requested
+  if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM)
+    set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}")
+  endif()
+
   # Activate RTL message dumps if requested by the user.
   set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
     "Activate NVPTX device RTL debug messages.")
@@ -96,7 +101,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
   list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory}
                               -I${devicertl_nvptx_directory}/src)
   cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
-      OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
+      OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG} ${MAX_SM_DEFINITION})
 
   # Install device RTL under the lib destination folder.
   install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}")
@@ -159,7 +164,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
         get_filename_component(outfile ${src} NAME)
 
         add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
-          COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch}
+          COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION}
             -c ${infile} -o ${outfile}-sm_${sm}.bc
           DEPENDS ${infile}
           IMPLICIT_DEPENDS CXX ${infile}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -47,16 +47,27 @@
 
 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 600
 #define OMP_STATE_COUNT 32
+#else
+#define OMP_STATE_COUNT 16
+#endif
+
+#if !defined(MAX_SM)
+#if __CUDA_ARCH__ >= 900
+#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
+#elif __CUDA_ARCH__ >= 800
+// GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs
+// GA102 design has a maximum of 84 SMs
+#define MAX_SM 108
+#elif __CUDA_ARCH__ >= 700
 #define MAX_SM 84
 #elif __CUDA_ARCH__ >= 600
-#define OMP_STATE_COUNT 32
 #define MAX_SM 56
 #else
-#define OMP_STATE_COUNT 16
 #define MAX_SM 16
 #endif
+#endif
 
 #define OMP_ACTIVE_PARALLEL_LEVEL 128