[OpenMP] Use generic IR for the OpenMP DeviceRTL #119091
@@ -42,43 +42,6 @@ set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR})
 set(include_directory ${devicertl_base_directory}/include)
 set(source_directory ${devicertl_base_directory}/src)

-set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803"
-                             "gfx9-generic;gfx900;gfx902;gfx906;gfx908"
-                             "gfx90a;gfx90c"
-                             "gfx9-4-generic;gfx940;gfx941;gfx942;gfx950"
-                             "gfx10-1-generic;gfx1010;gfx1012"
-                             "gfx10-3-generic;gfx1030;gfx1031;gfx1032;gfx1033"
-                             "gfx1034;gfx1035;gfx1036"
-                             "gfx11-generic;gfx1100;gfx1101;gfx1102;gfx1103"
-                             "gfx1150;gfx1151;gfx1152;gfx1153"
-                             "gfx12-generic")
-set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
-                            "sm_70;sm_72;sm_75;sm_80;sm_86;sm_87;sm_89;sm_90")
-set(all_gpu_architectures
-    "${all_amdgpu_architectures};${all_nvptx_architectures}")
-
-set(LIBOMPTARGET_DEVICE_ARCHITECTURES "all" CACHE STRING
-    "List of device architectures to be used to compile the OpenMP DeviceRTL.")
-
-if(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "all")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_gpu_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "amdgpu")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_amdgpu_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "nvptx")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_nvptx_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
-       LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "native")
-  if(NOT LIBOMPTARGET_NVPTX_ARCH AND NOT LIBOMPTARGET_AMDGPU_ARCH)
-    message(FATAL_ERROR
-      "Could not find 'amdgpu-arch' and 'nvptx-arch' tools required for 'auto'")
-  elseif(NOT LIBOMPTARGET_FOUND_NVIDIA_GPU AND NOT LIBOMPTARGET_FOUND_AMDGPU_GPU)
-    message(FATAL_ERROR "No AMD or NVIDIA GPU found on the system when using 'auto'")
-  endif()
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES
-      "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
-endif()
-list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
-
 set(include_files
   ${include_directory}/Allocator.h
   ${include_directory}/Configuration.h
@@ -146,20 +109,22 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden

 # first create an object target
 add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
-function(compileDeviceRTLLibrary target_cpu target_name target_triple)
+function(compileDeviceRTLLibrary target_name target_triple)
   set(target_bc_flags ${ARGN})

   set(bc_files "")
   foreach(src ${src_files})
     get_filename_component(infile ${src} ABSOLUTE)
     get_filename_component(outfile ${src} NAME)
-    set(outfile "${outfile}-${target_cpu}.bc")
+    set(outfile "${outfile}-${target_name}.bc")
     set(depfile "${outfile}.d")

+    # Passing an empty CPU to -march= suppressed target specific metadata.
     add_custom_command(OUTPUT ${outfile}
       COMMAND ${CLANG_TOOL}
       ${bc_flags}
-      --offload-arch=${target_cpu}
+      -fopenmp-targets=${target_triple}
+      -Xopenmp-target=${target_triple} -march=
       ${target_bc_flags}
       -MD -MF ${depfile}
       ${infile} -o ${outfile}
@@ -182,7 +147,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
     list(APPEND bc_files ${outfile})
   endforeach()

-  set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
+  set(bclib_name "libomptarget-${target_name}.bc")

   # Link to a bitcode library.
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
@@ -222,7 +187,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
       APPEND)
   endif()

-  set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc")
+  set(bclib_target_name "omptarget-${target_name}-bc")
   add_custom_target(${bclib_target_name} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})

   # Copy library to destination.
@@ -244,7 +209,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
   # Package the bitcode in the bitcode and embed it in an ELF for the static library
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
     COMMAND ${PACKAGER_TOOL} -o ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
-      "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=${target_cpu},kind=openmp"
+      "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=generic,kind=openmp"
     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
     COMMENT "Packaging LLVM offloading binary ${bclib_name}.out"
   )
@@ -254,14 +219,14 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
       APPEND)
   endif()

-  set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}-${target_cpu}.o")
+  set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}.o")
   add_custom_command(OUTPUT ${output_name}
     COMMAND ${CLANG_TOOL} --std=c++17 -c -nostdlib
             -Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
             -o ${output_name}
             ${source_directory}/Stub.cpp
     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name} ${source_directory}/Stub.cpp
-    COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}-${target_cpu}.o"
+    COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}.o"
     VERBATIM
   )
   if(TARGET clang)
@@ -274,11 +239,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
   set_property(TARGET omptarget.devicertl.all_objs APPEND PROPERTY IMPORTED_OBJECTS ${output_name})

   if (CMAKE_EXPORT_COMPILE_COMMANDS)
-    set(ide_target_name omptarget-ide-${target_name}-${target_cpu})
+    set(ide_target_name omptarget-ide-${target_name})
     add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
     target_compile_options(${ide_target_name} PRIVATE
-      -fopenmp --offload-arch=${target_cpu} -fopenmp-cuda-mode
-      -mllvm -openmp-opt-disable
+      -fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
+      -fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
       -foffload-lto -fvisibility=hidden --offload-device-only
       -nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
     )
@@ -293,18 +258,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
   endif()
 endfunction()

-# Generate a Bitcode library for all the gpu architectures the user requested.
-add_custom_target(omptarget.devicertl.nvptx)
 add_custom_target(omptarget.devicertl.amdgpu)
-foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
-  if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
-  elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
-  else()
-    message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
-  endif()
-endforeach()
+compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)

+add_custom_target(omptarget.devicertl.nvptx)
+compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)

Comment on lines +262 to +265

Can we have an option to disable either of amdgpu/nvptx specifically? If anything, because it feels weird having a choice between amdgpu and nvptx plugins, but no choice between devicertl variants.

I didn't feel like it was strictly necessary since if you can build one you can build the other. The only thing you save is disk space, but if you feel like it's really needed I can re-use the architectures thing or something.

I don't exactly feel strongly about it. It's just that we already provide a switch for amdgpu/nvptx in Gentoo, so it only feels natural for this to respect it. Also, on a semi-related matter: is there a reason we're installing both separate

This is really over-complicated, but right now we only use the

Ok, so it's not accidental/obsolete — that's all I needed to know :-). Another thing I've noticed that clang needs

I suppose that happens for the CPU targets? Those aren't really used anywhere real, they're just for testing, but even so we probably shouldn't leave them broken. We link the device RTL unconditionally because the link step shouldn't need to know the architectures that were used to compile it. Because these are static libraries they're not extracted if they aren't needed, so it doesn't hurt anything if they're unused. But I guess you can have a situation where the user doesn't build this intentionally, but that's non-default behavior so I've never given it much thought.

Yeah, precisely. I've noticed this while building

 # Archive all the object files generated above into a static library
 add_library(omptarget.devicertl STATIC)
@@ -44,7 +44,6 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }

-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
 static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                           ShuffleReductFnTy shflFct) {
   uint32_t size, remote_id, physical_lane_id;
@@ -63,7 +62,6 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
   } while (logical_lane_id % 2 == 0 && size > 1);
   return (logical_lane_id == 0);
 }
-#endif

 static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
@@ -74,49 +72,53 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   uint32_t NumThreads = omp_get_num_threads();
   if (NumThreads == 1)
     return 1;
-  /*
-   * This reduce function handles reduction within a team. It handles
-   * parallel regions in both L1 and L2 parallelism levels. It also
-   * supports Generic, SPMD, and NoOMP modes.
-   *
-   * 1. Reduce within a warp.
-   * 2. Warp master copies value to warp 0 via shared memory.
-   * 3. Warp 0 reduces to a single value.
-   * 4. The reduced value is available in the thread that returns 1.
-   */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
-      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();
-
-  // Volta execution model:
-  // For the Generic execution mode a parallel region either has 1 thread and
-  // beyond that, always a multiple of 32. For the SPMD execution mode we may
-  // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
-
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-  }
-  return BlockThreadId == 0;
-#else
+  //
+  // This reduce function handles reduction within a team. It handles
+  // parallel regions in both L1 and L2 parallelism levels. It also
+  // supports Generic, SPMD, and NoOMP modes.
+  //
+  // 1. Reduce within a warp.
+  // 2. Warp master copies value to warp 0 via shared memory.
+  // 3. Warp 0 reduces to a single value.
+  // 4. The reduced value is available in the thread that returns 1.
+  //
+
+#if __has_builtin(__nvvm_reflect)
+  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
+    uint32_t WarpsNeeded =
+        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+    uint32_t WarpId = mapping::getWarpIdInBlock();
+
+    // Volta execution model:
+    // For the Generic execution mode a parallel region either has 1 thread and
+    // beyond that, always a multiple of 32. For the SPMD execution mode we may
+    // have any number of threads.
+    if ((NumThreads % mapping::getWarpSize() == 0) ||
+        (WarpId < WarpsNeeded - 1))
+      gpu_regular_warp_reduce(reduce_data, shflFct);
+    else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+      gpu_irregular_warp_reduce(
+          reduce_data, shflFct,
+          /*LaneCount=*/NumThreads % mapping::getWarpSize(),
+          /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());
+
+    // When we have more than [mapping::getWarpSize()] number of threads
+    // a block reduction is performed here.
+    //
+    // Only L1 parallel region can enter this if condition.
+    if (NumThreads > mapping::getWarpSize()) {
+      // Gather all the reduced values from each warp
+      // to the first warp.
+      cpyFct(reduce_data, WarpsNeeded);
+
+      if (WarpId == 0)
+        gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                  BlockThreadId);
+    }
+    return BlockThreadId == 0;
+  }
+#endif
   __kmpc_impl_lanemask_t Liveness = mapping::activemask();
   if (Liveness == lanes::All) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);

Comment on the added #if __has_builtin(__nvvm_reflect) check:

I'll try to make an AMDGPU counterpart for this one, though it doesn't look necessary for the purpose of the OpenMP device runtime.

I believe @AlexVlx also has interest in that space, so best consult with him as well.

@@ -150,10 +152,9 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
     return BlockThreadId == 0;
   }

-  // Get the OMP thread Id. This is different from BlockThreadId in the case of
-  // an L2 parallel region.
+  // Get the OMP thread Id. This is different from BlockThreadId in the case
+  // of an L2 parallel region.
   return BlockThreadId == 0;
-#endif // __CUDA_ARCH__ >= 700
 }

 uint32_t roundToWarpsize(uint32_t s) {
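
The change above is the heart of the generic-IR approach: instead of selecting code paths with the __CUDA_ARCH__ preprocessor macro at compile time, the runtime queries __nvvm_reflect("__CUDA_ARCH"), which the NVVMReflect pass folds to a constant once the architecture-neutral bitcode is finally compiled for a concrete GPU, so the unused branch is removed at that point. A minimal standalone sketch of the pattern (not DeviceRTL code; isVoltaOrNewer and warpSync are made-up names for illustration):

#if __has_builtin(__nvvm_reflect)
// __nvvm_reflect("__CUDA_ARCH") is replaced with a constant (e.g. 700 for
// sm_70) by the NVVMReflect pass when the generic bitcode is lowered for a
// concrete GPU, so the branch below folds away then, not at preprocessing
// time. isVoltaOrNewer is a hypothetical helper, not a DeviceRTL function.
static bool isVoltaOrNewer() { return __nvvm_reflect("__CUDA_ARCH") >= 700; }
#else
// Toolchains without the builtin (e.g. a plain host compile) take the
// conservative legacy answer.
static bool isVoltaOrNewer() { return false; }
#endif

void warpSync() {
  if (isVoltaOrNewer()) {
    // sm_70+ code would go here (independent thread scheduling).
  } else {
    // Pre-Volta warp-synchronous code would go here.
  }
}

Under this assumption, every supported architecture can share one bitcode library, which is why the CMake changes above drop the per-architecture build loop and package the result with arch=generic.
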
and nothing after -march=?

Intentional

so which means there is no arch?