
[OpenMP] Use generic IR for the OpenMP DeviceRTL #119091


Status: merged (2 commits) on Dec 25, 2024.
3 changes: 3 additions & 0 deletions clang/docs/ReleaseNotes.rst
@@ -1250,6 +1250,9 @@ OpenMP Support
- Added support for 'omp assume' directive.
- Added support for 'omp scope' directive.
- Added support for allocator-modifier in 'allocate' clause.
- Changed the OpenMP DeviceRTL to use 'generic' IR. The
  ``LIBOMPTARGET_DEVICE_ARCHITECTURES`` CMake argument is now unused; support
  for the AMDGPU and NVPTX targets is always built.

Improvements
^^^^^^^^^^^^
3 changes: 1 addition & 2 deletions clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2837,8 +2837,7 @@ void tools::addOpenMPDeviceRTL(const Driver &D,
: options::OPT_libomptarget_nvptx_bc_path_EQ;

StringRef ArchPrefix = Triple.isAMDGCN() ? "amdgpu" : "nvptx";
std::string LibOmpTargetName =
("libomptarget-" + ArchPrefix + "-" + BitcodeSuffix + ".bc").str();
std::string LibOmpTargetName = ("libomptarget-" + ArchPrefix + ".bc").str();

// First check whether user specifies bc library
if (const Arg *A = DriverArgs.getLastArg(LibomptargetBCPathOpt)) {
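
A minimal standalone illustration of the new lookup name computed above (a sketch, not the driver code itself: isAMDGCN stands in for Triple.isAMDGCN(), and the search over user-specified paths is elided):

#include <string>

// One generic bitcode library per vendor, with no per-GPU suffix:
// "libomptarget-amdgpu.bc" or "libomptarget-nvptx.bc".
std::string libomptargetName(bool isAMDGCN) {
  const std::string ArchPrefix = isAMDGCN ? "amdgpu" : "nvptx";
  return "libomptarget-" + ArchPrefix + ".bc";
}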
1 change: 0 additions & 1 deletion clang/lib/Driver/ToolChains/Cuda.cpp
@@ -851,7 +851,6 @@ void CudaToolChain::addClangTargetOptions(
HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);

StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
DeviceOffloadingKind == Action::OFK_Cuda) &&
"Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");
4 changes: 2 additions & 2 deletions clang/test/Driver/openmp-offload-gpu.c
@@ -90,8 +90,8 @@
// RUN: %s 2>&1 | FileCheck -check-prefix=CHK-ENV-BCLIB %s

// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
// CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx-sm_52.bc
// CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx-sm_52.bc
// CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx.bc
// CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx.bc
// CHK-BCLIB-NOT: {{error:|warning:}}

/// ###########################################################################
76 changes: 17 additions & 59 deletions offload/DeviceRTL/CMakeLists.txt
@@ -42,43 +42,6 @@ set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR})
set(include_directory ${devicertl_base_directory}/include)
set(source_directory ${devicertl_base_directory}/src)

set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803"
"gfx9-generic;gfx900;gfx902;gfx906;gfx908"
"gfx90a;gfx90c"
"gfx9-4-generic;gfx940;gfx941;gfx942;gfx950"
"gfx10-1-generic;gfx1010;gfx1012"
"gfx10-3-generic;gfx1030;gfx1031;gfx1032;gfx1033"
"gfx1034;gfx1035;gfx1036"
"gfx11-generic;gfx1100;gfx1101;gfx1102;gfx1103"
"gfx1150;gfx1151;gfx1152;gfx1153"
"gfx12-generic")
set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
"sm_70;sm_72;sm_75;sm_80;sm_86;sm_87;sm_89;sm_90")
set(all_gpu_architectures
"${all_amdgpu_architectures};${all_nvptx_architectures}")

set(LIBOMPTARGET_DEVICE_ARCHITECTURES "all" CACHE STRING
"List of device architectures to be used to compile the OpenMP DeviceRTL.")

if(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "all")
set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_gpu_architectures})
elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "amdgpu")
set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_amdgpu_architectures})
elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "nvptx")
set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_nvptx_architectures})
elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "native")
if(NOT LIBOMPTARGET_NVPTX_ARCH AND NOT LIBOMPTARGET_AMDGPU_ARCH)
message(FATAL_ERROR
"Could not find 'amdgpu-arch' and 'nvptx-arch' tools required for 'auto'")
elseif(NOT LIBOMPTARGET_FOUND_NVIDIA_GPU AND NOT LIBOMPTARGET_FOUND_AMDGPU_GPU)
message(FATAL_ERROR "No AMD or NVIDIA GPU found on the system when using 'auto'")
endif()
set(LIBOMPTARGET_DEVICE_ARCHITECTURES
"${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
endif()
list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)

set(include_files
${include_directory}/Allocator.h
${include_directory}/Configuration.h
@@ -146,20 +109,22 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden

# first create an object target
add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
function(compileDeviceRTLLibrary target_cpu target_name target_triple)
function(compileDeviceRTLLibrary target_name target_triple)
set(target_bc_flags ${ARGN})

set(bc_files "")
foreach(src ${src_files})
get_filename_component(infile ${src} ABSOLUTE)
get_filename_component(outfile ${src} NAME)
set(outfile "${outfile}-${target_cpu}.bc")
set(outfile "${outfile}-${target_name}.bc")
set(depfile "${outfile}.d")

# Passing an empty CPU to -march= suppresses target-specific metadata, so the
# emitted IR targets the bare triple and stays generic across GPU generations.
add_custom_command(OUTPUT ${outfile}
COMMAND ${CLANG_TOOL}
${bc_flags}
--offload-arch=${target_cpu}
-fopenmp-targets=${target_triple}
-Xopenmp-target=${target_triple} -march=
Review thread:

> Contributor: and nothing after -march=?
> Author: Intentional.
> Contributor: So that means there is no arch?

${target_bc_flags}
-MD -MF ${depfile}
${infile} -o ${outfile}
@@ -182,7 +147,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
list(APPEND bc_files ${outfile})
endforeach()

set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
set(bclib_name "libomptarget-${target_name}.bc")

# Link to a bitcode library.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
@@ -222,7 +187,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
APPEND)
endif()

set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc")
set(bclib_target_name "omptarget-${target_name}-bc")
add_custom_target(${bclib_target_name} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})

# Copy library to destination.
@@ -244,7 +209,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
# Package the bitcode and embed it in an ELF for the static library
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
COMMAND ${PACKAGER_TOOL} -o ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
"--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=${target_cpu},kind=openmp"
"--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=generic,kind=openmp"
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
COMMENT "Packaging LLVM offloading binary ${bclib_name}.out"
)
@@ -254,14 +219,14 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
APPEND)
endif()

set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}-${target_cpu}.o")
set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}.o")
add_custom_command(OUTPUT ${output_name}
COMMAND ${CLANG_TOOL} --std=c++17 -c -nostdlib
-Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
-o ${output_name}
${source_directory}/Stub.cpp
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name} ${source_directory}/Stub.cpp
COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}-${target_cpu}.o"
COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}.o"
VERBATIM
)
if(TARGET clang)
@@ -274,11 +239,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
set_property(TARGET omptarget.devicertl.all_objs APPEND PROPERTY IMPORTED_OBJECTS ${output_name})

if (CMAKE_EXPORT_COMPILE_COMMANDS)
set(ide_target_name omptarget-ide-${target_name}-${target_cpu})
set(ide_target_name omptarget-ide-${target_name})
add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
target_compile_options(${ide_target_name} PRIVATE
-fopenmp --offload-arch=${target_cpu} -fopenmp-cuda-mode
-mllvm -openmp-opt-disable
-fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
-fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
-foffload-lto -fvisibility=hidden --offload-device-only
-nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
)
@@ -293,18 +258,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
endif()
endfunction()

# Generate a Bitcode library for all the gpu architectures the user requested.
add_custom_target(omptarget.devicertl.nvptx)
add_custom_target(omptarget.devicertl.amdgpu)
foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
else()
message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
endif()
endforeach()
compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)

add_custom_target(omptarget.devicertl.nvptx)
compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
Review thread on lines +262 to +265:

> Member: Can we have an option to disable either of amdgpu/nvptx specifically? If anything, because it feels weird having a choice between the amdgpu and nvptx plugins but no choice between devicertl variants.
> Author: I didn't feel it was strictly necessary, since if you can build one you can build the other. The only thing you save is disk space, but if you feel it's really needed I can re-use the architectures mechanism or something similar.
> Member: I don't exactly feel strongly about it. It's just that we already provide a switch for amdgpu/nvptx in Gentoo, so it only feels natural for this to respect it. Also, on a semi-related matter: is there a reason we're installing both the separate .bc files and libomptarget.devicertl.a?
> Author: This is really over-complicated, but right now we only use the .bc file for non-LTO NVPTX compilations, where it is linked into each TU (in a rather broken way) via -mlink-builtin-bitcode. We do that because compilation would get really slow if we didn't optimize out calls to the runtime.
> Member: OK, so it's not accidental/obsolete; that's all I needed to know :-). Another thing I've noticed is that clang needs libomptarget.devicertl.a even when no GPU target is actually used. I've worked around that for now by creating an empty .a file locally, but perhaps that should be handled better somehow (I can make a PR later if you tell me how you'd like it handled).
> Author: I suppose that happens for the CPU targets? Those aren't really used anywhere real, they're just for testing, but even so we probably shouldn't leave them broken. We link the device RTL unconditionally because the link step shouldn't need to know which architectures were used to compile it. Since these are static libraries, their members aren't extracted unless they're needed, so an unused library doesn't hurt anything. You can still end up in a situation where the user intentionally doesn't build it, but that's non-default behavior, so I've never given it much thought.
> Member: Yeah, precisely. I noticed this while building with -DLIBOMPTARGET_BUILD_DEVICERTL_BCLIB=OFF -DLIBOMPTARGET_PLUGINS_TO_BUILD=host to run tests on CPU only.


# Archive all the object files generated above into a static library
add_library(omptarget.devicertl STATIC)
10 changes: 1 addition & 9 deletions offload/DeviceRTL/src/Misc.cpp
@@ -39,15 +39,7 @@ double getWTick() {
}

double getWTime() {
uint64_t NumTicks = 0;
if constexpr (__has_builtin(__builtin_amdgcn_s_sendmsg_rtnl))
NumTicks = __builtin_amdgcn_s_sendmsg_rtnl(0x83);
else if constexpr (__has_builtin(__builtin_amdgcn_s_memrealtime))
NumTicks = __builtin_amdgcn_s_memrealtime();
else if constexpr (__has_builtin(__builtin_amdgcn_s_memtime))
NumTicks = __builtin_amdgcn_s_memtime();

return static_cast<double>(NumTicks) * getWTick();
return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
}

#pragma omp end declare variant
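
The replacement relies on __builtin_readsteadycounter(), clang's portable way to read a monotonic hardware counter, instead of probing for AMDGPU-specific builtins. A hedged host-side analogue of the same computation, with std::chrono standing in for the GPU builtin and getWTick():

#include <chrono>

// Same shape as getWTime() above: read a steady (monotonic) counter and
// scale it by the tick length in seconds.
double hostWTime() {
  using Clock = std::chrono::steady_clock;
  auto Ticks = Clock::now().time_since_epoch().count();
  double Tick =
      static_cast<double>(Clock::period::num) / Clock::period::den;
  return static_cast<double>(Ticks) * Tick;
}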
91 changes: 46 additions & 45 deletions offload/DeviceRTL/src/Reduction.cpp
@@ -44,7 +44,6 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
}
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
ShuffleReductFnTy shflFct) {
uint32_t size, remote_id, physical_lane_id;
Expand All @@ -63,7 +62,6 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
} while (logical_lane_id % 2 == 0 && size > 1);
return (logical_lane_id == 0);
}
#endif

static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
ShuffleReductFnTy shflFct,
@@ -74,49 +72,53 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
uint32_t NumThreads = omp_get_num_threads();
if (NumThreads == 1)
return 1;
/*
* This reduce function handles reduction within a team. It handles
* parallel regions in both L1 and L2 parallelism levels. It also
* supports Generic, SPMD, and NoOMP modes.
*
* 1. Reduce within a warp.
* 2. Warp master copies value to warp 0 via shared memory.
* 3. Warp 0 reduces to a single value.
* 4. The reduced value is available in the thread that returns 1.
*/

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
uint32_t WarpsNeeded =
(NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
uint32_t WarpId = mapping::getWarpIdInBlock();

// Volta execution model:
// For the Generic execution mode a parallel region either has 1 thread and
// beyond that, always a multiple of 32. For the SPMD execution mode we may
// have any number of threads.
if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
gpu_irregular_warp_reduce(reduce_data, shflFct,
/*LaneCount=*/NumThreads % mapping::getWarpSize(),
/*LaneId=*/mapping::getThreadIdInBlock() %
mapping::getWarpSize());

// When we have more than [mapping::getWarpSize()] number of threads
// a block reduction is performed here.
//
// Only L1 parallel region can enter this if condition.
if (NumThreads > mapping::getWarpSize()) {
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);
//
// This reduce function handles reduction within a team. It handles
// parallel regions in both L1 and L2 parallelism levels. It also
// supports Generic, SPMD, and NoOMP modes.
//
// 1. Reduce within a warp.
// 2. Warp master copies value to warp 0 via shared memory.
// 3. Warp 0 reduces to a single value.
// 4. The reduced value is available in the thread that returns 1.
//

if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
BlockThreadId);
#if __has_builtin(__nvvm_reflect)
if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
Review thread:

> Contributor: I'll try to make an AMDGPU counterpart for this one, though it doesn't look necessary for the purposes of the OpenMP device runtime.
> Author: I believe @AlexVlx also has interest in that space, so best consult with him as well.
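
For context, a hedged sketch of the __nvvm_reflect idiom used in this hunk: the call is folded to a constant (e.g. 700 for sm_70) by the NVPTX backend's NVVMReflect pass once the generic IR is finalized for a concrete GPU, so the losing branch is deleted from every final image. The extern declaration below is illustrative of how the hook is conventionally exposed, not the exact DeviceRTL interface:

// Folded to a constant per concrete target; the dead branch is removed.
extern "C" int __nvvm_reflect(const char *);

static int reductionPath() {
  if (__nvvm_reflect("__CUDA_ARCH") >= 700)
    return 1; // Volta and newer: independent thread scheduling.
  return 0;   // Older targets: lock-step warp execution.
}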

uint32_t WarpsNeeded =
(NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
uint32_t WarpId = mapping::getWarpIdInBlock();

// Volta execution model:
// For the Generic execution mode a parallel region either has 1 thread and
// beyond that, always a multiple of 32. For the SPMD execution mode we may
// have any number of threads.
if ((NumThreads % mapping::getWarpSize() == 0) ||
(WarpId < WarpsNeeded - 1))
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
gpu_irregular_warp_reduce(
reduce_data, shflFct,
/*LaneCount=*/NumThreads % mapping::getWarpSize(),
/*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());

// When we have more than [mapping::getWarpSize()] number of threads
// a block reduction is performed here.
//
// Only L1 parallel region can enter this if condition.
if (NumThreads > mapping::getWarpSize()) {
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);

if (WarpId == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
BlockThreadId);
}
return BlockThreadId == 0;
}
return BlockThreadId == 0;
#else
#endif
__kmpc_impl_lanemask_t Liveness = mapping::activemask();
if (Liveness == lanes::All) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
@@ -150,10 +152,9 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
return BlockThreadId == 0;
}

// Get the OMP thread Id. This is different from BlockThreadId in the case of
// an L2 parallel region.
// Get the OMP thread Id. This is different from BlockThreadId in the case
// of an L2 parallel region.
return BlockThreadId == 0;
#endif // __CUDA_ARCH__ >= 700
}

uint32_t roundToWarpsize(uint32_t s) {
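
The block comment in the hunk above lists the four steps of the team reduction. A hedged, target-neutral sketch of step 1 (the in-warp tree reduction); shuffleDown is an assumed stand-in for the GPU warp shuffle that the real code reaches through the shflFct callback:

// Lane i accumulates the value from lane i+Delta; halving Delta each
// round leaves the full warp sum in lane 0 after log2(WarpSize) steps.
template <typename T, typename ShuffleDown>
T warpReduceSum(T Val, ShuffleDown shuffleDown, unsigned WarpSize = 32) {
  for (unsigned Delta = WarpSize / 2; Delta > 0; Delta /= 2)
    Val += shuffleDown(Val, Delta);
  return Val; // meaningful in lane 0 (the "warp master" of step 2)
}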
6 changes: 6 additions & 0 deletions openmp/docs/ReleaseNotes.rst
@@ -19,3 +19,9 @@ from the `LLVM releases web site <https://llvm.org/releases/>`_.

Non-comprehensive list of changes in this release
=================================================

Device Runtime
--------------
- Changed the OpenMP DeviceRTL to use 'generic' IR. The
  ``LIBOMPTARGET_DEVICE_ARCHITECTURES`` CMake argument is now unused; support
  for the AMDGPU and NVPTX targets is always built.