Skip to content

Commit 2e911ac

Browse files
committed
[OpenMP] Port the OpenMP device runtime to direct C++ compilation
Summary: This removes the use of OpenMP offloading to build the device runtime. The main benefit is that we no longer need to rely on offloading semantics to build a device-only runtime. Things like `declare variant` are no longer needed and can simply be `#ifdef`s. In the future, I will remove most of the special handling here and fold it into calls to the `<gpuintrin.h>` functions instead. Additionally, I will rework the compilation to make this a separate runtime. The current plan is to have this, but to make including OpenMP and offloading either add it automatically or print a warning if it's missing. This will allow us to use a normal CMake workflow and delete all the weird 'let's pull the clang binary out of the build' business. ``` -DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=offload -DLLVM_RUNTIME_TARGETS=amdgcn-amd-amdhsa ``` After that, linking the OpenMP device runtime will be `-Xoffload-linker -lomp`, i.e. no more fat binary business.
1 parent 0c71fdd commit 2e911ac

23 files changed

+34
-184
lines changed

offload/DeviceRTL/CMakeLists.txt

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,10 @@ set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
9595
list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")
9696

9797
# Set flags for LLVM Bitcode compilation.
98-
set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
99-
${clang_opt_flags} --offload-device-only
100-
-nocudalib -nogpulib -nogpuinc -nostdlibinc
101-
-fopenmp -fopenmp-cuda-mode
102-
-Wno-unknown-cuda-version -Wno-openmp-target
98+
set(bc_flags -c -flto -std=c++17 -fvisibility=hidden
99+
${clang_opt_flags} -nogpulib -nostdlibinc
100+
-fno-rtti -fno-exceptions -fconvergent-functions
101+
-Wno-unknown-cuda-version
103102
-DOMPTARGET_DEVICE_RUNTIME
104103
-I${include_directory}
105104
-I${devicertl_base_directory}/../include
@@ -123,8 +122,7 @@ function(compileDeviceRTLLibrary target_name target_triple)
123122
add_custom_command(OUTPUT ${outfile}
124123
COMMAND ${CLANG_TOOL}
125124
${bc_flags}
126-
-fopenmp-targets=${target_triple}
127-
-Xopenmp-target=${target_triple} -march=
125+
--target=${target_triple}
128126
${target_bc_flags}
129127
-MD -MF ${depfile}
130128
${infile} -o ${outfile}
@@ -242,10 +240,8 @@ function(compileDeviceRTLLibrary target_name target_triple)
242240
set(ide_target_name omptarget-ide-${target_name})
243241
add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
244242
target_compile_options(${ide_target_name} PRIVATE
245-
-fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
246-
-fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
247-
-foffload-lto -fvisibility=hidden --offload-device-only
248-
-nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
243+
-fvisibility=hidden --target=${target_triple}
244+
-nogpulib -nostdlibinc -Wno-unknown-cuda-version
249245
)
250246
target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512)
251247
target_include_directories(${ide_target_name} PRIVATE

offload/DeviceRTL/include/Allocator.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717
// Forward declaration.
1818
struct KernelEnvironmentTy;
1919

20-
#pragma omp begin declare target device_type(nohost)
21-
2220
namespace ompx {
2321

2422
namespace allocator {
@@ -44,6 +42,4 @@ extern "C" {
4442
[[gnu::weak]] void free(void *Ptr);
4543
}
4644

47-
#pragma omp end declare target
48-
4945
#endif

offload/DeviceRTL/include/DeviceTypes.h

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -99,14 +99,7 @@ struct TaskDescriptorTy {
9999
TaskFnTy TaskFn;
100100
};
101101

102-
#pragma omp begin declare variant match(device = {arch(amdgcn)})
103102
using LaneMaskTy = uint64_t;
104-
#pragma omp end declare variant
105-
106-
#pragma omp begin declare variant match( \
107-
device = {arch(amdgcn)}, implementation = {extension(match_none)})
108-
using LaneMaskTy = uint64_t;
109-
#pragma omp end declare variant
110103

111104
namespace lanes {
112105
enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
@@ -163,8 +156,7 @@ typedef enum omp_allocator_handle_t {
163156
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
164157

165158
#define SHARED(NAME) \
166-
NAME [[clang::loader_uninitialized]]; \
167-
OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
159+
[[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
168160

169161
// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
170162
// now that's not the case.

offload/DeviceRTL/include/DeviceUtils.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
#include "DeviceTypes.h"
1616
#include "Shared/Utils.h"
1717

18-
#pragma omp begin declare target device_type(nohost)
19-
2018
namespace utils {
2119

2220
template <typename T> struct type_identity {
@@ -95,6 +93,4 @@ bool isThreadLocalMemPtr(void *Ptr);
9593

9694
} // namespace utils
9795

98-
#pragma omp end declare target
99-
10096
#endif

offload/DeviceRTL/include/Mapping.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,8 @@ enum {
2424
DIM_Z = 2,
2525
};
2626

27-
#pragma omp begin declare target device_type(nohost)
28-
2927
inline constexpr uint32_t MaxThreadsPerTeam = 1024;
3028

31-
#pragma omp end declare target
32-
3329
/// Initialize the mapping machinery.
3430
void init(bool IsSPMD);
3531

offload/DeviceRTL/include/State.h

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@
2222
// Forward declaration.
2323
struct KernelEnvironmentTy;
2424

25-
#pragma omp begin declare target device_type(nohost)
26-
2725
namespace ompx {
2826

2927
namespace memory {
@@ -88,8 +86,7 @@ struct TeamStateTy {
8886
ParallelRegionFnTy ParallelRegionFnVar;
8987
};
9088

91-
extern TeamStateTy TeamState;
92-
#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)
89+
extern TeamStateTy [[clang::address_space(3)]] TeamState;
9390

9491
struct ThreadStateTy {
9592

@@ -115,8 +112,7 @@ struct ThreadStateTy {
115112
}
116113
};
117114

118-
extern ThreadStateTy **ThreadStates;
119-
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
115+
extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates;
120116

121117
/// Initialize the state machinery. Must be called by all threads.
122118
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
@@ -378,6 +374,4 @@ inline state::Value<uint32_t, state::VK_RunSched> RunSched;
378374

379375
} // namespace ompx
380376

381-
#pragma omp end declare target
382-
383377
#endif

offload/DeviceRTL/include/Synchronization.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
#include "DeviceTypes.h"
1616
#include "DeviceUtils.h"
1717

18-
#pragma omp begin declare target device_type(nohost)
19-
2018
namespace ompx {
2119
namespace atomic {
2220

@@ -220,6 +218,4 @@ void system(atomic::OrderingTy Ordering);
220218

221219
} // namespace ompx
222220

223-
#pragma omp end declare target
224-
225221
#endif

offload/DeviceRTL/include/Workshare.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212
#ifndef OMPTARGET_WORKSHARE_H
1313
#define OMPTARGET_WORKSHARE_H
1414

15-
#pragma omp begin declare target device_type(nohost)
16-
1715
namespace ompx {
1816

1917
namespace workshare {
@@ -25,6 +23,4 @@ void init(bool IsSPMD);
2523

2624
} // namespace ompx
2725

28-
#pragma omp end declare target
29-
3026
#endif

offload/DeviceRTL/src/Allocator.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@
1919

2020
using namespace ompx;
2121

22-
#pragma omp begin declare target device_type(nohost)
23-
2422
[[gnu::used, gnu::retain, gnu::weak,
2523
gnu::visibility(
2624
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
@@ -77,5 +75,3 @@ void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
7775
void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
7876

7977
///}
80-
81-
#pragma omp end declare target

offload/DeviceRTL/src/Configuration.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717

1818
using namespace ompx;
1919

20-
#pragma omp begin declare target device_type(nohost)
21-
2220
// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
2321
[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
2422
[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
@@ -85,5 +83,3 @@ bool config::mayUseNestedParallelism() {
8583
return false;
8684
return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
8785
}
88-
89-
#pragma omp end declare target

offload/DeviceRTL/src/Debug.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121

2222
using namespace ompx;
2323

24-
#pragma omp begin declare target device_type(nohost)
25-
2624
extern "C" {
2725
void __assert_assume(bool condition) { __builtin_assume(condition); }
2826

@@ -44,5 +42,3 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
4442
__builtin_trap();
4543
}
4644
}
47-
48-
#pragma omp end declare target

offload/DeviceRTL/src/DeviceUtils.cpp

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,10 @@
1515
#include "Interface.h"
1616
#include "Mapping.h"
1717

18-
#pragma omp begin declare target device_type(nohost)
19-
2018
using namespace ompx;
2119

2220
namespace impl {
2321

24-
bool isSharedMemPtr(const void *Ptr) { return false; }
25-
2622
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
2723
static_assert(sizeof(unsigned long) == 8, "");
2824
*LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
@@ -42,7 +38,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred);
4238
/// AMDGCN Implementation
4339
///
4440
///{
45-
#pragma omp begin declare variant match(device = {arch(amdgcn)})
41+
#ifdef __AMDGPU__
4642

4743
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
4844
int Self = mapping::getThreadIdInWarp();
@@ -66,15 +62,13 @@ bool isSharedMemPtr(const void *Ptr) {
6662
return __builtin_amdgcn_is_shared(
6763
(const __attribute__((address_space(0))) void *)Ptr);
6864
}
69-
#pragma omp end declare variant
65+
#endif
7066
///}
7167

7268
/// NVPTX Implementation
7369
///
7470
///{
75-
#pragma omp begin declare variant match( \
76-
device = {arch(nvptx, nvptx64)}, \
77-
implementation = {extension(match_any)})
71+
#ifdef __NVPTX__
7872

7973
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
8074
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
@@ -91,7 +85,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
9185

9286
bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
9387

94-
#pragma omp end declare variant
88+
#endif
9589
///}
9690
} // namespace impl
9791

@@ -137,5 +131,3 @@ int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
137131
return utils::shuffleDown(lanes::All, Val, Delta, Width);
138132
}
139133
}
140-
141-
#pragma omp end declare target

offload/DeviceRTL/src/Kernel.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@
2525

2626
using namespace ompx;
2727

28-
#pragma omp begin declare target device_type(nohost)
29-
3028
static void
3129
inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
3230
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
@@ -155,5 +153,3 @@ void __kmpc_target_deinit() {
155153

156154
int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
157155
}
158-
159-
#pragma omp end declare target

offload/DeviceRTL/src/LibC.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88

99
#include "LibC.h"
1010

11-
#pragma omp begin declare target device_type(nohost)
12-
1311
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
1412
extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; }
1513
#else
@@ -48,5 +46,3 @@ namespace ompx {
4846
return ::vprintf(Format, vlist);
4947
}
5048
} // namespace ompx
51-
52-
#pragma omp end declare target

offload/DeviceRTL/src/Mapping.cpp

Lines changed: 5 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,33 +15,17 @@
1515
#include "Interface.h"
1616
#include "State.h"
1717

18-
#pragma omp begin declare target device_type(nohost)
19-
2018
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
2119

2220
using namespace ompx;
2321

2422
namespace ompx {
2523
namespace impl {
2624

27-
// Forward declarations defined to be defined for AMDGCN and NVPTX.
28-
LaneMaskTy activemask();
29-
LaneMaskTy lanemaskLT();
30-
LaneMaskTy lanemaskGT();
31-
uint32_t getThreadIdInWarp();
32-
uint32_t getThreadIdInBlock(int32_t Dim);
33-
uint32_t getNumberOfThreadsInBlock(int32_t Dim);
34-
uint32_t getNumberOfThreadsInKernel();
35-
uint32_t getBlockIdInKernel(int32_t Dim);
36-
uint32_t getNumberOfBlocksInKernel(int32_t Dim);
37-
uint32_t getWarpIdInBlock();
38-
uint32_t getNumberOfWarpsInBlock();
39-
uint32_t getWarpSize();
40-
4125
/// AMDGCN Implementation
4226
///
4327
///{
44-
#pragma omp begin declare variant match(device = {arch(amdgcn)})
28+
#ifdef __AMDGPU__
4529

4630
uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
4731

@@ -128,15 +112,13 @@ uint32_t getNumberOfWarpsInBlock() {
128112
return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
129113
}
130114

131-
#pragma omp end declare variant
115+
#endif
132116
///}
133117

134118
/// NVPTX Implementation
135119
///
136120
///{
137-
#pragma omp begin declare variant match( \
138-
device = {arch(nvptx, nvptx64)}, \
139-
implementation = {extension(match_any)})
121+
#ifdef __NVPTX__
140122

141123
uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
142124
switch (Dim) {
@@ -214,7 +196,7 @@ uint32_t getNumberOfWarpsInBlock() {
214196
mapping::getWarpSize();
215197
}
216198

217-
#pragma omp end declare variant
199+
#endif
218200
///}
219201

220202
} // namespace impl
@@ -376,7 +358,7 @@ float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
376358
}
377359

378360
long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
379-
return utils::shuffleDown(mask, var, delta, width);
361+
return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width);
380362
}
381363

382364
double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
@@ -385,5 +367,3 @@ double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
385367
utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width));
386368
}
387369
}
388-
389-
#pragma omp end declare target

0 commit comments

Comments
 (0)