Skip to content

Commit b4534dd

Browse files
authored
Merge pull request llvm#490 from AMD-Lightning-Internal/amd/dev/rlieberm/jhuber-preland-123673
preland 123673 [OpenMP] Port the OpenMP device runtime to direct C++ compilation
2 parents b1c8b75 + f120a43 commit b4534dd

31 files changed

+56
-245
lines changed

clang/lib/Headers/__clang_hip_math.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,11 @@
2121
#include <limits.h>
2222
#include <stdint.h>
2323
#ifdef __OPENMP_AMDGCN__
24+
// FIXME: A hack for the OpenMP DeviceRTL's `LibM.h` that should be removed.
25+
#ifndef __OPENMP_SKIP_INCLUDE__
2426
#include <omp.h>
2527
#endif
28+
#endif
2629
#endif // !defined(__HIPCC_RTC__)
2730

2831
// __DEVICE__ is a helper macro with common set of attributes for the wrappers

offload/DeviceRTL/CMakeLists.txt

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ set(src_files
9090
# propagation. That said, we will run the vectorizer again after the runtime
9191
# has been linked into the user program.
9292
set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512 -mllvm -vectorize-slp=false )
93-
set(link_opt_flags -O3 -openmp-opt-disable -attributor-enable=module -vectorize-slp=false )
93+
set(link_opt_flags -O3 -openmp-opt-disable -attributor-enable=module -vectorize-slp=false )
9494
set(link_export_flag -passes=internalize -internalize-public-api-file=${source_directory}/exports)
9595

9696
# If the user built with the GPU C library enabled we will use that instead.
@@ -103,11 +103,10 @@ set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
103103
list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")
104104

105105
# Set flags for LLVM Bitcode compilation.
106-
set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
107-
${clang_opt_flags} --offload-device-only
108-
-nocudalib -nogpulib -nogpuinc
109-
-fopenmp -fopenmp-cuda-mode
110-
-Wno-unknown-cuda-version -Wno-openmp-target
106+
set(bc_flags -c -flto -std=c++17 -fvisibility=hidden
107+
${clang_opt_flags} -nocudalib -nogpulib
108+
-nogpuinc -nostdlibinc -Wno-unknown-cuda-version
109+
-fno-rtti -fno-exceptions -fconvergent-functions
111110
-I${CMAKE_BINARY_DIR}/openmp/runtime/src # Need omp.h for LibM.
112111
-I${CMAKE_BINARY_DIR}/projects/openmp/runtime/src # Need omp.h for LibM.
113112
-I${CMAKE_BINARY_DIR}/runtime/src
@@ -139,9 +138,8 @@ function(compileDeviceRTLLibrary target_name target_triple)
139138
add_custom_target(${outfile}
140139
COMMAND ${CLANG_TOOL}
141140
${bc_flags}
142-
-fopenmp-targets=${target_triple}
143-
-Xopenmp-target=${target_triple} -march=
144141
${target_bc_flags}
142+
--target=${target_triple}
145143
-MD -MF ${depfile}
146144
${infile} -o ${outfile}
147145
DEPENDS ${infile} ${include_files}
@@ -310,10 +308,7 @@ function(compileDeviceRTLLibrary target_name target_triple)
310308
set(ide_target_name omptarget-ide-${target_name})
311309
add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
312310
target_compile_options(${ide_target_name} PRIVATE
313-
-fopenmp -fopenmp-cuda-mode
314-
-fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
315-
-mllvm -openmp-opt-disable
316-
-foffload-lto -fvisibility=hidden --offload-device-only
311+
--target=${target_triple} -flto -fvisibility=hidden
317312
-nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
318313
)
319314
target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512)

offload/DeviceRTL/include/Allocator.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717
// Forward declaration.
1818
struct KernelEnvironmentTy;
1919

20-
#pragma omp begin declare target device_type(nohost)
21-
2220
namespace ompx {
2321

2422
namespace allocator {
@@ -44,6 +42,4 @@ extern "C" {
4442
[[gnu::weak]] void free(void *Ptr);
4543
}
4644

47-
#pragma omp end declare target
48-
4945
#endif

offload/DeviceRTL/include/DevRTLExtras.h

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,7 @@
1313
#ifndef OPENMP_LIBOMPTARGET_DEVICERTL_INCLUDE_DEVRTLEXTRAS_H
1414
#define OPENMP_LIBOMPTARGET_DEVICERTL_INCLUDE_DEVRTLEXTRAS_H
1515

16-
/// Base type declarations for freestanding mode
17-
///
18-
///{
19-
using uint64_t = unsigned long;
20-
// TODO: Properly implement this
21-
using uintptr_t = uint64_t;
22-
///}
16+
#include <stdint.h>
2317

2418
/// Macros for allocating variables in different address spaces.
2519
///{

offload/DeviceRTL/include/DeviceTypes.h

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -100,14 +100,7 @@ struct TaskDescriptorTy {
100100
TaskFnTy TaskFn;
101101
};
102102

103-
#pragma omp begin declare variant match(device = {arch(amdgcn)})
104103
using LaneMaskTy = uint64_t;
105-
#pragma omp end declare variant
106-
107-
#pragma omp begin declare variant match( \
108-
device = {arch(amdgcn)}, implementation = {extension(match_none)})
109-
using LaneMaskTy = uint64_t;
110-
#pragma omp end declare variant
111104

112105
namespace lanes {
113106
enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
@@ -164,8 +157,7 @@ typedef enum omp_allocator_handle_t {
164157
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
165158

166159
#define SHARED(NAME) \
167-
NAME [[clang::loader_uninitialized]]; \
168-
OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
160+
[[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
169161

170162
// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
171163
// now that's not the case.

offload/DeviceRTL/include/DeviceUtils.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
#include "DeviceTypes.h"
1616
#include "Shared/Utils.h"
1717

18-
#pragma omp begin declare target device_type(nohost)
19-
2018
namespace utils {
2119

2220
template <typename T> struct type_identity {
@@ -95,6 +93,4 @@ bool isThreadLocalMemPtr(void *Ptr);
9593

9694
} // namespace utils
9795

98-
#pragma omp end declare target
99-
10096
#endif

offload/DeviceRTL/include/Mapping.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,8 @@ enum {
2424
DIM_Z = 2,
2525
};
2626

27-
#pragma omp begin declare target device_type(nohost)
28-
2927
inline constexpr uint32_t MaxThreadsPerTeam = 1024;
3028

31-
#pragma omp end declare target
32-
3329
/// Initialize the mapping machinery.
3430
void init(bool IsSPMD);
3531

offload/DeviceRTL/include/State.h

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323
// Forward declaration.
2424
struct KernelEnvironmentTy;
2525

26-
#pragma omp begin declare target device_type(nohost)
27-
2826
namespace ompx {
2927

3028
namespace memory {
@@ -89,8 +87,7 @@ struct TeamStateTy {
8987
ParallelRegionFnTy ParallelRegionFnVar;
9088
};
9189

92-
extern TeamStateTy TeamState;
93-
#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)
90+
extern TeamStateTy [[clang::address_space(3)]] TeamState;
9491

9592
struct ThreadStateTy {
9693

@@ -116,8 +113,7 @@ struct ThreadStateTy {
116113
}
117114
};
118115

119-
extern ThreadStateTy **ThreadStates;
120-
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
116+
extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates;
121117

122118
/// Initialize the state machinery. Must be called by all threads.
123119
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
@@ -379,6 +375,4 @@ inline state::Value<uint32_t, state::VK_RunSched> RunSched;
379375

380376
} // namespace ompx
381377

382-
#pragma omp end declare target
383-
384378
#endif

offload/DeviceRTL/include/Synchronization.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
#include "DeviceTypes.h"
1616
#include "DeviceUtils.h"
1717

18-
#pragma omp begin declare target device_type(nohost)
19-
2018
namespace ompx {
2119
namespace atomic {
2220

@@ -221,6 +219,4 @@ void system(atomic::OrderingTy Ordering);
221219

222220
} // namespace ompx
223221

224-
#pragma omp end declare target
225-
226222
#endif

offload/DeviceRTL/include/Workshare.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212
#ifndef OMPTARGET_WORKSHARE_H
1313
#define OMPTARGET_WORKSHARE_H
1414

15-
#pragma omp begin declare target device_type(nohost)
16-
1715
namespace ompx {
1816

1917
namespace workshare {
@@ -25,6 +23,4 @@ void init(bool IsSPMD);
2523

2624
} // namespace ompx
2725

28-
#pragma omp end declare target
29-
3026
#endif

offload/DeviceRTL/include/extra_allocators.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator = omp_null_allocator);
6161
//// Any re-definitions of malloc/free delete the native CUDA
6262
//// but they are necessary
6363
#ifdef __AMDGCN__
64-
void *malloc(uint64_t Size);
64+
void *malloc(size_t Size);
6565
void free(void *Ptr);
6666
#endif
6767
} // extern "C"

offload/DeviceRTL/src/Allocator.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@
1919

2020
using namespace ompx;
2121

22-
#pragma omp begin declare target device_type(nohost)
23-
2422
[[gnu::used, gnu::retain, gnu::weak,
2523
gnu::visibility(
2624
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
@@ -77,5 +75,3 @@ void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
7775
void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
7876

7977
///}
80-
81-
#pragma omp end declare target

offload/DeviceRTL/src/Configuration.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717

1818
using namespace ompx;
1919

20-
#pragma omp begin declare target device_type(nohost)
21-
2220
// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
2321
[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
2422
[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
@@ -85,5 +83,3 @@ bool config::mayUseNestedParallelism() {
8583
return false;
8684
return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
8785
}
88-
89-
#pragma omp end declare target

offload/DeviceRTL/src/Debug.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121

2222
using namespace ompx;
2323

24-
#pragma omp begin declare target device_type(nohost)
25-
2624
extern "C" {
2725
void __assert_assume(bool condition) { __builtin_assume(condition); }
2826

@@ -44,5 +42,3 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
4442
__builtin_trap();
4543
}
4644
}
47-
48-
#pragma omp end declare target

offload/DeviceRTL/src/DeviceUtils.cpp

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
#include "Interface.h"
1616
#include "Mapping.h"
1717

18-
#pragma omp begin declare target device_type(nohost)
19-
2018
using namespace ompx;
2119

2220
extern "C" [[gnu::weak]] int IsSPMDMode;
@@ -33,8 +31,6 @@ __keep_alive() {
3331

3432
namespace impl {
3533

36-
bool isSharedMemPtr(const void *Ptr) { return false; }
37-
3834
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
3935
static_assert(sizeof(unsigned long) == 8, "");
4036
*LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
@@ -54,7 +50,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred);
5450
/// AMDGCN Implementation
5551
///
5652
///{
57-
#pragma omp begin declare variant match(device = {arch(amdgcn)})
53+
#ifdef __AMDGPU__
5854

5955
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
6056
int Self = mapping::getThreadIdInWarp();
@@ -78,15 +74,13 @@ bool isSharedMemPtr(const void *Ptr) {
7874
return __builtin_amdgcn_is_shared(
7975
(const __attribute__((address_space(0))) void *)Ptr);
8076
}
81-
#pragma omp end declare variant
77+
#endif
8278
///}
8379

8480
/// NVPTX Implementation
8581
///
8682
///{
87-
#pragma omp begin declare variant match( \
88-
device = {arch(nvptx, nvptx64)}, \
89-
implementation = {extension(match_any)})
83+
#ifdef __NVPTX__
9084

9185
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
9286
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
@@ -103,7 +97,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
10397

10498
bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
10599

106-
#pragma omp end declare variant
100+
#endif
107101
///}
108102
} // namespace impl
109103

@@ -149,5 +143,3 @@ int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
149143
return utils::shuffleDown(lanes::All, Val, Delta, Width);
150144
}
151145
}
152-
153-
#pragma omp end declare target

offload/DeviceRTL/src/ExtraMapping.cpp

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
#include "Interface.h"
22
#include "Mapping.h"
33

4-
#pragma omp declare target
5-
64
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
75

86
using namespace ompx::mapping;
@@ -13,10 +11,8 @@ namespace impl {
1311
/// AMDGCN Implementation
1412
///
1513
///{
16-
uint32_t __kmpc_impl_smid();
17-
uint32_t getGenericModeMainThreadId();
1814

19-
#pragma omp begin declare variant match(device = {arch(amdgcn)})
15+
#ifdef __AMDGPU__
2016

2117
// Partially derived from hcc_detail/device_functions.h
2218

@@ -75,15 +71,14 @@ static uint32_t getGenericModeMainThreadId() {
7571
return (__kmpc_get_hardware_num_threads_in_block() - 1) & (~Mask);
7672
}
7773

78-
#pragma omp end declare variant
74+
#endif
7975
///}
8076

8177
/// NVPTX Implementation
8278
///
8379
///{
8480

85-
#pragma omp begin declare variant match( \
86-
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
81+
#ifdef __NVPTX__
8782

8883
static uint32_t __kmpc_impl_smid() { return 0; }
8984

@@ -92,7 +87,7 @@ static uint32_t getGenericModeMainThreadId() {
9287
return (__kmpc_get_hardware_num_threads_in_block() - 1) & (~Mask);
9388
}
9489

95-
#pragma omp end declare variant
90+
#endif
9691
///}
9792

9893
} // namespace impl
@@ -113,7 +108,7 @@ int omp_ext_get_lane_id() {
113108
}
114109

115110
int omp_ext_get_smid() {
116-
int rc = impl::__kmpc_impl_smid();
111+
int rc = ompx::mapping::impl::__kmpc_impl_smid();
117112
return rc;
118113
}
119114

@@ -137,5 +132,3 @@ unsigned long long omp_ext_get_active_threads_mask() {
137132
}
138133

139134
} // end extern "C"
140-
141-
#pragma omp end declare target

0 commit comments

Comments
 (0)