Skip to content

Commit ed9107f

Browse files
authored
[OpenMP] Replace use of target address space with <gpuintrin.h> local (#126119)
Summary: This definition is more portable since it defines the correct value for the target. I got rid of the helper mostly because I think it's easy enough to use now that it's a type and being explicit about what's `undef` or `poison` is good.
1 parent 70906f0 commit ed9107f

File tree

8 files changed

+43
-44
lines changed

8 files changed

+43
-44
lines changed

offload/DeviceRTL/include/DeviceTypes.h

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,15 @@
1212
#ifndef OMPTARGET_TYPES_H
1313
#define OMPTARGET_TYPES_H
1414

15+
#include <gpuintrin.h>
1516
#include <stddef.h>
1617
#include <stdint.h>
1718

19+
template <typename T> using Private = __gpu_private T;
20+
template <typename T> using Constant = __gpu_constant T;
21+
template <typename T> using Local = __gpu_local T;
22+
template <typename T> using Global = __gpu_local T;
23+
1824
enum omp_proc_bind_t {
1925
omp_proc_bind_false = 0,
2026
omp_proc_bind_true = 1,
@@ -155,19 +161,6 @@ typedef enum omp_allocator_handle_t {
155161
#define __PRAGMA(STR) _Pragma(#STR)
156162
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
157163

158-
#define SHARED(NAME) \
159-
[[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
160-
161-
// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
162-
// now that's not the case.
163-
#define THREAD_LOCAL(NAME) \
164-
[[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
165-
166-
// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
167-
// does?
168-
#define CONSTANT(NAME) \
169-
[[clang::address_space(4)]] NAME [[clang::loader_uninitialized]]
170-
171164
///}
172165

173166
#endif

offload/DeviceRTL/include/State.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ struct TeamStateTy {
8686
ParallelRegionFnTy ParallelRegionFnVar;
8787
};
8888

89-
extern TeamStateTy [[clang::address_space(3)]] TeamState;
89+
extern Local<TeamStateTy> TeamState;
9090

9191
struct ThreadStateTy {
9292

@@ -112,7 +112,7 @@ struct ThreadStateTy {
112112
}
113113
};
114114

115-
extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates;
115+
extern Local<ThreadStateTy **> ThreadStates;
116116

117117
/// Initialize the state machinery. Must be called by all threads.
118118
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,

offload/DeviceRTL/src/Configuration.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ using namespace ompx;
2828
// This variable should be visible to the plugin so we override the default
2929
// hidden visibility.
3030
[[gnu::used, gnu::retain, gnu::weak,
31-
gnu::visibility("protected")]] DeviceEnvironmentTy
32-
CONSTANT(__omp_rtl_device_environment);
31+
gnu::visibility(
32+
"protected")]] Constant<DeviceEnvironmentTy> __omp_rtl_device_environment;
3333

3434
uint32_t config::getAssumeTeamsOversubscription() {
3535
return __omp_rtl_assume_teams_oversubscription;

offload/DeviceRTL/src/Mapping.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,7 @@ uint32_t mapping::getNumberOfProcessorElements() {
308308

309309
// TODO: This is a workaround for initialization coming from kernels outside of
310310
// the TU. We will need to solve this more correctly in the future.
311-
[[gnu::weak]] int SHARED(IsSPMDMode);
311+
[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode;
312312

313313
void mapping::init(bool IsSPMD) {
314314
if (mapping::isInitialThreadInLevel0(IsSPMD))

offload/DeviceRTL/src/Reduction.cpp

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -71,16 +71,16 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
7171
if (NumThreads == 1)
7272
return 1;
7373

74-
//
75-
// This reduce function handles reduction within a team. It handles
76-
// parallel regions in both L1 and L2 parallelism levels. It also
77-
// supports Generic, SPMD, and NoOMP modes.
78-
//
79-
// 1. Reduce within a warp.
80-
// 2. Warp master copies value to warp 0 via shared memory.
81-
// 3. Warp 0 reduces to a single value.
82-
// 4. The reduced value is available in the thread that returns 1.
83-
//
74+
//
75+
// This reduce function handles reduction within a team. It handles
76+
// parallel regions in both L1 and L2 parallelism levels. It also
77+
// supports Generic, SPMD, and NoOMP modes.
78+
//
79+
// 1. Reduce within a warp.
80+
// 2. Warp master copies value to warp 0 via shared memory.
81+
// 3. Warp 0 reduces to a single value.
82+
// 4. The reduced value is available in the thread that returns 1.
83+
//
8484

8585
#if __has_builtin(__nvvm_reflect)
8686
if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
@@ -196,8 +196,8 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
196196
uint32_t NumThreads = omp_get_num_threads();
197197
uint32_t TeamId = omp_get_team_num();
198198
uint32_t NumTeams = omp_get_num_teams();
199-
static unsigned SHARED(Bound);
200-
static unsigned SHARED(ChunkTeamCount);
199+
[[clang::loader_uninitialized]] static Local<unsigned> Bound;
200+
[[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;
201201

202202
// Block progress for teams greater than the current upper
203203
// limit. We always only allow a number of teams less or equal

offload/DeviceRTL/src/State.cpp

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,17 @@ using namespace ompx;
2828
///{
2929

3030
/// External symbol to access dynamic shared memory.
31-
[[gnu::aligned(allocator::ALIGNMENT)]] extern unsigned char
32-
[[clang::address_space(3)]] DynamicSharedBuffer[];
31+
[[gnu::aligned(
32+
allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[];
3333

3434
/// The kernel environment passed to the init method by the compiler.
35-
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
35+
[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *>
36+
KernelEnvironmentPtr;
3637

3738
/// The kernel launch environment passed as argument to the kernel by the
3839
/// runtime.
39-
static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
40+
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
41+
KernelLaunchEnvironmentPtr;
4042

4143
///}
4244

@@ -108,7 +110,8 @@ static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
108110
"Shared scratchpad of this size not supported yet.");
109111

110112
/// The allocation of a single shared memory scratchpad.
111-
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
113+
[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy>
114+
SharedMemorySmartStack;
112115

113116
void SharedMemorySmartStackTy::init(bool IsSPMD) {
114117
Usage[mapping::getThreadIdInBlock()] = 0;
@@ -220,8 +223,10 @@ void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
220223
ASSERT(HasThreadState == Other.HasThreadState, nullptr);
221224
}
222225

223-
state::TeamStateTy SHARED(ompx::state::TeamState);
224-
state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
226+
[[clang::loader_uninitialized]] Local<state::TeamStateTy>
227+
ompx::state::TeamState;
228+
[[clang::loader_uninitialized]] Local<state::ThreadStateTy **>
229+
ompx::state::ThreadStates;
225230

226231
namespace {
227232

@@ -449,10 +454,10 @@ void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
449454
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
450455
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
451456

452-
[[clang::loader_uninitialized]] static void *[[clang::address_space(
453-
3)]] SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
454-
[[clang::loader_uninitialized]] static void **[[clang::address_space(
455-
3)]] SharedMemVariableSharingSpacePtr;
457+
[[clang::loader_uninitialized]] static Local<void *>
458+
SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
459+
[[clang::loader_uninitialized]] static Local<void **>
460+
SharedMemVariableSharingSpacePtr;
456461

457462
void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
458463
if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {

offload/DeviceRTL/src/Synchronization.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
6969
}
7070
}
7171

72-
uint32_t SHARED(namedBarrierTracker);
72+
[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker;
7373

7474
void namedBarrierInit() {
7575
// Don't have global ctors, and shared memory is not zero init

offload/DeviceRTL/src/Workshare.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ struct DynamicScheduleTracker {
4545
#define LAST_CHUNK 2
4646

4747
// TODO: This variable is a hack inherited from the old runtime.
48-
static uint64_t SHARED(Cnt);
48+
[[clang::loader_uninitialized]] static Local<uint64_t> Cnt;
4949

5050
template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
5151
////////////////////////////////////////////////////////////////////////////////
@@ -457,7 +457,8 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
457457
//
458458
// __kmpc_dispatch_deinit
459459
//
460-
static DynamicScheduleTracker **SHARED(ThreadDST);
460+
[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **>
461+
ThreadDST;
461462

462463
// Create a new DST, link the current one, and define the new as current.
463464
static DynamicScheduleTracker *pushDST() {

0 commit comments

Comments
 (0)