Skip to content

Commit d12ee28

Browse files
committed
[OpenMP] Simplify the ThreadStackTy for globalization fallback
With D106496, we can make the globalization fallback stack much simpler, and this version doesn't seem to experience the spurious failures and deadlocks we have seen before.

Differential Revision: https://reviews.llvm.org/D106576
1 parent 6ca9693 commit d12ee28

File tree

1 file changed

+31
-75
lines changed

1 file changed

+31
-75
lines changed

openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu

Lines changed: 31 additions & 75 deletions
Original file line number | Diff line number | Diff line change
@@ -21,114 +21,70 @@
2121

2222
static constexpr unsigned MinBytes = 8;
2323

24-
template <unsigned BytesPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
24+
template <unsigned BPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
2525
struct alignas(32) ThreadStackTy {
26-
static constexpr unsigned MaxSize = NThreads * BytesPerThread;
26+
static constexpr unsigned BytesPerThread = BPerThread;
2727
static constexpr unsigned NumThreads = NThreads;
2828
static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;
29-
static constexpr unsigned MaxSizePerWarp = MaxSize / NumWarps;
3029

31-
unsigned char Data[MaxSize];
32-
char Sizes[MaxSize / MinBytes];
33-
char SizeUsage[NumWarps];
34-
char Usage[NumWarps];
30+
unsigned char Data[NumThreads][BytesPerThread];
31+
unsigned char Usage[NumThreads];
3532
};
3633

3734
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack;
3835
#pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)
3936

40-
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 2,
41-
MAX_THREADS_PER_TEAM / 8>
37+
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes,
38+
MAX_THREADS_PER_TEAM / 4>
4239
WorkerSharedStack;
4340
#pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)
4441

45-
template <typename AllocTy>
46-
static void *__kmpc_alloc_for_warp(AllocTy Alloc, unsigned Bytes,
47-
unsigned WarpBytes) {
48-
void *Ptr;
49-
__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
50-
unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
51-
bool IsWarpLeader =
52-
(__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID;
53-
if (IsWarpLeader)
54-
Ptr = Alloc();
55-
// Get address from the first active lane.
56-
int *FP = (int *)&Ptr;
57-
FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], LeaderID);
58-
if (sizeof(Ptr) == 8)
59-
FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], LeaderID);
60-
return (void *)&((char *)(Ptr))[(GetLaneId() - LeaderID) * Bytes];
61-
}
62-
6342
EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
64-
Bytes = Bytes + (Bytes % MinBytes);
43+
size_t AlignedBytes = Bytes + (Bytes % MinBytes);
6544
int TID = __kmpc_get_hardware_thread_id_in_block();
6645
if (__kmpc_is_generic_main_thread(TID)) {
6746
// Main thread alone, use shared memory if space is available.
68-
if (MainSharedStack.Usage[0] + Bytes <= MainSharedStack.MaxSize) {
69-
void *Ptr = &MainSharedStack.Data[MainSharedStack.Usage[0]];
70-
MainSharedStack.Usage[0] += Bytes;
71-
MainSharedStack.Sizes[MainSharedStack.SizeUsage[0]++] = Bytes;
47+
if (MainSharedStack.Usage[0] + AlignedBytes <= MainSharedStack.BytesPerThread) {
48+
void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]];
49+
MainSharedStack.Usage[0] += AlignedBytes;
7250
return Ptr;
7351
}
74-
} else {
75-
int WID = GetWarpId();
76-
unsigned WarpBytes = Bytes * WARPSIZE;
77-
auto AllocSharedStack = [&]() {
78-
unsigned WarpOffset = WID * WorkerSharedStack.MaxSizePerWarp;
79-
void *Ptr =
80-
&WorkerSharedStack.Data[WarpOffset + WorkerSharedStack.Usage[WID]];
81-
WorkerSharedStack.Usage[WID] += WarpBytes;
82-
WorkerSharedStack.Sizes[WorkerSharedStack.SizeUsage[WID]++] = WarpBytes;
52+
} else if (TID < WorkerSharedStack.NumThreads) {
53+
if (WorkerSharedStack.Usage[TID] + AlignedBytes <= WorkerSharedStack.BytesPerThread) {
54+
void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]];
55+
WorkerSharedStack.Usage[TID] += AlignedBytes;
8356
return Ptr;
84-
};
85-
if (TID < WorkerSharedStack.NumThreads &&
86-
WorkerSharedStack.Usage[WID] + WarpBytes <=
87-
WorkerSharedStack.MaxSizePerWarp)
88-
return __kmpc_alloc_for_warp(AllocSharedStack, Bytes, WarpBytes);
57+
}
8958
}
9059
// Fallback to malloc
91-
unsigned WarpBytes = Bytes * WARPSIZE;
92-
auto AllocGlobal = [&] {
93-
return SafeMalloc(WarpBytes, "AllocGlobalFallback");
94-
};
95-
return __kmpc_alloc_for_warp(AllocGlobal, Bytes, WarpBytes);
60+
return SafeMalloc(Bytes, "AllocGlobalFallback");
9661
}
9762

98-
EXTERN void __kmpc_free_shared(void *Ptr, size_t /* Bytes */) {
99-
__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
100-
unsigned LeaderID = __kmpc_impl_ffs(CurActive) - 1;
101-
bool IsWarpLeader =
102-
(__kmpc_get_hardware_thread_id_in_block() % WARPSIZE) == LeaderID;
103-
__kmpc_syncwarp(CurActive);
104-
if (IsWarpLeader) {
105-
if (Ptr >= &MainSharedStack.Data[0] &&
106-
Ptr < &MainSharedStack.Data[MainSharedStack.MaxSize]) {
107-
unsigned Bytes = MainSharedStack.Sizes[--MainSharedStack.SizeUsage[0]];
108-
MainSharedStack.Usage[0] -= Bytes;
63+
EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) {
64+
size_t AlignedBytes = Bytes + (Bytes % MinBytes);
65+
int TID = __kmpc_get_hardware_thread_id_in_block();
66+
if (__kmpc_is_generic_main_thread(TID)) {
67+
if (Ptr >= &MainSharedStack.Data[0][0] &&
68+
Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) {
69+
MainSharedStack.Usage[0] -= AlignedBytes;
10970
return;
11071
}
111-
if (Ptr >= &WorkerSharedStack.Data[0] &&
112-
Ptr < &WorkerSharedStack.Data[WorkerSharedStack.MaxSize]) {
113-
int WID = GetWarpId();
114-
unsigned Bytes =
115-
WorkerSharedStack.Sizes[--WorkerSharedStack.SizeUsage[WID]];
116-
WorkerSharedStack.Usage[WID] -= Bytes;
72+
} else if (TID < WorkerSharedStack.NumThreads) {
73+
if (Ptr >= &WorkerSharedStack.Data[0][0] &&
74+
Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) {
75+
int TID = __kmpc_get_hardware_thread_id_in_block();
76+
WorkerSharedStack.Usage[TID] -= AlignedBytes;
11777
return;
11878
}
119-
SafeFree(Ptr, "FreeGlobalFallback");
12079
}
80+
SafeFree(Ptr, "FreeGlobalFallback");
12181
}
12282

12383
EXTERN void __kmpc_data_sharing_init_stack() {
124-
for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i) {
125-
MainSharedStack.SizeUsage[i] = 0;
84+
for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i)
12685
MainSharedStack.Usage[i] = 0;
127-
}
128-
for (unsigned i = 0; i < WorkerSharedStack.NumWarps; ++i) {
129-
WorkerSharedStack.SizeUsage[i] = 0;
86+
for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i)
13087
WorkerSharedStack.Usage[i] = 0;
131-
}
13288
}
13389

13490
/// Allocate storage in shared memory to communicate arguments from the main

0 commit comments

Comments
 (0)