Skip to content

Commit ab9157c

Browse files
committed
Reduction specialization llvm#70766
1 parent 7e31a37 commit ab9157c

File tree

2 files changed

+106
-0
lines changed

2 files changed

+106
-0
lines changed

openmp/libomptarget/DeviceRTL/src/Reduction.cpp

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,11 +175,116 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
175175
return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
176176
}
177177

178+
/// Mostly like _v2 but with the builtin assumption that the number of teams
/// does not exceed num_of_records (by default 1024). Thread 0 of every team
/// passes its own TeamId as the buffer index to lgcpyFct, so no two teams use
/// the same index.
179+
/// Loc, num_of_records and lgredFct are accepted but never used in this body.
/// Returns nonzero only in thread 0 of the team that read the final counter
/// value (NumTeams - 1); every other thread returns 0.
180+
int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
181+
IdentTy *Loc, void *__restrict__ GlobalBuffer, uint32_t num_of_records,
182+
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
183+
ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
184+
ListGlobalFnTy glredFct) {
185+
// Terminate all threads in non-SPMD mode except for the main thread.
// The surviving generic-mode main thread is treated as thread 0 from here on.
186+
uint32_t ThreadId = mapping::getThreadIdInBlock();
187+
if (mapping::isGenericMode()) {
188+
if (!mapping::isMainThreadInGenericMode())
189+
return 0;
190+
ThreadId = 0;
191+
}
192+
193+
// Counter taken from the kernel launch environment; it is bumped once per
// team further down.
uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
194+
195+
// In non-generic mode all workers participate in the teams reduction.
196+
// In generic mode only the team main participates in the teams
197+
// reduction because the workers are waiting for parallel work.
198+
uint32_t NumThreads = omp_get_num_threads();
199+
uint32_t TeamId = omp_get_team_num();
200+
uint32_t NumTeams = omp_get_num_teams();
201+
// Written only by thread 0 (IsMain) below and read by every remaining thread
// after the SPMD-mode synchronization.
// NOTE(review): SHARED presumably places this in team-shared memory — confirm.
static unsigned SHARED(ChunkTeamCount);
202+
203+
// Only thread 0 of the team publishes the team's value and bumps the
204+
// counter.
// NOTE(review): the comment previously here spoke of blocking teams beyond
205+
// the buffer limit; there is no such wait anywhere in this function.
206+
bool IsMain = (ThreadId == 0);
207+
208+
if (IsMain) {
// Hand this team's reduce_data to lgcpyFct with TeamId as the buffer index.
209+
lgcpyFct(GlobalBuffer, TeamId, reduce_data);
210+
211+
// Propagate the memory writes above to the world.
212+
fence::kernel(atomic::release);
213+
214+
// Increment team counter.
215+
// The value returned by atomic::inc is kept in ChunkTeamCount and compared
216+
// against NumTeams - 1 below to detect the last team.
// NOTE(review): presumably atomic::inc returns the old value and wraps to 0
// once it reaches the NumTeams - 1 bound (which would leave Cnt reset for
// the next reduction) — confirm against the atomic::inc definition.
217+
ChunkTeamCount = atomic::inc(&Cnt, NumTeams - 1, atomic::seq_cst,
218+
atomic::MemScopeTy::device);
219+
}
220+
221+
// Synchronize in SPMD mode as in generic mode all but 1 threads are in the
222+
// state machine.
223+
if (mapping::isSPMDMode())
224+
synchronize::threadsAligned(atomic::acq_rel);
225+
226+
// Each thread will have a local struct containing the values to be
227+
// reduced:
228+
// 1. do reduction within each warp.
229+
// 2. do reduction across warps.
230+
// 3. write the final result to the main reduction variable
231+
// by returning 1 in the thread holding the reduction result.
232+
233+
// Check if this is the very last team: only the team whose thread 0 read
// NumTeams - 1 from the counter continues; all other teams return 0 here.
234+
if (ChunkTeamCount != NumTeams - 1)
235+
return 0;
236+
237+
// Threads whose id is not below NumTeams have no buffer index to load.
if (ThreadId >= NumTeams)
238+
return 0;
239+
240+
// Last team processing.
// Keep min(NumThreads, NumTeams) threads, adjusted by roundToWarpsize, and
// retire the rest.
241+
NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
242+
if (ThreadId >= NumThreads)
243+
return 0;
244+
245+
// Ensure we see the global memory writes by other teams
246+
fence::kernel(atomic::aquire);
247+
248+
// Load from buffer and reduce.
// Each thread starts from index ThreadId and then folds in the indices
// ThreadId + k * NumThreads (k >= 1) that are below NumTeams.
249+
glcpyFct(GlobalBuffer, ThreadId, reduce_data);
250+
for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
251+
glredFct(GlobalBuffer, i, reduce_data);
252+
253+
// Reduce to the warp main; skipped when only one thread is left.
// NOTE(review): the helper name suggests a reduction within each warp
// rather than across warps — confirm.
254+
if (NumThreads > 1)
255+
gpu_regular_warp_reduce(reduce_data, shflFct);
256+
257+
// Number of warps still involved, rounded up to whole warps.
uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
258+
uint32_t WarpsNeeded =
259+
(ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
260+
// The cross-warp step is only needed when more than one warp's worth of
// threads is active.
if (ActiveThreads > mapping::getWarpSize()) {
261+
// Gather all the reduced values from each warp
262+
// to the first warp.
263+
cpyFct(reduce_data, WarpsNeeded);
264+
265+
if (mapping::getWarpIdInBlock() == 0)
266+
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
267+
}
268+
269+
// Nonzero only for thread 0 of the last team, which per step 3 above is the
// thread holding the reduction result.
return IsMain;
270+
}
271+
178272
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
179273
IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
180274
uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
181275
InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
182276
ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
277+
278+
// The first check is a compile time constant, the second one a runtime check.
279+
// If the first one succeeds we will use the specialized version.
280+
if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
281+
state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
282+
num_of_records == 1024) ||
283+
(omp_get_num_teams() <= num_of_records))
284+
return __kmpc_nvptx_teams_reduce_nowait_v3(
285+
Loc, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
286+
lgcpyFct, lgredFct, glcpyFct, glredFct);
287+
183288
// Terminate all threads in non-SPMD mode except for the master thread.
184289
uint32_t ThreadId = mapping::getThreadIdInBlock();
185290
if (mapping::isGenericMode()) {

openmp/libomptarget/test/api/omp_device_managed_memory.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,5 @@ int main() {
2626
// CHECK: PASS
2727
if (sum == N)
2828
printf("PASS\n");
29+
printf("%i : %i\n", sum, N);
2930
}

0 commit comments

Comments
 (0)