Skip to content

Commit 07441d5

Browse files
committed
Revert "[OpenMP] Provide a specialized team reduction for the common case (llvm#70766)"
Fails 4 sollve tests: test_target_teams_distribute_reduction_and.c, test_target_teams_distribute_reduction_multiply.c, test_target_teams_distribute_reduction_and.c, test_target_teams_distribute_reduction_multiply.c. This reverts commit eab828d. Change-Id: If6beb31e12531c9232ccf9a711fbb2a1cbe99898
1 parent d2a70a9 commit 07441d5

File tree

2 files changed

+8
-98
lines changed

2 files changed

+8
-98
lines changed

openmp/libomptarget/DeviceRTL/src/Reduction.cpp

Lines changed: 0 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -178,109 +178,11 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
178178
false);
179179
}
180180

181-
/// Mostly like _v2 but with the builtin assumption that we have less than
182-
/// num_of_records (by default 1024) teams.
183-
int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
184-
IdentTy *Loc, int32_t TId, void *__restrict__ GlobalBuffer,
185-
uint32_t num_of_records, void *reduce_data, ShuffleReductFnTy shflFct,
186-
InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
187-
ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
188-
// Terminate all threads in non-SPMD mode except for the main thread.
189-
uint32_t ThreadId = mapping::getThreadIdInBlock();
190-
if (mapping::isGenericMode()) {
191-
if (!mapping::isMainThreadInGenericMode())
192-
return 0;
193-
ThreadId = 0;
194-
}
195-
196-
uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
197-
198-
// In non-generic mode all workers participate in the teams reduction.
199-
// In generic mode only the team main participates in the teams
200-
// reduction because the workers are waiting for parallel work.
201-
uint32_t NumThreads = omp_get_num_threads();
202-
uint32_t TeamId = omp_get_team_num();
203-
uint32_t NumTeams = omp_get_num_teams();
204-
static unsigned SHARED(ChunkTeamCount);
205-
206-
// Block progress for teams greater than the current upper
207-
// limit. We always only allow a number of teams less or equal
208-
// to the number of slots in the buffer.
209-
bool IsMain = (ThreadId == 0);
210-
211-
if (IsMain) {
212-
lgcpyFct(GlobalBuffer, TeamId, reduce_data);
213-
214-
// Propagate the memory writes above to the world.
215-
fence::kernel(atomic::release);
216-
217-
// Increment team counter.
218-
// This counter is incremented by all teams in the current
219-
// BUFFER_SIZE chunk.
220-
ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
221-
atomic::MemScopeTy::device);
222-
}
223-
224-
// Synchronize in SPMD mode as in generic mode all but 1 threads are in the
225-
// state machine.
226-
if (mapping::isSPMDMode())
227-
synchronize::threadsAligned(atomic::acq_rel);
228-
229-
// Each thread will have a local struct containing the values to be
230-
// reduced:
231-
// 1. do reduction within each warp.
232-
// 2. do reduction across warps.
233-
// 3. write the final result to the main reduction variable
234-
// by returning 1 in the thread holding the reduction result.
235-
236-
// Check if this is the very last team.
237-
if (ChunkTeamCount != NumTeams - 1)
238-
return 0;
239-
240-
// Last team processing.
241-
NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
242-
if (ThreadId >= NumThreads)
243-
return 0;
244-
245-
// Ensure we see the global memory writes by other teams
246-
fence::kernel(atomic::aquire);
247-
248-
// Load from buffer and reduce.
249-
glcpyFct(GlobalBuffer, ThreadId, reduce_data);
250-
for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
251-
glredFct(GlobalBuffer, i, reduce_data);
252-
253-
// Reduce across warps to the warp main.
254-
gpu_regular_warp_reduce(reduce_data, shflFct);
255-
256-
uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
257-
uint32_t WarpsNeeded =
258-
(ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
259-
// Gather all the reduced values from each warp
260-
// to the first warp.
261-
cpyFct(reduce_data, WarpsNeeded);
262-
263-
if (mapping::getWarpIdInBlock() == 0)
264-
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
265-
266-
return IsMain;
267-
}
268-
269181
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
270182
IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
271183
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
272184
ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
273185
ListGlobalFnTy glredFct) {
274-
// The first check is a compile time constant, the second one a runtime check.
275-
// If the first one succeeds we will use the specialized version.
276-
if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
277-
state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
278-
num_of_records == 1024) ||
279-
(omp_get_num_teams() <= num_of_records))
280-
return __kmpc_nvptx_teams_reduce_nowait_v3(
281-
Loc, TId, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
282-
lgcpyFct, lgredFct, glcpyFct, glredFct);
283-
284186
// Terminate all threads in non-SPMD mode except for the master thread.
285187
uint32_t ThreadId = mapping::getThreadIdInBlock();
286188
if (mapping::isGenericMode()) {

revert_patches.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,11 @@ Johannes: revert breaks 3 sollve tests
2020
test_loop_reduction_and_device.c
2121
test_loop_reduction_bitand_device.c
2222
test_loop_reduction_multiply_device.c
23+
24+
Johannes: breaks 4 sollve tests
25+
[OpenMP] Provide a specialized team reduction for the common case (#70766)
26+
test_target_teams_distribute_reduction_and.c
27+
test_target_teams_distribute_reduction_multiply.c
28+
test_target_teams_distribute_reduction_and.c
29+
test_target_teams_distribute_reduction_multiply.c
30+

0 commit comments

Comments
 (0)