Skip to content

Commit ae1446b

Browse files
authored
Revert "[OpenMP] Simplify parallel reductions (#70983)"
This reverts commit e9a48f9.
1 parent fbdf6e2 commit ae1446b

File tree

2 files changed

+98
-47
lines changed

2 files changed

+98
-47
lines changed

openmp/libomptarget/DeviceRTL/src/Reduction.cpp

Lines changed: 98 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -44,45 +44,119 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
4444
}
4545
}
4646

47-
static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
47+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
48+
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
49+
ShuffleReductFnTy shflFct) {
50+
uint32_t size, remote_id, physical_lane_id;
51+
physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
52+
__kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
53+
__kmpc_impl_lanemask_t Liveness = mapping::activemask();
54+
uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
55+
__kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
56+
do {
57+
Liveness = mapping::activemask();
58+
remote_id = utils::ffs(Liveness & lanemask_gt);
59+
size = utils::popc(Liveness);
60+
logical_lane_id /= 2;
61+
shflFct(reduce_data, /*LaneId =*/logical_lane_id,
62+
/*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
63+
} while (logical_lane_id % 2 == 0 && size > 1);
64+
return (logical_lane_id == 0);
65+
}
66+
#endif
67+
68+
static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
69+
uint64_t reduce_size,
70+
void *reduce_data,
4871
ShuffleReductFnTy shflFct,
49-
InterWarpCopyFnTy cpyFct) {
72+
InterWarpCopyFnTy cpyFct,
73+
bool isSPMDExecutionMode, bool) {
74+
uint32_t BlockThreadId = mapping::getThreadIdInBlock();
75+
if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
76+
BlockThreadId = 0;
5077
uint32_t NumThreads = omp_get_num_threads();
51-
// Handle degenerated parallel regions, including all nested ones, first.
5278
if (NumThreads == 1)
5379
return 1;
54-
55-
/*
56-
* 1. Reduce within a warp.
57-
* 2. Warp master copies value to warp 0 via shared memory.
58-
* 3. Warp 0 reduces to a single value.
59-
* 4. The reduced value is available in the thread that returns 1.
60-
*/
61-
62-
uint32_t BlockThreadId = mapping::getThreadIdInBlock();
63-
uint32_t NumWarps =
80+
/*
81+
* This reduce function handles reduction within a team. It handles
82+
* parallel regions in both L1 and L2 parallelism levels. It also
83+
* supports Generic, SPMD, and NoOMP modes.
84+
*
85+
* 1. Reduce within a warp.
86+
* 2. Warp master copies value to warp 0 via shared memory.
87+
* 3. Warp 0 reduces to a single value.
88+
* 4. The reduced value is available in the thread that returns 1.
89+
*/
90+
91+
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
92+
uint32_t WarpsNeeded =
6493
(NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
94+
uint32_t WarpId = mapping::getWarpIdInBlock();
6595

96+
// Volta execution model:
6697
// For the Generic execution mode a parallel region either has 1 thread or,
6798
// beyond that, always a multiple of 32. For the SPMD execution mode we may
6899
// have any number of threads.
69-
gpu_regular_warp_reduce(reduce_data, shflFct);
100+
if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
101+
gpu_regular_warp_reduce(reduce_data, shflFct);
102+
else if (NumThreads > 1) // Only SPMD execution mode comes through this case.
103+
gpu_irregular_warp_reduce(reduce_data, shflFct,
104+
/*LaneCount=*/NumThreads % mapping::getWarpSize(),
105+
/*LaneId=*/mapping::getThreadIdInBlock() %
106+
mapping::getWarpSize());
70107

71108
// When we have more than [mapping::getWarpSize()] number of threads
72109
// a block reduction is performed here.
110+
//
111+
// Only L1 parallel region can enter this if condition.
73112
if (NumThreads > mapping::getWarpSize()) {
74113
// Gather all the reduced values from each warp
75114
// to the first warp.
76-
cpyFct(reduce_data, NumWarps);
115+
cpyFct(reduce_data, WarpsNeeded);
77116

78-
if (BlockThreadId < mapping::getWarpSize())
79-
gpu_irregular_warp_reduce(reduce_data, shflFct, NumWarps, BlockThreadId);
117+
if (WarpId == 0)
118+
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
119+
BlockThreadId);
80120
}
81-
82-
// In Generic and in SPMD mode block thread Id 0 is what we want.
83-
// It's either the main thread in SPMD mode or the "acting" main thread in the
84-
// parallel region.
85121
return BlockThreadId == 0;
122+
#else
123+
__kmpc_impl_lanemask_t Liveness = mapping::activemask();
124+
if (Liveness == lanes::All) // Full warp
125+
gpu_regular_warp_reduce(reduce_data, shflFct);
126+
else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
127+
gpu_irregular_warp_reduce(reduce_data, shflFct,
128+
/*LaneCount=*/utils::popc(Liveness),
129+
/*LaneId=*/mapping::getThreadIdInBlock() %
130+
mapping::getWarpSize());
131+
else { // Dispersed lanes. Only threads in L2
132+
// parallel region may enter here; return
133+
// early.
134+
return gpu_irregular_simd_reduce(reduce_data, shflFct);
135+
}
136+
137+
// When we have more than [mapping::getWarpSize()] number of threads
138+
// a block reduction is performed here.
139+
//
140+
// Only L1 parallel region can enter this if condition.
141+
if (NumThreads > mapping::getWarpSize()) {
142+
uint32_t WarpsNeeded =
143+
(NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
144+
// Gather all the reduced values from each warp
145+
// to the first warp.
146+
cpyFct(reduce_data, WarpsNeeded);
147+
148+
uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
149+
if (WarpId == 0)
150+
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
151+
BlockThreadId);
152+
153+
return BlockThreadId == 0;
154+
}
155+
156+
// Get the OMP thread Id. This is different from BlockThreadId in the case of
157+
// an L2 parallel region.
158+
return TId == 0;
159+
#endif // __CUDA_ARCH__ >= 700
86160
}
87161

88162
uint32_t roundToWarpsize(uint32_t s) {
@@ -99,7 +173,9 @@ extern "C" {
99173
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
100174
IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
101175
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
102-
return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
176+
return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
177+
shflFct, cpyFct, mapping::isSPMDMode(),
178+
false);
103179
}
104180

105181
/// Mostly like _v2 but with the builtin assumption that we have less than

openmp/libomptarget/test/offloading/generic_reduction.c

Lines changed: 0 additions & 25 deletions
This file was deleted.

0 commit comments

Comments
 (0)