
Commit e9a48f9

[OpenMP] Simplify parallel reductions (llvm#70983)
A lot of this code dates from a time when we had multiple parallel levels. The new runtime is much simpler, so the code can be simplified considerably, which should also speed up reductions.
1 parent eab828d · commit e9a48f9

File tree

2 files changed: +47 additions, −98 deletions

openmp/libomptarget/DeviceRTL/src/Reduction.cpp

Lines changed: 22 additions & 98 deletions
```diff
@@ -44,119 +44,45 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
-static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
-                                          ShuffleReductFnTy shflFct) {
-  uint32_t size, remote_id, physical_lane_id;
-  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
-  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
-  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
-  do {
-    Liveness = mapping::activemask();
-    remote_id = utils::ffs(Liveness & lanemask_gt);
-    size = utils::popc(Liveness);
-    logical_lane_id /= 2;
-    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
-            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
-  } while (logical_lane_id % 2 == 0 && size > 1);
-  return (logical_lane_id == 0);
-}
-#endif
-
-static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
-                                            uint64_t reduce_size,
-                                            void *reduce_data,
+static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
-                                            InterWarpCopyFnTy cpyFct,
-                                            bool isSPMDExecutionMode, bool) {
-  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
-    BlockThreadId = 0;
+                                            InterWarpCopyFnTy cpyFct) {
   uint32_t NumThreads = omp_get_num_threads();
+  // Handle degenerated parallel regions, including all nested ones, first.
   if (NumThreads == 1)
     return 1;
-  /*
-   * This reduce function handles reduction within a team. It handles
-   * parallel regions in both L1 and L2 parallelism levels. It also
-   * supports Generic, SPMD, and NoOMP modes.
-   *
-   * 1. Reduce within a warp.
-   * 2. Warp master copies value to warp 0 via shared memory.
-   * 3. Warp 0 reduces to a single value.
-   * 4. The reduced value is available in the thread that returns 1.
-   */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
+
+  /*
+   * 1. Reduce within a warp.
+   * 2. Warp master copies value to warp 0 via shared memory.
+   * 3. Warp 0 reduces to a single value.
+   * 4. The reduced value is available in the thread that returns 1.
+   */
+
+  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
+  uint32_t NumWarps =
       (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();
 
-  // Volta execution model:
   // For the Generic execution mode a parallel region either has 1 thread and
   // beyond that, always a multiple of 32. For the SPMD execution mode we may
   // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
-
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-  }
-  return BlockThreadId == 0;
-#else
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  if (Liveness == lanes::All) // Full warp
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/utils::popc(Liveness),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-  else { // Dispersed lanes. Only threads in L2
-         // parallel region may enter here; return
-         // early.
-    return gpu_irregular_simd_reduce(reduce_data, shflFct);
-  }
+  gpu_regular_warp_reduce(reduce_data, shflFct);
 
   // When we have more than [mapping::getWarpSize()] number of threads
   // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
   if (NumThreads > mapping::getWarpSize()) {
-    uint32_t WarpsNeeded =
-        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
     // Gather all the reduced values from each warp
     // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
+    cpyFct(reduce_data, NumWarps);
 
-    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-
-    return BlockThreadId == 0;
+    if (BlockThreadId < mapping::getWarpSize())
+      gpu_irregular_warp_reduce(reduce_data, shflFct, NumWarps, BlockThreadId);
   }
 
-  // Get the OMP thread Id. This is different from BlockThreadId in the case of
-  // an L2 parallel region.
-  return TId == 0;
-#endif // __CUDA_ARCH__ >= 700
+  // In Generic and in SPMD mode block thread Id 0 is what we want.
+  // It's either the main thread in SPMD mode or the "acting" main thread in
+  // the parallel region.
+  return BlockThreadId == 0;
 }
 
 uint32_t roundToWarpsize(uint32_t s) {
@@ -173,9 +99,7 @@ extern "C" {
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
     IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
-  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
-                                      shflFct, cpyFct, mapping::isSPMDMode(),
-                                      false);
+  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
 }
 
 /// Mostly like _v2 but with the builtin assumption that we have less than
```
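The four-step comment block kept in the new code describes a standard shuffle-based block reduction. As an illustration only, here is a minimal CUDA sketch of that shape — not the DeviceRTL code: the names warpReduceSum, blockReduceSum, and WarpSums are invented for this example, a hardcoded `+` stands in for the shflFct/cpyFct callbacks, and it assumes blockDim.x is a multiple of the warp size (the Generic-mode guarantee cited in the diff).

```cuda
// Minimal sketch of the 4-step scheme above; illustrative names, not DeviceRTL.
constexpr unsigned WarpSize = 32;

// Step 1: tree-reduce within one warp via shuffles; after log2(32) = 5 rounds
// lane 0 holds the sum of all 32 lanes.
__device__ int warpReduceSum(int Val) {
  for (unsigned Offset = WarpSize / 2; Offset > 0; Offset /= 2)
    Val += __shfl_down_sync(0xffffffffu, Val, Offset);
  return Val;
}

// Steps 2-4: warp masters publish partial sums to shared memory, warp 0
// reduces them, and block thread 0 ends up holding the block-wide result.
// Assumes blockDim.x is a multiple of WarpSize.
__device__ int blockReduceSum(int Val) {
  __shared__ int WarpSums[WarpSize]; // one slot per warp; enough for 1024 threads
  unsigned Lane = threadIdx.x % WarpSize;
  unsigned Warp = threadIdx.x / WarpSize;
  unsigned NumWarps = (blockDim.x + WarpSize - 1) / WarpSize;

  Val = warpReduceSum(Val);  // 1. reduce within each warp
  if (Lane == 0)
    WarpSums[Warp] = Val;    // 2. warp master copies its value to shared memory
  __syncthreads();

  if (Warp == 0) {           // 3. warp 0 reduces the per-warp partials
    Val = Lane < NumWarps ? WarpSums[Lane] : 0;
    Val = warpReduceSum(Val);
  }
  return Val;                // 4. valid in block thread 0 only
}
```

The simplified nvptx_parallel_reduce_nowait is exactly this shape once the degenerate one-thread case is filtered out; per the commit message, the deleted sub-Volta branch and gpu_irregular_simd_reduce handled irregular lane patterns from nested parallel levels that the new runtime no longer creates.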
Lines changed: 25 additions & 0 deletions
```diff
@@ -0,0 +1,25 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+__attribute__((optnone)) void optnone(void) {}
+
+int main() {
+  int sum = 0, nt;
+#pragma omp target teams map(tofrom : sum, nt) num_teams(1)
+  {
+    nt = 3 * omp_get_max_threads();
+    optnone();
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+  }
+  // CHECK: nt: [[NT:.*]]
+  // CHECK: sum: [[NT]]
+  printf("nt: %i\n", nt);
+  printf("sum: %i\n", sum);
+}
```
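The FileCheck lines tie the two printouts together: each of the three parallel regions contributes one increment per thread to the reduction, so the reduced sum must equal nt, which the target region sets to three times omp_get_max_threads(). The call to the optnone function appears intended to keep the optimizer from folding the target region, so the runtime reduction path is actually exercised.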
