@@ -44,119 +44,45 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
44
44
}
45
45
}
46
46
47
- #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
48
- static uint32_t gpu_irregular_simd_reduce (void *reduce_data,
49
- ShuffleReductFnTy shflFct) {
50
- uint32_t size, remote_id, physical_lane_id;
51
- physical_lane_id = mapping::getThreadIdInBlock () % mapping::getWarpSize ();
52
- __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT ();
53
- __kmpc_impl_lanemask_t Liveness = mapping::activemask ();
54
- uint32_t logical_lane_id = utils::popc (Liveness & lanemask_lt) * 2 ;
55
- __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT ();
56
- do {
57
- Liveness = mapping::activemask ();
58
- remote_id = utils::ffs (Liveness & lanemask_gt);
59
- size = utils::popc (Liveness);
60
- logical_lane_id /= 2 ;
61
- shflFct (reduce_data, /* LaneId =*/ logical_lane_id,
62
- /* Offset=*/ remote_id - 1 - physical_lane_id, /* AlgoVersion=*/ 2 );
63
- } while (logical_lane_id % 2 == 0 && size > 1 );
64
- return (logical_lane_id == 0 );
65
- }
66
- #endif
67
-
68
- static int32_t nvptx_parallel_reduce_nowait (int32_t TId, int32_t num_vars,
69
- uint64_t reduce_size,
70
- void *reduce_data,
47
+ static int32_t nvptx_parallel_reduce_nowait (void *reduce_data,
71
48
ShuffleReductFnTy shflFct,
72
- InterWarpCopyFnTy cpyFct,
73
- bool isSPMDExecutionMode, bool ) {
74
- uint32_t BlockThreadId = mapping::getThreadIdInBlock ();
75
- if (mapping::isMainThreadInGenericMode (/* IsSPMD */ false ))
76
- BlockThreadId = 0 ;
49
+ InterWarpCopyFnTy cpyFct) {
77
50
uint32_t NumThreads = omp_get_num_threads ();
51
+ // Handle degenerated parallel regions, including all nested ones, first.
78
52
if (NumThreads == 1 )
79
53
return 1 ;
80
- /*
81
- * This reduce function handles reduction within a team. It handles
82
- * parallel regions in both L1 and L2 parallelism levels. It also
83
- * supports Generic, SPMD, and NoOMP modes.
84
- *
85
- * 1. Reduce within a warp.
86
- * 2. Warp master copies value to warp 0 via shared memory.
87
- * 3. Warp 0 reduces to a single value.
88
- * 4. The reduced value is available in the thread that returns 1.
89
- */
90
-
91
- #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
92
- uint32_t WarpsNeeded =
54
+
55
+ /*
56
+ * 1. Reduce within a warp.
57
+ * 2. Warp master copies value to warp 0 via shared memory.
58
+ * 3. Warp 0 reduces to a single value.
59
+ * 4. The reduced value is available in the thread that returns 1.
60
+ */
61
+
62
+ uint32_t BlockThreadId = mapping::getThreadIdInBlock ();
63
+ uint32_t NumWarps =
93
64
(NumThreads + mapping::getWarpSize () - 1 ) / mapping::getWarpSize ();
94
- uint32_t WarpId = mapping::getWarpIdInBlock ();
95
65
96
- // Volta execution model:
97
66
// For the Generic execution mode a parallel region either has 1 thread and
98
67
// beyond that, always a multiple of 32. For the SPMD execution mode we may
99
68
// have any number of threads.
100
- if ((NumThreads % mapping::getWarpSize () == 0 ) || (WarpId < WarpsNeeded - 1 ))
101
- gpu_regular_warp_reduce (reduce_data, shflFct);
102
- else if (NumThreads > 1 ) // Only SPMD execution mode comes thru this case.
103
- gpu_irregular_warp_reduce (reduce_data, shflFct,
104
- /* LaneCount=*/ NumThreads % mapping::getWarpSize (),
105
- /* LaneId=*/ mapping::getThreadIdInBlock () %
106
- mapping::getWarpSize ());
107
-
108
- // When we have more than [mapping::getWarpSize()] number of threads
109
- // a block reduction is performed here.
110
- //
111
- // Only L1 parallel region can enter this if condition.
112
- if (NumThreads > mapping::getWarpSize ()) {
113
- // Gather all the reduced values from each warp
114
- // to the first warp.
115
- cpyFct (reduce_data, WarpsNeeded);
116
-
117
- if (WarpId == 0 )
118
- gpu_irregular_warp_reduce (reduce_data, shflFct, WarpsNeeded,
119
- BlockThreadId);
120
- }
121
- return BlockThreadId == 0 ;
122
- #else
123
- __kmpc_impl_lanemask_t Liveness = mapping::activemask ();
124
- if (Liveness == lanes::All) // Full warp
125
- gpu_regular_warp_reduce (reduce_data, shflFct);
126
- else if (!(Liveness & (Liveness + 1 ))) // Partial warp but contiguous lanes
127
- gpu_irregular_warp_reduce (reduce_data, shflFct,
128
- /* LaneCount=*/ utils::popc (Liveness),
129
- /* LaneId=*/ mapping::getThreadIdInBlock () %
130
- mapping::getWarpSize ());
131
- else { // Dispersed lanes. Only threads in L2
132
- // parallel region may enter here; return
133
- // early.
134
- return gpu_irregular_simd_reduce (reduce_data, shflFct);
135
- }
69
+ gpu_regular_warp_reduce (reduce_data, shflFct);
136
70
137
71
// When we have more than [mapping::getWarpSize()] number of threads
138
72
// a block reduction is performed here.
139
- //
140
- // Only L1 parallel region can enter this if condition.
141
73
if (NumThreads > mapping::getWarpSize ()) {
142
- uint32_t WarpsNeeded =
143
- (NumThreads + mapping::getWarpSize () - 1 ) / mapping::getWarpSize ();
144
74
// Gather all the reduced values from each warp
145
75
// to the first warp.
146
- cpyFct (reduce_data, WarpsNeeded );
76
+ cpyFct (reduce_data, NumWarps );
147
77
148
- uint32_t WarpId = BlockThreadId / mapping::getWarpSize ();
149
- if (WarpId == 0 )
150
- gpu_irregular_warp_reduce (reduce_data, shflFct, WarpsNeeded,
151
- BlockThreadId);
152
-
153
- return BlockThreadId == 0 ;
78
+ if (BlockThreadId < mapping::getWarpSize ())
79
+ gpu_irregular_warp_reduce (reduce_data, shflFct, NumWarps, BlockThreadId);
154
80
}
155
81
156
- // Get the OMP thread Id. This is different from BlockThreadId in the case of
157
- // an L2 parallel region.
158
- return TId == 0 ;
159
- # endif // __CUDA_ARCH__ >= 700
82
+ // In Generic and in SPMD mode block thread Id 0 is what we want.
83
+ // It's either the main thread in SPMD mode or the "acting" main thread in the
84
+ // parallel region.
85
+ return BlockThreadId == 0 ;
160
86
}
161
87
162
88
uint32_t roundToWarpsize (uint32_t s) {
@@ -173,9 +99,7 @@ extern "C" {
173
99
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2 (
174
100
IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
175
101
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
176
- return nvptx_parallel_reduce_nowait (TId, num_vars, reduce_size, reduce_data,
177
- shflFct, cpyFct, mapping::isSPMDMode (),
178
- false );
102
+ return nvptx_parallel_reduce_nowait (reduce_data, shflFct, cpyFct);
179
103
}
180
104
181
105
// / Mostly like _v2 but with the builtin assumption that we have less than
0 commit comments