@@ -175,11 +175,116 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc,
  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
}

+/// Mostly like _v2 but with the builtin assumption that we have fewer than
+/// num_of_records (by default 1024) teams.
+int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
+    IdentTy *Loc, void *__restrict__ GlobalBuffer, uint32_t num_of_records,
+    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
+    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
+    ListGlobalFnTy glredFct) {
+  // Terminate all threads in non-SPMD mode except for the main thread.
+  uint32_t ThreadId = mapping::getThreadIdInBlock();
+  if (mapping::isGenericMode()) {
+    if (!mapping::isMainThreadInGenericMode())
+      return 0;
+    ThreadId = 0;
+  }
+
+  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
+
+  // In non-generic mode all workers participate in the teams reduction.
+  // In generic mode only the team main participates in the teams
+  // reduction because the workers are waiting for parallel work.
+  uint32_t NumThreads = omp_get_num_threads();
+  uint32_t TeamId = omp_get_team_num();
+  uint32_t NumTeams = omp_get_num_teams();
+  static unsigned SHARED(ChunkTeamCount);
+
+  // Block progress for teams greater than the current upper limit. We
+  // only ever allow a number of teams less than or equal to the number
+  // of slots in the buffer.
+  bool IsMain = (ThreadId == 0);
+
+  if (IsMain) {
+    lgcpyFct(GlobalBuffer, TeamId, reduce_data);
+
+    // Propagate the memory writes above to the world.
+    fence::kernel(atomic::release);
+
+    // Increment team counter.
+    // This counter is incremented by all teams in the current
+    // BUFFER_SIZE chunk.
+    ChunkTeamCount = atomic::inc(&Cnt, NumTeams - 1, atomic::seq_cst,
+                                 atomic::MemScopeTy::device);
+  }
+
+  // Synchronize in SPMD mode only; in generic mode all but one thread is in
+  // the state machine.
+  if (mapping::isSPMDMode())
+    synchronize::threadsAligned(atomic::acq_rel);
+
+  // Each thread will have a local struct containing the values to be
+  // reduced:
+  //  1. do reduction within each warp.
+  //  2. do reduction across warps.
+  //  3. write the final result to the main reduction variable
+  //     by returning 1 in the thread holding the reduction result.
+
+  // Check if this is the very last team.
+  if (ChunkTeamCount != NumTeams - 1)
+    return 0;
+
+  if (ThreadId >= NumTeams)
+    return 0;
+
+  // Last team processing.
+  NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
+  if (ThreadId >= NumThreads)
+    return 0;
+
+  // Ensure we see the global memory writes by other teams.
+  fence::kernel(atomic::aquire);
+
+  // Load from buffer and reduce.
+  glcpyFct(GlobalBuffer, ThreadId, reduce_data);
+  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
+    glredFct(GlobalBuffer, i, reduce_data);
+
+  // Reduce within each warp to the warp main.
+  if (NumThreads > 1)
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+
+  uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
+  uint32_t WarpsNeeded =
+      (ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+  if (ActiveThreads > mapping::getWarpSize()) {
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    if (mapping::getWarpIdInBlock() == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
+  }
+
+  return IsMain;
+}
+
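The control flow added above is the classic "last team reduces" pattern: every
team publishes its partial result into its own GlobalBuffer slot, releases a
fence, and bumps a device-wide counter; only the team that observes the final
count walks the buffer and produces the result. A minimal host-side sketch of
the same idea, with std::thread standing in for teams and fetch_add standing
in for the runtime's wrapping atomic::inc (all names below are illustrative,
not DeviceRTL symbols):

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  constexpr uint32_t NumTeams = 8;
  std::vector<uint32_t> GlobalBuf(NumTeams); // one slot per team (lgcpyFct)
  std::atomic<uint32_t> Cnt{0};              // stands in for ReductionCnt

  auto Team = [&](uint32_t TeamId) {
    GlobalBuf[TeamId] = TeamId + 1; // publish this team's partial result
    // acq_rel: the slot write above is visible before the counter ticks,
    // and the last arriver sees every other team's slot write.
    uint32_t Prev = Cnt.fetch_add(1, std::memory_order_acq_rel);
    if (Prev != NumTeams - 1)
      return; // not the last team to arrive; nothing more to do
    uint32_t Result = 0;
    for (uint32_t I = 0; I < NumTeams; ++I)
      Result += GlobalBuf[I]; // stands in for glcpyFct/glredFct
    std::printf("reduced: %u\n", Result); // 1 + 2 + ... + 8 == 36
  };

  std::vector<std::thread> Teams;
  for (uint32_t I = 0; I < NumTeams; ++I)
    Teams.emplace_back(Team, I);
  for (auto &T : Teams)
    T.join();
}

The single acq_rel increment here plays the role of the runtime's explicit
fence::kernel(atomic::release) before atomic::inc and the
fence::kernel(atomic::aquire) executed by the last team.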
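Step 1 of the comment block inside the function (reduction within each warp)
is a standard shuffle tree. A CUDA C++ sketch of what shflFct typically
performs for a plain integer sum, assuming 32-lane warps (warpReduceSum is an
illustrative name, not a DeviceRTL symbol):

#include <cstdint>

__device__ uint32_t warpReduceSum(uint32_t Val) {
  // Halve the stride each round; after log2(32) = 5 rounds lane 0 holds
  // the sum of all 32 lanes.
  for (uint32_t Offset = 16; Offset > 0; Offset /= 2)
    Val += __shfl_down_sync(0xffffffffu, Val, Offset);
  return Val;
}

gpu_regular_warp_reduce applies the same kind of tree, but shflFct lets the
compiler emit the lane exchange for an arbitrary reduce_data layout rather
than a single scalar.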
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records,
    uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct,
    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
+
+  // The first check is a compile-time constant, the second a runtime check.
+  // If the first one succeeds we will use the specialized version.
+  if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
+       state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
+       num_of_records == 1024) ||
+      (omp_get_num_teams() <= num_of_records))
+    return __kmpc_nvptx_teams_reduce_nowait_v3(
+        Loc, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
+        lgcpyFct, lgredFct, glcpyFct, glredFct);
+
  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
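To make the new guard concrete: v2 forwards to the specialized v3 path either
when the kernel's compile-time team bound (Configuration.MaxTeams) is known to
fit the default 1024-record buffer, or when the runtime team count happens to
fit. A standalone sketch of the predicate (the function and parameter names
below are hypothetical, chosen for illustration):

#include <cstdint>
#include <cstdio>

static bool useSpecializedV3(int32_t MaxTeams, uint32_t NumTeams,
                             uint32_t NumRecords) {
  // Compile-time-style check: a known, non-negative team bound that fits
  // the default buffer lets every team claim its own slot.
  bool BoundFits = MaxTeams >= 0 &&
                   static_cast<uint32_t>(MaxTeams) <= NumRecords &&
                   NumRecords == 1024;
  // Runtime fallback: the actual team count fits the buffer.
  bool CountFits = NumTeams <= NumRecords;
  return BoundFits || CountFits;
}

int main() {
  std::printf("%d\n", useSpecializedV3(512, 2048, 1024)); // 1: bound fits
  std::printf("%d\n", useSpecializedV3(-1, 800, 1024));   // 1: count fits
  std::printf("%d\n", useSpecializedV3(-1, 4096, 1024));  // 0: stay on v2
}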