@@ -178,109 +178,11 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
                                      false);
}

-/// Mostly like _v2 but with the builtin assumption that we have less than
-/// num_of_records (by default 1024) teams.
-int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
-    IdentTy *Loc, int32_t TId, void *__restrict__ GlobalBuffer,
-    uint32_t num_of_records, void *reduce_data, ShuffleReductFnTy shflFct,
-    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
-    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
-  // Terminate all threads in non-SPMD mode except for the main thread.
-  uint32_t ThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isGenericMode()) {
-    if (!mapping::isMainThreadInGenericMode())
-      return 0;
-    ThreadId = 0;
-  }
-
-  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
-
-  // In non-generic mode all workers participate in the teams reduction.
-  // In generic mode only the team main participates in the teams
-  // reduction because the workers are waiting for parallel work.
-  uint32_t NumThreads = omp_get_num_threads();
-  uint32_t TeamId = omp_get_team_num();
-  uint32_t NumTeams = omp_get_num_teams();
-  static unsigned SHARED(ChunkTeamCount);
-
-  // Block progress for teams greater than the current upper
-  // limit. We always only allow a number of teams less or equal
-  // to the number of slots in the buffer.
-  bool IsMain = (ThreadId == 0);
-
-  if (IsMain) {
-    lgcpyFct(GlobalBuffer, TeamId, reduce_data);
-
-    // Propagate the memory writes above to the world.
-    fence::kernel(atomic::release);
-
-    // Increment team counter.
-    // This counter is incremented by all teams in the current
-    // BUFFER_SIZE chunk.
-    ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
-                                 atomic::MemScopeTy::device);
-  }
-
-  // Synchronize in SPMD mode as in generic mode all but 1 threads are in the
-  // state machine.
-  if (mapping::isSPMDMode())
-    synchronize::threadsAligned(atomic::acq_rel);
-
-  // Each thread will have a local struct containing the values to be
-  // reduced:
-  //   1. do reduction within each warp.
-  //   2. do reduction across warps.
-  //   3. write the final result to the main reduction variable
-  //      by returning 1 in the thread holding the reduction result.
-
-  // Check if this is the very last team.
-  if (ChunkTeamCount != NumTeams - 1)
-    return 0;
-
-  // Last team processing.
-  NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
-  if (ThreadId >= NumThreads)
-    return 0;
-
-  // Ensure we see the global memory writes by other teams
-  fence::kernel(atomic::aquire);
-
-  // Load from buffer and reduce.
-  glcpyFct(GlobalBuffer, ThreadId, reduce_data);
-  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
-    glredFct(GlobalBuffer, i, reduce_data);
-
-  // Reduce across warps to the warp main.
-  gpu_regular_warp_reduce(reduce_data, shflFct);
-
-  uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
-  uint32_t WarpsNeeded =
-      (ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  // Gather all the reduced values from each warp
-  // to the first warp.
-  cpyFct(reduce_data, WarpsNeeded);
-
-  if (mapping::getWarpIdInBlock() == 0)
-    gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
-
-  return IsMain;
-}
-
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
    ListGlobalFnTy glredFct) {
-  // The first check is a compile time constant, the second one a runtime check.
-  // If the first one succeeds we will use the specialized version.
-  if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
-       state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
-       num_of_records == 1024) ||
-      (omp_get_num_teams() <= num_of_records))
-    return __kmpc_nvptx_teams_reduce_nowait_v3(
-        Loc, TId, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
-        lgcpyFct, lgredFct, glcpyFct, glredFct);
-
  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
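For context, not part of this commit: __kmpc_nvptx_teams_reduce_nowait_v2 is the device-runtime entry point clang uses for OpenMP teams reductions when offloading to a GPU. The hypothetical host-side sketch below shows the kind of construct that exercises this path, assuming a clang build with GPU offloading (e.g. -fopenmp --offload-arch=sm_80); it is an illustration, not code from the patch.

    // Hypothetical illustration: each team reduces its own chunk of the
    // iteration space, and the device runtime then combines the per-team
    // partial results through the global buffer handled in the diff above.
    #include <cstdio>

    int main() {
      const int N = 1 << 20;
      long long Sum = 0;
    // Scalars are firstprivate on target by default, so map the result back.
    #pragma omp target teams distribute parallel for reduction(+ : Sum) map(tofrom : Sum)
      for (int i = 0; i < N; ++i)
        Sum += i;
      std::printf("Sum = %lld\n", Sum);
      return 0;
    }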