@@ -220,17 +220,20 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
220
220
} else
221
221
lgredFct (GlobalBuffer, ModBockId, reduce_data);
222
222
223
+ // Propagate the memory writes above to the world.
224
+ fence::kernel (atomic::release);
225
+
223
226
// Increment team counter.
224
227
// This counter is incremented by all teams in the current
225
- // BUFFER_SIZE chunk.
228
+ // num_of_records chunk.
226
229
ChunkTeamCount = atomic::inc (&Cnt, num_of_records - 1u , atomic::seq_cst,
227
230
atomic::MemScopeTy::device);
228
231
}
229
- // Synchronize
232
+
233
+ // Synchronize in SPMD mode, as in generic mode all threads but one are in the
234
+ // state machine.
230
235
if (mapping::isSPMDMode ())
231
236
synchronize::threadsAligned (atomic::acq_rel);
232
- else
233
- fence::kernel (atomic::acq_rel);
234
237
235
238
// reduce_data is global or shared so before being reduced within the
236
239
// warp we need to bring it in local memory:
@@ -257,6 +260,9 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
257
260
// Check if this is the very last team.
258
261
unsigned NumRecs = kmpcMin (NumTeams, uint32_t (num_of_records));
259
262
if (ChunkTeamCount == NumTeams - Bound - 1 ) {
263
+ // Ensure we see the global memory writes by other teams.
264
+ fence::kernel (atomic::aquire);
265
+
260
266
//
261
267
// Last team processing.
262
268
//
0 commit comments