Skip to content

Commit 0f7c837

Browse files
committed
[AMDGPU] Re-enable atomic optimization of uniform fadd/fsub with result
Fix various problems with the first active lane of the result of optimized fp atomics, as explained in the comment added to AMDGPUAtomicOptimizerImpl::optimizeAtomic.
1 parent 94a067a commit 0f7c837

File tree

6 files changed

+1790
-1710
lines changed

6 files changed

+1790
-1710
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -226,13 +226,6 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
226226

227227
bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
228228

229-
if ((Op == AtomicRMWInst::FAdd || Op == AtomicRMWInst::FSub) &&
230-
!I.use_empty()) {
231-
// Disable the uniform return value calculation using fmul because it
232-
// mishandles infinities, NaNs and signed zeros. FIXME.
233-
ValDivergent = true;
234-
}
235-
236229
// If the value operand is divergent, each lane is contributing a different
237230
// value to the atomic calculation. We can only optimize divergent values if
238231
// we have DPP available on our subtarget, and the atomic operation is 32
@@ -995,18 +988,25 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
995988
break;
996989
case AtomicRMWInst::FAdd:
997990
case AtomicRMWInst::FSub: {
998-
// FIXME: This path is currently disabled in visitAtomicRMWInst because
999-
// of problems calculating the first active lane of the result (where
1000-
// Mbcnt is 0):
1001-
// - If V is infinity or NaN we will return NaN instead of BroadcastI.
1002-
// - If BroadcastI is -0.0 and V is positive we will return +0.0 instead
1003-
// of -0.0.
1004991
LaneOffset = B.CreateFMul(V, Mbcnt);
1005992
break;
1006993
}
1007994
}
1008995
}
1009-
Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
996+
Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
997+
if (isAtomicFloatingPointTy) {
998+
// For fadd/fsub the first active lane of LaneOffset should be the
999+
// identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
1000+
// is V * +0.0 which might have the wrong sign or might be nan (if V is
1001+
// inf or nan).
1002+
//
1003+
// For all floating point ops if the in-memory value was a nan then the
1004+
// binop we just built might have quieted it or changed its payload.
1005+
//
1006+
// Correct all these problems by using BroadcastI as the result in the
1007+
// first active lane.
1008+
Result = B.CreateSelect(Cond, BroadcastI, Result);
1009+
}
10101010

10111011
if (IsPixelShader) {
10121012
// Need a final PHI to reconverge to above the helper lane branch mask.

llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
222222
; GFX90A-NEXT: bb.4.Flow:
223223
; GFX90A-NEXT: successors: %bb.6(0x80000000)
224224
; GFX90A-NEXT: {{ $}}
225-
; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %43, %bb.5, [[DEF]], %bb.1
225+
; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %44, %bb.5, [[DEF]], %bb.1
226226
; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
227227
; GFX90A-NEXT: S_BRANCH %bb.6
228228
; GFX90A-NEXT: {{ $}}
@@ -235,9 +235,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
235235
; GFX90A-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
236236
; GFX90A-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
237237
; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY21]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
238+
; GFX90A-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
239+
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_6]], 0, [[COPY22]], [[V_CMP_EQ_U32_e64_]], implicit $exec
238240
; GFX90A-NEXT: S_BRANCH %bb.4
239241
; GFX90A-NEXT: {{ $}}
240-
; GFX90A-NEXT: bb.6 (%ir-block.46):
242+
; GFX90A-NEXT: bb.6 (%ir-block.47):
241243
; GFX90A-NEXT: $vgpr0 = COPY [[PHI]]
242244
; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
243245
;
@@ -314,7 +316,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
314316
; GFX940-NEXT: bb.4.Flow:
315317
; GFX940-NEXT: successors: %bb.6(0x80000000)
316318
; GFX940-NEXT: {{ $}}
317-
; GFX940-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %42, %bb.5, [[DEF]], %bb.1
319+
; GFX940-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %43, %bb.5, [[DEF]], %bb.1
318320
; GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
319321
; GFX940-NEXT: S_BRANCH %bb.6
320322
; GFX940-NEXT: {{ $}}
@@ -327,9 +329,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
327329
; GFX940-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
328330
; GFX940-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
329331
; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY21]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
332+
; GFX940-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
333+
; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_6]], 0, [[COPY22]], [[V_CMP_EQ_U32_e64_]], implicit $exec
330334
; GFX940-NEXT: S_BRANCH %bb.4
331335
; GFX940-NEXT: {{ $}}
332-
; GFX940-NEXT: bb.6 (%ir-block.46):
336+
; GFX940-NEXT: bb.6 (%ir-block.47):
333337
; GFX940-NEXT: $vgpr0 = COPY [[PHI]]
334338
; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
335339
;
@@ -401,7 +405,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
401405
; GFX11-NEXT: bb.4.Flow:
402406
; GFX11-NEXT: successors: %bb.6(0x80000000)
403407
; GFX11-NEXT: {{ $}}
404-
; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %41, %bb.5, [[DEF]], %bb.1
408+
; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %42, %bb.5, [[DEF]], %bb.1
405409
; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
406410
; GFX11-NEXT: S_BRANCH %bb.6
407411
; GFX11-NEXT: {{ $}}
@@ -414,9 +418,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
414418
; GFX11-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
415419
; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
416420
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
421+
; GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
422+
; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_5]], 0, [[COPY16]], [[V_CMP_EQ_U32_e64_]], implicit $exec
417423
; GFX11-NEXT: S_BRANCH %bb.4
418424
; GFX11-NEXT: {{ $}}
419-
; GFX11-NEXT: bb.6 (%ir-block.47):
425+
; GFX11-NEXT: bb.6 (%ir-block.48):
420426
; GFX11-NEXT: $vgpr0 = COPY [[PHI]]
421427
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
422428
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic

llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -200,22 +200,23 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
200200
; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]]
201201
; GFX90A-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
202202
; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
203+
; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
203204
; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
204-
; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
205+
; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
205206
; GFX90A-NEXT: S_BRANCH %bb.2
206207
; GFX90A-NEXT: {{ $}}
207208
; GFX90A-NEXT: bb.2 (%ir-block.36):
208209
; GFX90A-NEXT: successors: %bb.4(0x80000000)
209210
; GFX90A-NEXT: {{ $}}
210211
; GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
211-
; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %2
212-
; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
212+
; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2
213+
; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
213214
; GFX90A-NEXT: S_BRANCH %bb.4
214215
; GFX90A-NEXT: {{ $}}
215216
; GFX90A-NEXT: bb.3.Flow:
216217
; GFX90A-NEXT: successors: %bb.5(0x80000000)
217218
; GFX90A-NEXT: {{ $}}
218-
; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %7, %bb.4
219+
; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4
219220
; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
220221
; GFX90A-NEXT: S_BRANCH %bb.5
221222
; GFX90A-NEXT: {{ $}}
@@ -225,11 +226,14 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
225226
; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
226227
; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
227228
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
228-
; GFX90A-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
229-
; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
229+
; GFX90A-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
230+
; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec
231+
; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
232+
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
233+
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
230234
; GFX90A-NEXT: S_BRANCH %bb.3
231235
; GFX90A-NEXT: {{ $}}
232-
; GFX90A-NEXT: bb.5 (%ir-block.46):
236+
; GFX90A-NEXT: bb.5 (%ir-block.47):
233237
; GFX90A-NEXT: $vgpr0 = COPY [[PHI]]
234238
; GFX90A-NEXT: SI_RETURN_TO_EPILOG $vgpr0
235239
;
@@ -278,22 +282,23 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
278282
; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_2]]
279283
; GFX940-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
280284
; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
285+
; GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
281286
; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
282-
; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
287+
; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
283288
; GFX940-NEXT: S_BRANCH %bb.2
284289
; GFX940-NEXT: {{ $}}
285290
; GFX940-NEXT: bb.2 (%ir-block.36):
286291
; GFX940-NEXT: successors: %bb.4(0x80000000)
287292
; GFX940-NEXT: {{ $}}
288293
; GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
289-
; GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %2
290-
; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
294+
; GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2
295+
; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
291296
; GFX940-NEXT: S_BRANCH %bb.4
292297
; GFX940-NEXT: {{ $}}
293298
; GFX940-NEXT: bb.3.Flow:
294299
; GFX940-NEXT: successors: %bb.5(0x80000000)
295300
; GFX940-NEXT: {{ $}}
296-
; GFX940-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %7, %bb.4
301+
; GFX940-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4
297302
; GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
298303
; GFX940-NEXT: S_BRANCH %bb.5
299304
; GFX940-NEXT: {{ $}}
@@ -303,11 +308,14 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
303308
; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
304309
; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
305310
; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
306-
; GFX940-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
307-
; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
311+
; GFX940-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
312+
; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
313+
; GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
314+
; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
315+
; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
308316
; GFX940-NEXT: S_BRANCH %bb.3
309317
; GFX940-NEXT: {{ $}}
310-
; GFX940-NEXT: bb.5 (%ir-block.46):
318+
; GFX940-NEXT: bb.5 (%ir-block.47):
311319
; GFX940-NEXT: $vgpr0 = COPY [[PHI]]
312320
; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
313321
;
@@ -356,22 +364,23 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
356364
; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_5]]
357365
; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec
358366
; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
367+
; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
359368
; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
360-
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
369+
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
361370
; GFX11-NEXT: S_BRANCH %bb.2
362371
; GFX11-NEXT: {{ $}}
363372
; GFX11-NEXT: bb.2 (%ir-block.36):
364373
; GFX11-NEXT: successors: %bb.4(0x80000000)
365374
; GFX11-NEXT: {{ $}}
366375
; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
367-
; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %2
368-
; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY5]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
376+
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY %2
377+
; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY6]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
369378
; GFX11-NEXT: S_BRANCH %bb.4
370379
; GFX11-NEXT: {{ $}}
371380
; GFX11-NEXT: bb.3.Flow:
372381
; GFX11-NEXT: successors: %bb.5(0x80000000)
373382
; GFX11-NEXT: {{ $}}
374-
; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %7, %bb.4
383+
; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4
375384
; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
376385
; GFX11-NEXT: S_BRANCH %bb.5
377386
; GFX11-NEXT: {{ $}}
@@ -381,11 +390,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
381390
; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
382391
; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
383392
; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
384-
; GFX11-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
385-
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
393+
; GFX11-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
394+
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
395+
; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY5]]
396+
; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY7]], implicit $exec
386397
; GFX11-NEXT: S_BRANCH %bb.3
387398
; GFX11-NEXT: {{ $}}
388-
; GFX11-NEXT: bb.5 (%ir-block.47):
399+
; GFX11-NEXT: bb.5 (%ir-block.48):
389400
; GFX11-NEXT: $vgpr0 = COPY [[PHI]]
390401
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
391402
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic

0 commit comments

Comments
 (0)