Skip to content

Commit ae63db7

Browse files
authored
[AMDGPU] Re-enable atomic optimization of uniform fadd/fsub with result (#97604)
Fix various problems with the first active lane of the result of optimized floating-point atomics, as explained in the code comment added in optimizeAtomic. Fixes #97554.
1 parent 3b7a7f4 commit ae63db7

File tree

6 files changed

+1684
-1618
lines changed

6 files changed

+1684
-1618
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -226,13 +226,6 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
226226

227227
bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
228228

229-
if ((Op == AtomicRMWInst::FAdd || Op == AtomicRMWInst::FSub) &&
230-
!I.use_empty()) {
231-
// Disable the uniform return value calculation using fmul because it
232-
// mishandles infinities, NaNs and signed zeros. FIXME.
233-
ValDivergent = true;
234-
}
235-
236229
// If the value operand is divergent, each lane is contributing a different
237230
// value to the atomic calculation. We can only optimize divergent values if
238231
// we have DPP available on our subtarget, and the atomic operation is 32
@@ -937,18 +930,25 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
937930
break;
938931
case AtomicRMWInst::FAdd:
939932
case AtomicRMWInst::FSub: {
940-
// FIXME: This path is currently disabled in visitAtomicRMWInst because
941-
// of problems calculating the first active lane of the result (where
942-
// Mbcnt is 0):
943-
// - If V is infinity or NaN we will return NaN instead of BroadcastI.
944-
// - If BroadcastI is -0.0 and V is positive we will return +0.0 instead
945-
// of -0.0.
946933
LaneOffset = B.CreateFMul(V, Mbcnt);
947934
break;
948935
}
949936
}
950937
}
951-
Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
938+
Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
939+
if (isAtomicFloatingPointTy) {
940+
// For fadd/fsub the first active lane of LaneOffset should be the
941+
// identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
942+
// is V * +0.0 which might have the wrong sign or might be nan (if V is
943+
// inf or nan).
944+
//
945+
// For all floating point ops if the in-memory value was a nan then the
946+
// binop we just built might have quieted it or changed its payload.
947+
//
948+
// Correct all these problems by using BroadcastI as the result in the
949+
// first active lane.
950+
Result = B.CreateSelect(Cond, BroadcastI, Result);
951+
}
952952

953953
if (IsPixelShader) {
954954
// Need a final PHI to reconverge to above the helper lane branch mask.

llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -221,7 +221,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
221221
; GFX90A-NEXT: bb.4.Flow:
222222
; GFX90A-NEXT: successors: %bb.6(0x80000000)
223223
; GFX90A-NEXT: {{ $}}
224-
; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %42, %bb.5, [[DEF]], %bb.1
224+
; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %43, %bb.5, [[DEF]], %bb.1
225225
; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
226226
; GFX90A-NEXT: S_BRANCH %bb.6
227227
; GFX90A-NEXT: {{ $}}
@@ -234,9 +234,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
234234
; GFX90A-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
235235
; GFX90A-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
236236
; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY21]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
237+
; GFX90A-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
238+
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_6]], 0, [[COPY22]], [[V_CMP_EQ_U32_e64_]], implicit $exec
237239
; GFX90A-NEXT: S_BRANCH %bb.4
238240
; GFX90A-NEXT: {{ $}}
239-
; GFX90A-NEXT: bb.6 (%ir-block.40):
241+
; GFX90A-NEXT: bb.6 (%ir-block.41):
240242
; GFX90A-NEXT: $vgpr0 = COPY [[PHI]]
241243
; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
242244
;
@@ -312,7 +314,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
312314
; GFX940-NEXT: bb.4.Flow:
313315
; GFX940-NEXT: successors: %bb.6(0x80000000)
314316
; GFX940-NEXT: {{ $}}
315-
; GFX940-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %41, %bb.5, [[DEF]], %bb.1
317+
; GFX940-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %42, %bb.5, [[DEF]], %bb.1
316318
; GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
317319
; GFX940-NEXT: S_BRANCH %bb.6
318320
; GFX940-NEXT: {{ $}}
@@ -325,9 +327,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
325327
; GFX940-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
326328
; GFX940-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
327329
; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY21]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
330+
; GFX940-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
331+
; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_6]], 0, [[COPY22]], [[V_CMP_EQ_U32_e64_]], implicit $exec
328332
; GFX940-NEXT: S_BRANCH %bb.4
329333
; GFX940-NEXT: {{ $}}
330-
; GFX940-NEXT: bb.6 (%ir-block.40):
334+
; GFX940-NEXT: bb.6 (%ir-block.41):
331335
; GFX940-NEXT: $vgpr0 = COPY [[PHI]]
332336
; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
333337
;
@@ -398,7 +402,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
398402
; GFX11-NEXT: bb.4.Flow:
399403
; GFX11-NEXT: successors: %bb.6(0x80000000)
400404
; GFX11-NEXT: {{ $}}
401-
; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %40, %bb.5, [[DEF]], %bb.1
405+
; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI %41, %bb.5, [[DEF]], %bb.1
402406
; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
403407
; GFX11-NEXT: S_BRANCH %bb.6
404408
; GFX11-NEXT: {{ $}}
@@ -411,9 +415,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
411415
; GFX11-NEXT: [[STRICT_WWM1:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
412416
; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
413417
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY15]], 0, [[STRICT_WWM1]], 0, 0, implicit $mode, implicit $exec
418+
; GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
419+
; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_ADD_F32_e64_5]], 0, [[COPY16]], [[V_CMP_EQ_U32_e64_]], implicit $exec
414420
; GFX11-NEXT: S_BRANCH %bb.4
415421
; GFX11-NEXT: {{ $}}
416-
; GFX11-NEXT: bb.6 (%ir-block.37):
422+
; GFX11-NEXT: bb.6 (%ir-block.38):
417423
; GFX11-NEXT: $vgpr0 = COPY [[PHI]]
418424
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
419425
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic

llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll

Lines changed: 32 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -199,22 +199,23 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
199199
; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]]
200200
; GFX90A-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
201201
; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
202+
; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
202203
; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
203-
; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
204+
; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
204205
; GFX90A-NEXT: S_BRANCH %bb.2
205206
; GFX90A-NEXT: {{ $}}
206207
; GFX90A-NEXT: bb.2 (%ir-block.32):
207208
; GFX90A-NEXT: successors: %bb.4(0x80000000)
208209
; GFX90A-NEXT: {{ $}}
209210
; GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
210-
; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %2
211-
; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
211+
; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2
212+
; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
212213
; GFX90A-NEXT: S_BRANCH %bb.4
213214
; GFX90A-NEXT: {{ $}}
214215
; GFX90A-NEXT: bb.3.Flow:
215216
; GFX90A-NEXT: successors: %bb.5(0x80000000)
216217
; GFX90A-NEXT: {{ $}}
217-
; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %7, %bb.4
218+
; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4
218219
; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
219220
; GFX90A-NEXT: S_BRANCH %bb.5
220221
; GFX90A-NEXT: {{ $}}
@@ -224,11 +225,14 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
224225
; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
225226
; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
226227
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
227-
; GFX90A-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
228-
; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
228+
; GFX90A-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
229+
; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
230+
; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
231+
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
232+
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
229233
; GFX90A-NEXT: S_BRANCH %bb.3
230234
; GFX90A-NEXT: {{ $}}
231-
; GFX90A-NEXT: bb.5 (%ir-block.40):
235+
; GFX90A-NEXT: bb.5 (%ir-block.41):
232236
; GFX90A-NEXT: $vgpr0 = COPY [[PHI]]
233237
; GFX90A-NEXT: SI_RETURN_TO_EPILOG $vgpr0
234238
;
@@ -276,22 +280,23 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
276280
; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]]
277281
; GFX940-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
278282
; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
283+
; GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
279284
; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
280-
; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
285+
; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
281286
; GFX940-NEXT: S_BRANCH %bb.2
282287
; GFX940-NEXT: {{ $}}
283288
; GFX940-NEXT: bb.2 (%ir-block.32):
284289
; GFX940-NEXT: successors: %bb.4(0x80000000)
285290
; GFX940-NEXT: {{ $}}
286291
; GFX940-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
287-
; GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %2
288-
; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
292+
; GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2
293+
; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
289294
; GFX940-NEXT: S_BRANCH %bb.4
290295
; GFX940-NEXT: {{ $}}
291296
; GFX940-NEXT: bb.3.Flow:
292297
; GFX940-NEXT: successors: %bb.5(0x80000000)
293298
; GFX940-NEXT: {{ $}}
294-
; GFX940-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %7, %bb.4
299+
; GFX940-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4
295300
; GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
296301
; GFX940-NEXT: S_BRANCH %bb.5
297302
; GFX940-NEXT: {{ $}}
@@ -301,11 +306,14 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
301306
; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
302307
; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
303308
; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
304-
; GFX940-NEXT: early-clobber %43:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
305-
; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %43, 0, 0, implicit $mode, implicit $exec
309+
; GFX940-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
310+
; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
311+
; GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
312+
; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
313+
; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
306314
; GFX940-NEXT: S_BRANCH %bb.3
307315
; GFX940-NEXT: {{ $}}
308-
; GFX940-NEXT: bb.5 (%ir-block.40):
316+
; GFX940-NEXT: bb.5 (%ir-block.41):
309317
; GFX940-NEXT: $vgpr0 = COPY [[PHI]]
310318
; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0
311319
;
@@ -353,22 +361,23 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
353361
; GFX11-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_4]], killed [[S_MOV_B32_4]]
354362
; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec
355363
; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
364+
; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
356365
; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
357-
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
366+
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
358367
; GFX11-NEXT: S_BRANCH %bb.2
359368
; GFX11-NEXT: {{ $}}
360369
; GFX11-NEXT: bb.2 (%ir-block.29):
361370
; GFX11-NEXT: successors: %bb.4(0x80000000)
362371
; GFX11-NEXT: {{ $}}
363372
; GFX11-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
364-
; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %2
365-
; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY5]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
373+
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY %2
374+
; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY6]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
366375
; GFX11-NEXT: S_BRANCH %bb.4
367376
; GFX11-NEXT: {{ $}}
368377
; GFX11-NEXT: bb.3.Flow:
369378
; GFX11-NEXT: successors: %bb.5(0x80000000)
370379
; GFX11-NEXT: {{ $}}
371-
; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %7, %bb.4
380+
; GFX11-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4
372381
; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
373382
; GFX11-NEXT: S_BRANCH %bb.5
374383
; GFX11-NEXT: {{ $}}
@@ -378,11 +387,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
378387
; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
379388
; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
380389
; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
381-
; GFX11-NEXT: early-clobber %43:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
382-
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_READFIRSTLANE_B32_]], 0, killed %43, 0, 0, implicit $mode, implicit $exec
390+
; GFX11-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
391+
; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
392+
; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY5]]
393+
; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY7]], implicit $exec
383394
; GFX11-NEXT: S_BRANCH %bb.3
384395
; GFX11-NEXT: {{ $}}
385-
; GFX11-NEXT: bb.5 (%ir-block.37):
396+
; GFX11-NEXT: bb.5 (%ir-block.38):
386397
; GFX11-NEXT: $vgpr0 = COPY [[PHI]]
387398
; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0
388399
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic

0 commit comments

Comments
 (0)