Skip to content

Commit 4f4bf31

Browse files
author
Jun Wang
committed
Check if mem instruction is already immediately followed by a
waitcnt instruction. If so, do not insert another waitcnt. Also add a testcase that has ds_add_rtn. Formatting change made to SIMemoryLegalizer.cpp is reverted.
1 parent 1e3c7dd commit 4f4bf31

File tree

3 files changed

+214
-24
lines changed

3 files changed

+214
-24
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2306,11 +2306,15 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
23062306
#endif
23072307

23082308
if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
2309-
AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
2310-
Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
2311-
ScoreBrackets.simplifyWaitcnt(Wait);
2312-
Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
2313-
ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
2309+
Iter++;
2310+
if (!isWaitInstr(*Iter)) {
2311+
AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
2312+
Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
2313+
ScoreBrackets.simplifyWaitcnt(Wait);
2314+
Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
2315+
ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
2316+
}
2317+
Iter--;
23142318
}
23152319

23162320
LLVM_DEBUG({

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2606,10 +2606,12 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
26062606
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
26072607
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
26082608
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2609-
Changed |=
2610-
CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2611-
isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2612-
MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER);
2609+
Changed |= CC->insertWait(MI, MOI.getScope(),
2610+
MOI.getInstrAddrSpace(),
2611+
isAtomicRet(*MI) ? SIMemOp::LOAD :
2612+
SIMemOp::STORE,
2613+
MOI.getIsCrossAddressSpaceOrdering(),
2614+
Position::AFTER);
26132615
Changed |= CC->insertAcquire(MI, MOI.getScope(),
26142616
MOI.getOrderingAddrSpace(),
26152617
Position::AFTER);

llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll

Lines changed: 199 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -707,7 +707,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
707707
; GFX9-NEXT: v_mov_b32_e32 v0, s0
708708
; GFX9-NEXT: ds_add_u32 v0, v1
709709
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
710-
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
711710
; GFX9-NEXT: .LBB5_2:
712711
; GFX9-NEXT: s_endpgm
713712
;
@@ -728,7 +727,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
728727
; GFX90A-NEXT: v_mov_b32_e32 v0, s0
729728
; GFX90A-NEXT: ds_add_u32 v0, v1
730729
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
731-
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
732730
; GFX90A-NEXT: .LBB5_2:
733731
; GFX90A-NEXT: s_endpgm
734732
;
@@ -769,7 +767,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
769767
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0
770768
; GFX9-FLATSCR-NEXT: ds_add_u32 v0, v1
771769
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
772-
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
773770
; GFX9-FLATSCR-NEXT: .LBB5_2:
774771
; GFX9-FLATSCR-NEXT: s_endpgm
775772
;
@@ -818,6 +815,193 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
818815
ret void
819816
}
820817

818+
; from atomic_load_add.ll
819+
; covers s_load, ds_add_rtn (atomic with return)
820+
;
821+
define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrspace(3) %local) {
822+
; GFX9-LABEL: atomic_add_ret_local:
823+
; GFX9: ; %bb.0:
824+
; GFX9-NEXT: s_mov_b64 s[4:5], exec
825+
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
826+
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
827+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
828+
; GFX9-NEXT: ; implicit-def: $vgpr1
829+
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
830+
; GFX9-NEXT: s_cbranch_execz .LBB6_2
831+
; GFX9-NEXT: ; %bb.1:
832+
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
833+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
834+
; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
835+
; GFX9-NEXT: s_mul_i32 s4, s4, 5
836+
; GFX9-NEXT: v_mov_b32_e32 v2, s4
837+
; GFX9-NEXT: v_mov_b32_e32 v1, s6
838+
; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2
839+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
840+
; GFX9-NEXT: .LBB6_2:
841+
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
842+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
843+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
844+
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
845+
; GFX9-NEXT: v_mov_b32_e32 v2, 0
846+
; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
847+
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
848+
; GFX9-NEXT: s_waitcnt vmcnt(0)
849+
; GFX9-NEXT: s_endpgm
850+
;
851+
; GFX90A-LABEL: atomic_add_ret_local:
852+
; GFX90A: ; %bb.0:
853+
; GFX90A-NEXT: s_mov_b64 s[4:5], exec
854+
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
855+
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
856+
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
857+
; GFX90A-NEXT: ; implicit-def: $vgpr1
858+
; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
859+
; GFX90A-NEXT: s_cbranch_execz .LBB6_2
860+
; GFX90A-NEXT: ; %bb.1:
861+
; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c
862+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
863+
; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
864+
; GFX90A-NEXT: s_mul_i32 s4, s4, 5
865+
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
866+
; GFX90A-NEXT: v_mov_b32_e32 v1, s6
867+
; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2
868+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
869+
; GFX90A-NEXT: .LBB6_2:
870+
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
871+
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
872+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
873+
; GFX90A-NEXT: v_readfirstlane_b32 s2, v1
874+
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
875+
; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2
876+
; GFX90A-NEXT: global_store_dword v2, v0, s[0:1]
877+
; GFX90A-NEXT: s_waitcnt vmcnt(0)
878+
; GFX90A-NEXT: s_endpgm
879+
;
880+
; GFX10-LABEL: atomic_add_ret_local:
881+
; GFX10: ; %bb.0:
882+
; GFX10-NEXT: s_mov_b32 s3, exec_lo
883+
; GFX10-NEXT: ; implicit-def: $vgpr1
884+
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
885+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
886+
; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
887+
; GFX10-NEXT: s_cbranch_execz .LBB6_2
888+
; GFX10-NEXT: ; %bb.1:
889+
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
890+
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
891+
; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3
892+
; GFX10-NEXT: s_mul_i32 s3, s3, 5
893+
; GFX10-NEXT: v_mov_b32_e32 v2, s3
894+
; GFX10-NEXT: v_mov_b32_e32 v1, s4
895+
; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2
896+
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
897+
; GFX10-NEXT: buffer_gl0_inv
898+
; GFX10-NEXT: .LBB6_2:
899+
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
900+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
901+
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
902+
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
903+
; GFX10-NEXT: v_readfirstlane_b32 s2, v1
904+
; GFX10-NEXT: v_mov_b32_e32 v1, 0
905+
; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2
906+
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
907+
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
908+
; GFX10-NEXT: s_endpgm
909+
;
910+
; GFX9-FLATSCR-LABEL: atomic_add_ret_local:
911+
; GFX9-FLATSCR: ; %bb.0:
912+
; GFX9-FLATSCR-NEXT: s_mov_b64 s[4:5], exec
913+
; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
914+
; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
915+
; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
916+
; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1
917+
; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc
918+
; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB6_2
919+
; GFX9-FLATSCR-NEXT: ; %bb.1:
920+
; GFX9-FLATSCR-NEXT: s_load_dword s6, s[0:1], 0x2c
921+
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
922+
; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
923+
; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5
924+
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, s4
925+
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s6
926+
; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2
927+
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
928+
; GFX9-FLATSCR-NEXT: .LBB6_2:
929+
; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3]
930+
; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
931+
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
932+
; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1
933+
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0
934+
; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2
935+
; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1]
936+
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
937+
; GFX9-FLATSCR-NEXT: s_endpgm
938+
;
939+
; GFX11-LABEL: atomic_add_ret_local:
940+
; GFX11: ; %bb.0:
941+
; GFX11-NEXT: s_mov_b32 s3, exec_lo
942+
; GFX11-NEXT: s_mov_b32 s2, exec_lo
943+
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
944+
; GFX11-NEXT: ; implicit-def: $vgpr1
945+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
946+
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
947+
; GFX11-NEXT: s_cbranch_execz .LBB6_2
948+
; GFX11-NEXT: ; %bb.1:
949+
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
950+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
951+
; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3
952+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
953+
; GFX11-NEXT: s_mul_i32 s3, s3, 5
954+
; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4
955+
; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2
956+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
957+
; GFX11-NEXT: buffer_gl0_inv
958+
; GFX11-NEXT: .LBB6_2:
959+
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
960+
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
961+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
962+
; GFX11-NEXT: v_readfirstlane_b32 s2, v1
963+
; GFX11-NEXT: v_mov_b32_e32 v1, 0
964+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
965+
; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2
966+
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
967+
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
968+
; GFX11-NEXT: s_endpgm
969+
;
970+
; GFX12-LABEL: atomic_add_ret_local:
971+
; GFX12: ; %bb.0:
972+
; GFX12-NEXT: s_mov_b32 s3, exec_lo
973+
; GFX12-NEXT: s_mov_b32 s2, exec_lo
974+
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
975+
; GFX12-NEXT: ; implicit-def: $vgpr1
976+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
977+
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
978+
; GFX12-NEXT: s_cbranch_execz .LBB6_2
979+
; GFX12-NEXT: ; %bb.1:
980+
; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x2c
981+
; GFX12-NEXT: s_wait_kmcnt 0x0
982+
; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3
983+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
984+
; GFX12-NEXT: s_mul_i32 s3, s3, 5
985+
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4
986+
; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2
987+
; GFX12-NEXT: s_wait_dscnt 0x0
988+
; GFX12-NEXT: global_inv scope:SCOPE_SE
989+
; GFX12-NEXT: .LBB6_2:
990+
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2
991+
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
992+
; GFX12-NEXT: s_wait_kmcnt 0x0
993+
; GFX12-NEXT: v_readfirstlane_b32 s2, v1
994+
; GFX12-NEXT: v_mov_b32_e32 v1, 0
995+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
996+
; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2
997+
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
998+
; GFX12-NEXT: s_wait_storecnt 0x0
999+
; GFX12-NEXT: s_endpgm
1000+
%val = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst
1001+
store i32 %val, ptr addrspace(1) %out
1002+
ret void
1003+
}
1004+
8211005
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
8221006

8231007
; from atomic_optimizations_buffer.ll
@@ -832,7 +1016,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
8321016
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8331017
; GFX9-NEXT: ; implicit-def: $vgpr1
8341018
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
835-
; GFX9-NEXT: s_cbranch_execz .LBB6_2
1019+
; GFX9-NEXT: s_cbranch_execz .LBB7_2
8361020
; GFX9-NEXT: ; %bb.1:
8371021
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
8381022
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -841,7 +1025,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
8411025
; GFX9-NEXT: v_mov_b32_e32 v1, s4
8421026
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
8431027
; GFX9-NEXT: s_waitcnt vmcnt(0)
844-
; GFX9-NEXT: .LBB6_2:
1028+
; GFX9-NEXT: .LBB7_2:
8451029
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
8461030
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
8471031
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -860,7 +1044,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
8601044
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8611045
; GFX90A-NEXT: ; implicit-def: $vgpr1
8621046
; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
863-
; GFX90A-NEXT: s_cbranch_execz .LBB6_2
1047+
; GFX90A-NEXT: s_cbranch_execz .LBB7_2
8641048
; GFX90A-NEXT: ; %bb.1:
8651049
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
8661050
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
@@ -869,7 +1053,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
8691053
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
8701054
; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
8711055
; GFX90A-NEXT: s_waitcnt vmcnt(0)
872-
; GFX90A-NEXT: .LBB6_2:
1056+
; GFX90A-NEXT: .LBB7_2:
8731057
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
8741058
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
8751059
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
@@ -887,7 +1071,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
8871071
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
8881072
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
8891073
; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
890-
; GFX10-NEXT: s_cbranch_execz .LBB6_2
1074+
; GFX10-NEXT: s_cbranch_execz .LBB7_2
8911075
; GFX10-NEXT: ; %bb.1:
8921076
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
8931077
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -896,7 +1080,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
8961080
; GFX10-NEXT: v_mov_b32_e32 v1, s3
8971081
; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
8981082
; GFX10-NEXT: s_waitcnt vmcnt(0)
899-
; GFX10-NEXT: .LBB6_2:
1083+
; GFX10-NEXT: .LBB7_2:
9001084
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
9011085
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
9021086
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -916,7 +1100,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
9161100
; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
9171101
; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1
9181102
; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc
919-
; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB6_2
1103+
; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2
9201104
; GFX9-FLATSCR-NEXT: ; %bb.1:
9211105
; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
9221106
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
@@ -925,7 +1109,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
9251109
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s4
9261110
; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
9271111
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
928-
; GFX9-FLATSCR-NEXT: .LBB6_2:
1112+
; GFX9-FLATSCR-NEXT: .LBB7_2:
9291113
; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3]
9301114
; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
9311115
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
@@ -944,7 +1128,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
9441128
; GFX11-NEXT: ; implicit-def: $vgpr1
9451129
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
9461130
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
947-
; GFX11-NEXT: s_cbranch_execz .LBB6_2
1131+
; GFX11-NEXT: s_cbranch_execz .LBB7_2
9481132
; GFX11-NEXT: ; %bb.1:
9491133
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
9501134
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -954,7 +1138,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
9541138
; GFX11-NEXT: v_mov_b32_e32 v1, s3
9551139
; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
9561140
; GFX11-NEXT: s_waitcnt vmcnt(0)
957-
; GFX11-NEXT: .LBB6_2:
1141+
; GFX11-NEXT: .LBB7_2:
9581142
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
9591143
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
9601144
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -974,7 +1158,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
9741158
; GFX12-NEXT: ; implicit-def: $vgpr1
9751159
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
9761160
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
977-
; GFX12-NEXT: s_cbranch_execz .LBB6_2
1161+
; GFX12-NEXT: s_cbranch_execz .LBB7_2
9781162
; GFX12-NEXT: ; %bb.1:
9791163
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
9801164
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -984,7 +1168,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
9841168
; GFX12-NEXT: v_mov_b32_e32 v1, s3
9851169
; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
9861170
; GFX12-NEXT: s_wait_loadcnt 0x0
987-
; GFX12-NEXT: .LBB6_2:
1171+
; GFX12-NEXT: .LBB7_2:
9881172
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2
9891173
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
9901174
; GFX12-NEXT: s_wait_kmcnt 0x0

0 commit comments

Comments
 (0)