@@ -707,7 +707,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
707
707
; GFX9-NEXT: v_mov_b32_e32 v0, s0
708
708
; GFX9-NEXT: ds_add_u32 v0, v1
709
709
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
710
- ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
711
710
; GFX9-NEXT: .LBB5_2:
712
711
; GFX9-NEXT: s_endpgm
713
712
;
@@ -728,7 +727,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
728
727
; GFX90A-NEXT: v_mov_b32_e32 v0, s0
729
728
; GFX90A-NEXT: ds_add_u32 v0, v1
730
729
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
731
- ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
732
730
; GFX90A-NEXT: .LBB5_2:
733
731
; GFX90A-NEXT: s_endpgm
734
732
;
@@ -769,7 +767,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
769
767
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0
770
768
; GFX9-FLATSCR-NEXT: ds_add_u32 v0, v1
771
769
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
772
- ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
773
770
; GFX9-FLATSCR-NEXT: .LBB5_2:
774
771
; GFX9-FLATSCR-NEXT: s_endpgm
775
772
;
@@ -818,6 +815,193 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
818
815
ret void
819
816
}
820
817
818
+ ; from atomic_load_add.ll
819
+ ; covers s_load, ds_add_rtn (local atomic add whose returned value is stored to %out)
820
+ ;
821
+ define amdgpu_kernel void @atomic_add_ret_local (ptr addrspace (1 ) %out , ptr addrspace (3 ) %local ) {
822
+ ; GFX9-LABEL: atomic_add_ret_local:
823
+ ; GFX9: ; %bb.0:
824
+ ; GFX9-NEXT: s_mov_b64 s[4:5], exec
825
+ ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
826
+ ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
827
+ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
828
+ ; GFX9-NEXT: ; implicit-def: $vgpr1
829
+ ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
830
+ ; GFX9-NEXT: s_cbranch_execz .LBB6_2
831
+ ; GFX9-NEXT: ; %bb.1:
832
+ ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
833
+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
834
+ ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
835
+ ; GFX9-NEXT: s_mul_i32 s4, s4, 5
836
+ ; GFX9-NEXT: v_mov_b32_e32 v2, s4
837
+ ; GFX9-NEXT: v_mov_b32_e32 v1, s6
838
+ ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2
839
+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
840
+ ; GFX9-NEXT: .LBB6_2:
841
+ ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
842
+ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
843
+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
844
+ ; GFX9-NEXT: v_readfirstlane_b32 s2, v1
845
+ ; GFX9-NEXT: v_mov_b32_e32 v2, 0
846
+ ; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
847
+ ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
848
+ ; GFX9-NEXT: s_waitcnt vmcnt(0)
849
+ ; GFX9-NEXT: s_endpgm
850
+ ;
851
+ ; GFX90A-LABEL: atomic_add_ret_local:
852
+ ; GFX90A: ; %bb.0:
853
+ ; GFX90A-NEXT: s_mov_b64 s[4:5], exec
854
+ ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
855
+ ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
856
+ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
857
+ ; GFX90A-NEXT: ; implicit-def: $vgpr1
858
+ ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
859
+ ; GFX90A-NEXT: s_cbranch_execz .LBB6_2
860
+ ; GFX90A-NEXT: ; %bb.1:
861
+ ; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c
862
+ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
863
+ ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
864
+ ; GFX90A-NEXT: s_mul_i32 s4, s4, 5
865
+ ; GFX90A-NEXT: v_mov_b32_e32 v2, s4
866
+ ; GFX90A-NEXT: v_mov_b32_e32 v1, s6
867
+ ; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2
868
+ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
869
+ ; GFX90A-NEXT: .LBB6_2:
870
+ ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
871
+ ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
872
+ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
873
+ ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1
874
+ ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
875
+ ; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2
876
+ ; GFX90A-NEXT: global_store_dword v2, v0, s[0:1]
877
+ ; GFX90A-NEXT: s_waitcnt vmcnt(0)
878
+ ; GFX90A-NEXT: s_endpgm
879
+ ;
880
+ ; GFX10-LABEL: atomic_add_ret_local:
881
+ ; GFX10: ; %bb.0:
882
+ ; GFX10-NEXT: s_mov_b32 s3, exec_lo
883
+ ; GFX10-NEXT: ; implicit-def: $vgpr1
884
+ ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
885
+ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
886
+ ; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
887
+ ; GFX10-NEXT: s_cbranch_execz .LBB6_2
888
+ ; GFX10-NEXT: ; %bb.1:
889
+ ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
890
+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
891
+ ; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3
892
+ ; GFX10-NEXT: s_mul_i32 s3, s3, 5
893
+ ; GFX10-NEXT: v_mov_b32_e32 v2, s3
894
+ ; GFX10-NEXT: v_mov_b32_e32 v1, s4
895
+ ; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2
896
+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
897
+ ; GFX10-NEXT: buffer_gl0_inv
898
+ ; GFX10-NEXT: .LBB6_2:
899
+ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
900
+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
901
+ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
902
+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
903
+ ; GFX10-NEXT: v_readfirstlane_b32 s2, v1
904
+ ; GFX10-NEXT: v_mov_b32_e32 v1, 0
905
+ ; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2
906
+ ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
907
+ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
908
+ ; GFX10-NEXT: s_endpgm
909
+ ;
910
+ ; GFX9-FLATSCR-LABEL: atomic_add_ret_local:
911
+ ; GFX9-FLATSCR: ; %bb.0:
912
+ ; GFX9-FLATSCR-NEXT: s_mov_b64 s[4:5], exec
913
+ ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
914
+ ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
915
+ ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
916
+ ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1
917
+ ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc
918
+ ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB6_2
919
+ ; GFX9-FLATSCR-NEXT: ; %bb.1:
920
+ ; GFX9-FLATSCR-NEXT: s_load_dword s6, s[0:1], 0x2c
921
+ ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
922
+ ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
923
+ ; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, 5
924
+ ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, s4
925
+ ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s6
926
+ ; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2
927
+ ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
928
+ ; GFX9-FLATSCR-NEXT: .LBB6_2:
929
+ ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3]
930
+ ; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
931
+ ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
932
+ ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1
933
+ ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0
934
+ ; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2
935
+ ; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1]
936
+ ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
937
+ ; GFX9-FLATSCR-NEXT: s_endpgm
938
+ ;
939
+ ; GFX11-LABEL: atomic_add_ret_local:
940
+ ; GFX11: ; %bb.0:
941
+ ; GFX11-NEXT: s_mov_b32 s3, exec_lo
942
+ ; GFX11-NEXT: s_mov_b32 s2, exec_lo
943
+ ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
944
+ ; GFX11-NEXT: ; implicit-def: $vgpr1
945
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
946
+ ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
947
+ ; GFX11-NEXT: s_cbranch_execz .LBB6_2
948
+ ; GFX11-NEXT: ; %bb.1:
949
+ ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
950
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
951
+ ; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3
952
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
953
+ ; GFX11-NEXT: s_mul_i32 s3, s3, 5
954
+ ; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4
955
+ ; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2
956
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
957
+ ; GFX11-NEXT: buffer_gl0_inv
958
+ ; GFX11-NEXT: .LBB6_2:
959
+ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
960
+ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
961
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
962
+ ; GFX11-NEXT: v_readfirstlane_b32 s2, v1
963
+ ; GFX11-NEXT: v_mov_b32_e32 v1, 0
964
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
965
+ ; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2
966
+ ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
967
+ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
968
+ ; GFX11-NEXT: s_endpgm
969
+ ;
970
+ ; GFX12-LABEL: atomic_add_ret_local:
971
+ ; GFX12: ; %bb.0:
972
+ ; GFX12-NEXT: s_mov_b32 s3, exec_lo
973
+ ; GFX12-NEXT: s_mov_b32 s2, exec_lo
974
+ ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
975
+ ; GFX12-NEXT: ; implicit-def: $vgpr1
976
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
977
+ ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
978
+ ; GFX12-NEXT: s_cbranch_execz .LBB6_2
979
+ ; GFX12-NEXT: ; %bb.1:
980
+ ; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x2c
981
+ ; GFX12-NEXT: s_wait_kmcnt 0x0
982
+ ; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3
983
+ ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
984
+ ; GFX12-NEXT: s_mul_i32 s3, s3, 5
985
+ ; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s4
986
+ ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2
987
+ ; GFX12-NEXT: s_wait_dscnt 0x0
988
+ ; GFX12-NEXT: global_inv scope:SCOPE_SE
989
+ ; GFX12-NEXT: .LBB6_2:
990
+ ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2
991
+ ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
992
+ ; GFX12-NEXT: s_wait_kmcnt 0x0
993
+ ; GFX12-NEXT: v_readfirstlane_b32 s2, v1
994
+ ; GFX12-NEXT: v_mov_b32_e32 v1, 0
995
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
996
+ ; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2
997
+ ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
998
+ ; GFX12-NEXT: s_wait_storecnt 0x0
999
+ ; GFX12-NEXT: s_endpgm
1000
+ %val = atomicrmw volatile add ptr addrspace (3 ) %local , i32 5 seq_cst ; result is consumed by the store below; every check block above expects ds_add_rtn_u32
1001
+ store i32 %val , ptr addrspace (1 ) %out ; writes the value returned by the atomic add to %out
1002
+ ret void
1003
+ }
1004
+
821
1005
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add (i32 , ptr addrspace (8 ), i32 , i32 , i32 immarg)
822
1006
823
1007
; from atomic_optimizations_buffer.ll
@@ -832,7 +1016,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
832
1016
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
833
1017
; GFX9-NEXT: ; implicit-def: $vgpr1
834
1018
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
835
- ; GFX9-NEXT: s_cbranch_execz .LBB6_2
1019
+ ; GFX9-NEXT: s_cbranch_execz .LBB7_2
836
1020
; GFX9-NEXT: ; %bb.1:
837
1021
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
838
1022
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -841,7 +1025,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
841
1025
; GFX9-NEXT: v_mov_b32_e32 v1, s4
842
1026
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
843
1027
; GFX9-NEXT: s_waitcnt vmcnt(0)
844
- ; GFX9-NEXT: .LBB6_2 :
1028
+ ; GFX9-NEXT: .LBB7_2 :
845
1029
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
846
1030
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
847
1031
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -860,7 +1044,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
860
1044
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
861
1045
; GFX90A-NEXT: ; implicit-def: $vgpr1
862
1046
; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
863
- ; GFX90A-NEXT: s_cbranch_execz .LBB6_2
1047
+ ; GFX90A-NEXT: s_cbranch_execz .LBB7_2
864
1048
; GFX90A-NEXT: ; %bb.1:
865
1049
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
866
1050
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
@@ -869,7 +1053,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
869
1053
; GFX90A-NEXT: v_mov_b32_e32 v1, s4
870
1054
; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
871
1055
; GFX90A-NEXT: s_waitcnt vmcnt(0)
872
- ; GFX90A-NEXT: .LBB6_2 :
1056
+ ; GFX90A-NEXT: .LBB7_2 :
873
1057
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
874
1058
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
875
1059
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
@@ -887,7 +1071,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
887
1071
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
888
1072
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
889
1073
; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
890
- ; GFX10-NEXT: s_cbranch_execz .LBB6_2
1074
+ ; GFX10-NEXT: s_cbranch_execz .LBB7_2
891
1075
; GFX10-NEXT: ; %bb.1:
892
1076
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
893
1077
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -896,7 +1080,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
896
1080
; GFX10-NEXT: v_mov_b32_e32 v1, s3
897
1081
; GFX10-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
898
1082
; GFX10-NEXT: s_waitcnt vmcnt(0)
899
- ; GFX10-NEXT: .LBB6_2 :
1083
+ ; GFX10-NEXT: .LBB7_2 :
900
1084
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
901
1085
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
902
1086
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -916,7 +1100,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
916
1100
; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
917
1101
; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1
918
1102
; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc
919
- ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB6_2
1103
+ ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2
920
1104
; GFX9-FLATSCR-NEXT: ; %bb.1:
921
1105
; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
922
1106
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
@@ -925,7 +1109,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
925
1109
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s4
926
1110
; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
927
1111
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
928
- ; GFX9-FLATSCR-NEXT: .LBB6_2 :
1112
+ ; GFX9-FLATSCR-NEXT: .LBB7_2 :
929
1113
; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3]
930
1114
; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
931
1115
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
@@ -944,7 +1128,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
944
1128
; GFX11-NEXT: ; implicit-def: $vgpr1
945
1129
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
946
1130
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
947
- ; GFX11-NEXT: s_cbranch_execz .LBB6_2
1131
+ ; GFX11-NEXT: s_cbranch_execz .LBB7_2
948
1132
; GFX11-NEXT: ; %bb.1:
949
1133
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
950
1134
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -954,7 +1138,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
954
1138
; GFX11-NEXT: v_mov_b32_e32 v1, s3
955
1139
; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
956
1140
; GFX11-NEXT: s_waitcnt vmcnt(0)
957
- ; GFX11-NEXT: .LBB6_2 :
1141
+ ; GFX11-NEXT: .LBB7_2 :
958
1142
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
959
1143
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
960
1144
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -974,7 +1158,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
974
1158
; GFX12-NEXT: ; implicit-def: $vgpr1
975
1159
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
976
1160
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
977
- ; GFX12-NEXT: s_cbranch_execz .LBB6_2
1161
+ ; GFX12-NEXT: s_cbranch_execz .LBB7_2
978
1162
; GFX12-NEXT: ; %bb.1:
979
1163
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
980
1164
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -984,7 +1168,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
984
1168
; GFX12-NEXT: v_mov_b32_e32 v1, s3
985
1169
; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
986
1170
; GFX12-NEXT: s_wait_loadcnt 0x0
987
- ; GFX12-NEXT: .LBB6_2 :
1171
+ ; GFX12-NEXT: .LBB7_2 :
988
1172
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2
989
1173
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
990
1174
; GFX12-NEXT: s_wait_kmcnt 0x0
0 commit comments