@@ -4618,58 +4618,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
4618
4618
; GFX12-NEXT: s_wait_samplecnt 0x0
4619
4619
; GFX12-NEXT: s_wait_bvhcnt 0x0
4620
4620
; GFX12-NEXT: s_wait_kmcnt 0x0
4621
- ; GFX12-NEXT: v_mov_b32_e32 v2, v0
4622
- ; GFX12-NEXT: v_mov_b32_e32 v0, s4
4623
- ; GFX12-NEXT: s_addk_co_i32 s4, 0x400
4624
- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4625
- ; GFX12-NEXT: v_mov_b32_e32 v3, s4
4626
- ; GFX12-NEXT: s_mov_b32 s4, 0
4627
- ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
4628
- ; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
4629
- ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
4630
- ; GFX12-NEXT: s_wait_loadcnt 0x0
4631
- ; GFX12-NEXT: v_mov_b32_e32 v5, v0
4621
+ ; GFX12-NEXT: v_mov_b32_e32 v1, s4
4632
4622
; GFX12-NEXT: s_wait_storecnt 0x0
4633
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4634
- ; GFX12-NEXT: v_pk_add_f16 v4, v5, v2
4635
- ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
4636
- ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
4623
+ ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
4637
4624
; GFX12-NEXT: s_wait_loadcnt 0x0
4638
4625
; GFX12-NEXT: global_inv scope:SCOPE_DEV
4639
- ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
4640
- ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
4641
- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4642
- ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
4643
- ; GFX12-NEXT: s_cbranch_execnz .LBB12_1
4644
- ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
4645
- ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
4646
4626
; GFX12-NEXT: s_setpc_b64 s[30:31]
4647
4627
;
4648
4628
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
4649
4629
; GFX940: ; %bb.0:
4650
4630
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4651
- ; GFX940-NEXT: v_mov_b32_e32 v2, v0
4652
- ; GFX940-NEXT: v_mov_b32_e32 v0, s4
4653
- ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
4654
- ; GFX940-NEXT: s_add_i32 s6, s4, 0x400
4655
- ; GFX940-NEXT: s_mov_b64 s[4:5], 0
4656
- ; GFX940-NEXT: v_mov_b32_e32 v3, s6
4657
- ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start
4658
- ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
4659
- ; GFX940-NEXT: s_waitcnt vmcnt(0)
4660
- ; GFX940-NEXT: v_mov_b32_e32 v5, v0
4661
- ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
4631
+ ; GFX940-NEXT: v_mov_b32_e32 v1, s4
4662
4632
; GFX940-NEXT: buffer_wbl2 sc1
4663
- ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
4664
- ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
4633
+ ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0
4665
4634
; GFX940-NEXT: s_waitcnt vmcnt(0)
4666
4635
; GFX940-NEXT: buffer_inv sc1
4667
- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
4668
- ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4669
- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
4670
- ; GFX940-NEXT: s_cbranch_execnz .LBB12_1
4671
- ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
4672
- ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
4673
4636
; GFX940-NEXT: s_setpc_b64 s[30:31]
4674
4637
;
4675
4638
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
@@ -4735,27 +4698,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
4735
4698
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
4736
4699
; GFX90A: ; %bb.0:
4737
4700
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4738
- ; GFX90A-NEXT: v_mov_b32_e32 v2, v0
4739
- ; GFX90A-NEXT: v_mov_b32_e32 v0, s8
4740
- ; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
4741
- ; GFX90A-NEXT: s_add_i32 s10, s8, 0x400
4742
- ; GFX90A-NEXT: s_mov_b64 s[8:9], 0
4743
- ; GFX90A-NEXT: v_mov_b32_e32 v3, s10
4744
- ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
4745
- ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
4746
- ; GFX90A-NEXT: s_waitcnt vmcnt(0)
4747
- ; GFX90A-NEXT: v_mov_b32_e32 v5, v0
4748
- ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
4749
- ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
4750
- ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
4701
+ ; GFX90A-NEXT: v_mov_b32_e32 v1, s8
4702
+ ; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc
4751
4703
; GFX90A-NEXT: s_waitcnt vmcnt(0)
4752
4704
; GFX90A-NEXT: buffer_wbinvl1
4753
- ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
4754
- ; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
4755
- ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
4756
- ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
4757
- ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
4758
- ; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
4759
4705
; GFX90A-NEXT: s_setpc_b64 s[30:31]
4760
4706
;
4761
4707
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
@@ -4921,56 +4867,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
4921
4867
; GFX12-NEXT: s_wait_bvhcnt 0x0
4922
4868
; GFX12-NEXT: s_wait_kmcnt 0x0
4923
4869
; GFX12-NEXT: v_mov_b32_e32 v1, s4
4924
- ; GFX12-NEXT: s_addk_co_i32 s4, 0x400
4925
- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4926
- ; GFX12-NEXT: v_mov_b32_e32 v3, s4
4927
- ; GFX12-NEXT: s_mov_b32 s4, 0
4928
- ; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024
4929
- ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
4930
- ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
4931
- ; GFX12-NEXT: s_wait_loadcnt 0x0
4932
- ; GFX12-NEXT: v_pk_add_f16 v1, v2, v0
4933
- ; GFX12-NEXT: v_mov_b32_e32 v5, v2
4934
4870
; GFX12-NEXT: s_wait_storecnt 0x0
4935
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
4936
- ; GFX12-NEXT: v_mov_b32_e32 v4, v1
4937
- ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
4938
- ; GFX12-NEXT: s_wait_loadcnt 0x0
4871
+ ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
4872
+ ; GFX12-NEXT: s_wait_storecnt 0x0
4939
4873
; GFX12-NEXT: global_inv scope:SCOPE_DEV
4940
- ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
4941
- ; GFX12-NEXT: v_mov_b32_e32 v2, v4
4942
- ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
4943
- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4944
- ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
4945
- ; GFX12-NEXT: s_cbranch_execnz .LBB13_1
4946
- ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
4947
- ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
4948
4874
; GFX12-NEXT: s_setpc_b64 s[30:31]
4949
4875
;
4950
4876
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
4951
4877
; GFX940: ; %bb.0:
4952
4878
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4953
4879
; GFX940-NEXT: v_mov_b32_e32 v1, s4
4954
- ; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
4955
- ; GFX940-NEXT: s_add_i32 s6, s4, 0x400
4956
- ; GFX940-NEXT: s_mov_b64 s[4:5], 0
4957
- ; GFX940-NEXT: v_mov_b32_e32 v1, s6
4958
- ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
4959
- ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
4960
- ; GFX940-NEXT: s_waitcnt vmcnt(0)
4961
- ; GFX940-NEXT: v_pk_add_f16 v2, v3, v0
4962
4880
; GFX940-NEXT: buffer_wbl2 sc1
4963
- ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
4964
- ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
4881
+ ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024
4965
4882
; GFX940-NEXT: s_waitcnt vmcnt(0)
4966
4883
; GFX940-NEXT: buffer_inv sc1
4967
- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
4968
- ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4969
- ; GFX940-NEXT: v_mov_b32_e32 v3, v4
4970
- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
4971
- ; GFX940-NEXT: s_cbranch_execnz .LBB13_1
4972
- ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
4973
- ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
4974
4884
; GFX940-NEXT: s_setpc_b64 s[30:31]
4975
4885
;
4976
4886
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
@@ -5036,25 +4946,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
5036
4946
; GFX90A: ; %bb.0:
5037
4947
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5038
4948
; GFX90A-NEXT: v_mov_b32_e32 v1, s8
5039
- ; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen offset:1024
5040
- ; GFX90A-NEXT: s_add_i32 s10, s8, 0x400
5041
- ; GFX90A-NEXT: s_mov_b64 s[8:9], 0
5042
- ; GFX90A-NEXT: v_mov_b32_e32 v1, s10
5043
- ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
5044
- ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
5045
- ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5046
- ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0
5047
- ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
5048
- ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc
4949
+ ; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
5049
4950
; GFX90A-NEXT: s_waitcnt vmcnt(0)
5050
4951
; GFX90A-NEXT: buffer_wbinvl1
5051
- ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
5052
- ; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5053
- ; GFX90A-NEXT: v_mov_b32_e32 v3, v4
5054
- ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
5055
- ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
5056
- ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
5057
- ; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
5058
4952
; GFX90A-NEXT: s_setpc_b64 s[30:31]
5059
4953
;
5060
4954
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
@@ -5217,8 +5111,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
5217
5111
; GFX12-NEXT: s_wait_samplecnt 0x0
5218
5112
; GFX12-NEXT: s_wait_bvhcnt 0x0
5219
5113
; GFX12-NEXT: s_wait_kmcnt 0x0
5220
- ; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
5221
5114
; GFX12-NEXT: s_mov_b32 s1, exec_lo
5115
+ ; GFX12-NEXT: s_wait_storecnt 0x0
5222
5116
; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
5223
5117
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
5224
5118
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -5230,59 +5124,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
5230
5124
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5231
5125
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
5232
5126
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
5233
- ; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024
5127
+ ; GFX12-NEXT: s_wait_loadcnt 0x0
5128
+ ; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
5129
+ ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
5234
5130
; GFX12-NEXT: ; implicit-def: $vgpr4
5235
5131
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
5236
5132
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
5237
5133
; GFX12-NEXT: ; %bb.2:
5238
5134
; GFX12-NEXT: s_mov_b32 exec_lo, s1
5239
- ; GFX12-NEXT: s_mov_b32 s1, 0
5240
- ; GFX12-NEXT: .LBB14_3: ; %atomicrmw.start
5241
- ; GFX12-NEXT: ; =>This Loop Header: Depth=1
5242
- ; GFX12-NEXT: ; Child Loop BB14_4 Depth 2
5243
- ; GFX12-NEXT: s_wait_loadcnt 0x0
5244
- ; GFX12-NEXT: v_pk_add_f16 v7, v8, v5
5245
- ; GFX12-NEXT: s_mov_b32 s2, exec_lo
5246
- ; GFX12-NEXT: s_wait_storecnt 0x0
5247
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
5248
- ; GFX12-NEXT: v_mov_b32_e32 v6, v7
5249
- ; GFX12-NEXT: v_mov_b32_e32 v7, v8
5250
- ; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5251
- ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
5252
- ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
5253
- ; GFX12-NEXT: v_readfirstlane_b32 s5, v1
5254
- ; GFX12-NEXT: v_readfirstlane_b32 s6, v2
5255
- ; GFX12-NEXT: v_readfirstlane_b32 s7, v3
5256
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5257
- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
5258
- ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
5259
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5260
- ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
5261
- ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
5262
- ; GFX12-NEXT: s_wait_loadcnt 0x0
5263
- ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
5264
- ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
5265
- ; GFX12-NEXT: s_cbranch_execnz .LBB14_4
5266
- ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5267
- ; GFX12-NEXT: s_mov_b32 exec_lo, s2
5268
5135
; GFX12-NEXT: s_wait_loadcnt 0x0
5269
- ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
5270
- ; GFX12-NEXT: v_mov_b32_e32 v8, v6
5136
+ ; GFX12-NEXT: v_mov_b32_e32 v0, v5
5271
5137
; GFX12-NEXT: global_inv scope:SCOPE_DEV
5272
- ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
5273
- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5274
- ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
5275
- ; GFX12-NEXT: s_cbranch_execnz .LBB14_3
5276
- ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
5277
- ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
5278
- ; GFX12-NEXT: v_mov_b32_e32 v0, v6
5279
5138
; GFX12-NEXT: s_setpc_b64 s[30:31]
5280
5139
;
5281
5140
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
5282
5141
; GFX940: ; %bb.0:
5283
5142
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5284
- ; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4
5285
5143
; GFX940-NEXT: s_mov_b64 s[2:3], exec
5144
+ ; GFX940-NEXT: buffer_wbl2 sc1
5286
5145
; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
5287
5146
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
5288
5147
; GFX940-NEXT: v_readfirstlane_b32 s5, v1
@@ -5293,48 +5152,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
5293
5152
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
5294
5153
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
5295
5154
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
5296
- ; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
5155
+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
5156
+ ; GFX940-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0
5157
+ ; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
5297
5158
; GFX940-NEXT: ; implicit-def: $vgpr4
5298
5159
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
5299
5160
; GFX940-NEXT: s_cbranch_execnz .LBB14_1
5300
5161
; GFX940-NEXT: ; %bb.2:
5301
5162
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
5302
- ; GFX940-NEXT: s_mov_b64 s[2:3], 0
5303
- ; GFX940-NEXT: .LBB14_3: ; %atomicrmw.start
5304
- ; GFX940-NEXT: ; =>This Loop Header: Depth=1
5305
- ; GFX940-NEXT: ; Child Loop BB14_4 Depth 2
5306
- ; GFX940-NEXT: s_waitcnt vmcnt(0)
5307
- ; GFX940-NEXT: v_pk_add_f16 v8, v9, v5
5308
- ; GFX940-NEXT: s_mov_b64 s[8:9], exec
5309
- ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
5310
- ; GFX940-NEXT: buffer_wbl2 sc1
5311
- ; GFX940-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5312
- ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
5313
- ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
5314
- ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
5315
- ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
5316
- ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
5317
- ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
5318
- ; GFX940-NEXT: s_nop 0
5319
- ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
5320
- ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
5321
- ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
5322
- ; GFX940-NEXT: s_waitcnt vmcnt(0)
5323
- ; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0
5324
- ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
5325
- ; GFX940-NEXT: s_cbranch_execnz .LBB14_4
5326
- ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5327
- ; GFX940-NEXT: s_mov_b64 exec, s[8:9]
5328
5163
; GFX940-NEXT: s_waitcnt vmcnt(0)
5329
- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
5330
- ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5331
- ; GFX940-NEXT: v_mov_b32_e32 v9, v6
5164
+ ; GFX940-NEXT: v_mov_b32_e32 v0, v5
5332
5165
; GFX940-NEXT: buffer_inv sc1
5333
- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
5334
- ; GFX940-NEXT: s_cbranch_execnz .LBB14_3
5335
- ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
5336
- ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
5337
- ; GFX940-NEXT: v_mov_b32_e32 v0, v6
5338
5166
; GFX940-NEXT: s_setpc_b64 s[30:31]
5339
5167
;
5340
5168
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
@@ -5468,7 +5296,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
5468
5296
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
5469
5297
; GFX90A: ; %bb.0:
5470
5298
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5471
- ; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4
5472
5299
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
5473
5300
; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
5474
5301
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -5479,47 +5306,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
5479
5306
; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5480
5307
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5481
5308
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5482
- ; GFX90A-NEXT: s_nop 0
5483
- ; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
5309
+ ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5310
+ ; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc
5311
+ ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
5484
5312
; GFX90A-NEXT: ; implicit-def: $vgpr4
5485
5313
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
5486
5314
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
5487
5315
; GFX90A-NEXT: ; %bb.2:
5488
5316
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
5489
- ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
5490
- ; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.start
5491
- ; GFX90A-NEXT: ; =>This Loop Header: Depth=1
5492
- ; GFX90A-NEXT: ; Child Loop BB14_4 Depth 2
5493
5317
; GFX90A-NEXT: s_waitcnt vmcnt(0)
5494
- ; GFX90A-NEXT: v_pk_add_f16 v8, v9, v5
5495
- ; GFX90A-NEXT: s_mov_b64 s[12:13], exec
5496
- ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
5497
- ; GFX90A-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5498
- ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
5499
- ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
5500
- ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
5501
- ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
5502
- ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
5503
- ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5504
- ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5505
- ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5506
- ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5507
- ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5508
- ; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc
5509
- ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
5510
- ; GFX90A-NEXT: s_cbranch_execnz .LBB14_4
5511
- ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5512
- ; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
5513
- ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5514
- ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
5515
- ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5516
- ; GFX90A-NEXT: v_mov_b32_e32 v9, v6
5318
+ ; GFX90A-NEXT: v_mov_b32_e32 v0, v5
5517
5319
; GFX90A-NEXT: buffer_wbinvl1
5518
- ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
5519
- ; GFX90A-NEXT: s_cbranch_execnz .LBB14_3
5520
- ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
5521
- ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
5522
- ; GFX90A-NEXT: v_mov_b32_e32 v0, v6
5523
5320
; GFX90A-NEXT: s_setpc_b64 s[30:31]
5524
5321
;
5525
5322
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
0 commit comments