@@ -893,6 +893,7 @@ define void @store_load_vindex_foo(i32 %idx) {
893
893
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
894
894
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
895
895
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
896
+ ; GFX12-NEXT: s_wait_storecnt 0x0
896
897
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
897
898
; GFX12-NEXT: s_wait_storecnt 0x0
898
899
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
@@ -964,6 +965,7 @@ define void @store_load_vindex_foo(i32 %idx) {
964
965
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
965
966
; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2)
966
967
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
968
+ ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
967
969
; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
968
970
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
969
971
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
@@ -2137,6 +2139,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
2137
2139
; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
2138
2140
; GFX12-NEXT: s_wait_loadcnt 0x0
2139
2141
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
2142
+ ; GFX12-NEXT: s_wait_storecnt 0x0
2140
2143
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
2141
2144
; GFX12-NEXT: s_wait_storecnt 0x0
2142
2145
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
@@ -2221,6 +2224,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
2221
2224
; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
2222
2225
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
2223
2226
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
2227
+ ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
2224
2228
; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
2225
2229
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
2226
2230
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
@@ -3382,6 +3386,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
3382
3386
; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
3383
3387
; GFX12-NEXT: s_wait_loadcnt 0x0
3384
3388
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
3389
+ ; GFX12-NEXT: s_wait_storecnt 0x0
3385
3390
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
3386
3391
; GFX12-NEXT: s_wait_storecnt 0x0
3387
3392
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
@@ -3468,6 +3473,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
3468
3473
; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
3469
3474
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
3470
3475
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
3476
+ ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
3471
3477
; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
3472
3478
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
3473
3479
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
@@ -3714,6 +3720,7 @@ define void @store_load_large_imm_offset_foo() {
3714
3720
; GFX12-NEXT: s_wait_bvhcnt 0x0
3715
3721
; GFX12-NEXT: s_wait_kmcnt 0x0
3716
3722
; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
3723
+ ; GFX12-NEXT: s_wait_storecnt 0x0
3717
3724
; GFX12-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
3718
3725
; GFX12-NEXT: s_wait_storecnt 0x0
3719
3726
; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
@@ -3789,6 +3796,7 @@ define void @store_load_large_imm_offset_foo() {
3789
3796
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
3790
3797
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
3791
3798
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
3799
+ ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
3792
3800
; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
3793
3801
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
3794
3802
; GFX12-PAL-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
@@ -3998,6 +4006,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
3998
4006
; GFX12-NEXT: s_wait_bvhcnt 0x0
3999
4007
; GFX12-NEXT: s_wait_kmcnt 0x0
4000
4008
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4009
+ ; GFX12-NEXT: s_wait_storecnt 0x0
4001
4010
; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
4002
4011
; GFX12-NEXT: s_wait_storecnt 0x0
4003
4012
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4055,6 +4064,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
4055
4064
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
4056
4065
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
4057
4066
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4067
+ ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
4058
4068
; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
4059
4069
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
4060
4070
; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4107,6 +4117,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
4107
4117
; GFX12-NEXT: s_wait_bvhcnt 0x0
4108
4118
; GFX12-NEXT: s_wait_kmcnt 0x0
4109
4119
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4120
+ ; GFX12-NEXT: s_wait_storecnt 0x0
4110
4121
; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
4111
4122
; GFX12-NEXT: s_wait_storecnt 0x0
4112
4123
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4164,6 +4175,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
4164
4175
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
4165
4176
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
4166
4177
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4178
+ ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
4167
4179
; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
4168
4180
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
4169
4181
; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4220,6 +4232,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
4220
4232
; GFX12-NEXT: s_wait_kmcnt 0x0
4221
4233
; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
4222
4234
; GFX12-NEXT: v_mov_b32_e32 v3, 3
4235
+ ; GFX12-NEXT: s_wait_storecnt 0x0
4223
4236
; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
4224
4237
; GFX12-NEXT: s_wait_storecnt 0x0
4225
4238
; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
@@ -4282,6 +4295,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
4282
4295
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
4283
4296
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
4284
4297
; GFX12-PAL-NEXT: v_mov_b32_e32 v3, 3
4298
+ ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
4285
4299
; GFX12-PAL-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
4286
4300
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
4287
4301
; GFX12-PAL-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
@@ -4340,6 +4354,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
4340
4354
; GFX12-NEXT: s_wait_kmcnt 0x0
4341
4355
; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
4342
4356
; GFX12-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
4357
+ ; GFX12-NEXT: s_wait_storecnt 0x0
4343
4358
; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
4344
4359
; GFX12-NEXT: s_wait_storecnt 0x0
4345
4360
; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
@@ -4405,6 +4420,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
4405
4420
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
4406
4421
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
4407
4422
; GFX12-PAL-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
4423
+ ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
4408
4424
; GFX12-PAL-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
4409
4425
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
4410
4426
; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
@@ -4456,6 +4472,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
4456
4472
; GFX12-NEXT: s_wait_bvhcnt 0x0
4457
4473
; GFX12-NEXT: s_wait_kmcnt 0x0
4458
4474
; GFX12-NEXT: v_mov_b32_e32 v1, 1
4475
+ ; GFX12-NEXT: s_wait_storecnt 0x0
4459
4476
; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
4460
4477
; GFX12-NEXT: s_wait_storecnt 0x0
4461
4478
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
@@ -4523,6 +4540,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
4523
4540
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
4524
4541
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
4525
4542
; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1
4543
+ ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
4526
4544
; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
4527
4545
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
4528
4546
; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
@@ -4576,6 +4594,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
4576
4594
; GFX12-NEXT: s_wait_bvhcnt 0x0
4577
4595
; GFX12-NEXT: s_wait_kmcnt 0x0
4578
4596
; GFX12-NEXT: v_mov_b32_e32 v1, 1
4597
+ ; GFX12-NEXT: s_wait_storecnt 0x0
4579
4598
; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
4580
4599
; GFX12-NEXT: s_wait_storecnt 0x0
4581
4600
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
@@ -4644,6 +4663,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
4644
4663
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
4645
4664
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
4646
4665
; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1
4666
+ ; GFX12-PAL-NEXT: s_wait_storecnt 0x0
4647
4667
; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
4648
4668
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
4649
4669
; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
0 commit comments