@@ -1807,6 +1807,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
1807
1807
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1808
1808
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
1809
1809
; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2
1810
+ ; GFX12-NEXT: s_wait_alu 0xfffd
1810
1811
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
1811
1812
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
1812
1813
; GFX12-NEXT: s_endpgm
@@ -1940,6 +1941,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
1940
1941
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
1941
1942
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
1942
1943
; GFX12-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2
1944
+ ; GFX12-NEXT: s_wait_alu 0xfffd
1943
1945
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
1944
1946
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
1945
1947
; GFX12-NEXT: s_endpgm
@@ -2072,6 +2074,7 @@ define amdgpu_kernel void @fcmp_v2f16_le(
2072
2074
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2073
2075
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2074
2076
; GFX12-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2
2077
+ ; GFX12-NEXT: s_wait_alu 0xfffd
2075
2078
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2076
2079
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2077
2080
; GFX12-NEXT: s_endpgm
@@ -2204,6 +2207,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
2204
2207
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2205
2208
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2206
2209
; GFX12-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2
2210
+ ; GFX12-NEXT: s_wait_alu 0xfffd
2207
2211
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2208
2212
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2209
2213
; GFX12-NEXT: s_endpgm
@@ -2337,6 +2341,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
2337
2341
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2338
2342
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2339
2343
; GFX12-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2
2344
+ ; GFX12-NEXT: s_wait_alu 0xfffd
2340
2345
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2341
2346
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2342
2347
; GFX12-NEXT: s_endpgm
@@ -2470,6 +2475,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
2470
2475
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2471
2476
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2472
2477
; GFX12-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2
2478
+ ; GFX12-NEXT: s_wait_alu 0xfffd
2473
2479
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2474
2480
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2475
2481
; GFX12-NEXT: s_endpgm
@@ -2603,6 +2609,7 @@ define amdgpu_kernel void @fcmp_v2f16_o(
2603
2609
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2604
2610
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2605
2611
; GFX12-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2
2612
+ ; GFX12-NEXT: s_wait_alu 0xfffd
2606
2613
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2607
2614
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2608
2615
; GFX12-NEXT: s_endpgm
@@ -2736,6 +2743,7 @@ define amdgpu_kernel void @fcmp_v2f16_u(
2736
2743
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2737
2744
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2738
2745
; GFX12-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2
2746
+ ; GFX12-NEXT: s_wait_alu 0xfffd
2739
2747
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2740
2748
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2741
2749
; GFX12-NEXT: s_endpgm
@@ -2868,6 +2876,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
2868
2876
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
2869
2877
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2870
2878
; GFX12-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2
2879
+ ; GFX12-NEXT: s_wait_alu 0xfffd
2871
2880
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
2872
2881
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
2873
2882
; GFX12-NEXT: s_endpgm
@@ -3000,6 +3009,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
3000
3009
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3001
3010
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
3002
3011
; GFX12-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2
3012
+ ; GFX12-NEXT: s_wait_alu 0xfffd
3003
3013
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3004
3014
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
3005
3015
; GFX12-NEXT: s_endpgm
@@ -3133,6 +3143,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
3133
3143
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3134
3144
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
3135
3145
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
3146
+ ; GFX12-NEXT: s_wait_alu 0xfffd
3136
3147
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3137
3148
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
3138
3149
; GFX12-NEXT: s_endpgm
@@ -3265,6 +3276,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
3265
3276
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3266
3277
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
3267
3278
; GFX12-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
3279
+ ; GFX12-NEXT: s_wait_alu 0xfffd
3268
3280
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3269
3281
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
3270
3282
; GFX12-NEXT: s_endpgm
@@ -3397,6 +3409,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
3397
3409
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3398
3410
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
3399
3411
; GFX12-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2
3412
+ ; GFX12-NEXT: s_wait_alu 0xfffd
3400
3413
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3401
3414
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
3402
3415
; GFX12-NEXT: s_endpgm
@@ -3529,6 +3542,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
3529
3542
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3530
3543
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
3531
3544
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2
3545
+ ; GFX12-NEXT: s_wait_alu 0xfffd
3532
3546
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3533
3547
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
3534
3548
; GFX12-NEXT: s_endpgm
0 commit comments