@@ -2281,13 +2281,14 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
2281
2281
; GFX8-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
2282
2282
; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v1
2283
2283
; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2284
- ; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
2285
- ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
2286
- ; GFX8-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2287
- ; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
2288
- ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], vcc
2289
- ; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
2290
- ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
2284
+ ; GFX8-NEXT: v_and_b32_e32 v8, 1, v6
2285
+ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
2286
+ ; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2287
+ ; GFX8-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[0:1]|, v[4:5]
2288
+ ; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[4:5]
2289
+ ; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v6, v4
2290
+ ; GFX8-NEXT: s_or_b64 vcc, s[6:7], vcc
2291
+ ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
2291
2292
; GFX8-NEXT: v_or_b32_e32 v5, v4, v7
2292
2293
; GFX8-NEXT: v_bfe_u32 v4, v4, 16, 1
2293
2294
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
@@ -2310,14 +2311,15 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
2310
2311
; GFX9-NEXT: s_waitcnt vmcnt(0)
2311
2312
; GFX9-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
2312
2313
; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2313
- ; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
2314
- ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
2315
- ; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2314
+ ; GFX9-NEXT: v_and_b32_e32 v7, 1, v6
2315
+ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
2316
2316
; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
2317
- ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
2318
- ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
2317
+ ; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2319
2318
; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
2320
- ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
2319
+ ; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
2320
+ ; GFX9-NEXT: s_or_b64 vcc, s[4:5], vcc
2321
+ ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
2322
+ ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
2321
2323
; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4
2322
2324
; GFX9-NEXT: v_bfe_u32 v4, v4, 16, 1
2323
2325
; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9
@@ -2335,15 +2337,16 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
2335
2337
; GFX10-NEXT: s_waitcnt vmcnt(0)
2336
2338
; GFX10-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
2337
2339
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2338
- ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
2339
- ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
2340
- ; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
2340
+ ; GFX10-NEXT: v_and_b32_e32 v7, 1, v6
2341
+ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
2341
2342
; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |v[0:1]|, v[4:5]
2342
- ; GFX10-NEXT: s_or_b32 s4, s4, vcc_lo
2343
- ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2343
+ ; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
2344
2344
; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5
2345
- ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s4
2345
+ ; GFX10-NEXT: s_or_b32 vcc_lo, s4, vcc_lo
2346
2346
; GFX10-NEXT: s_mov_b32 s4, 0x400000
2347
+ ; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4
2348
+ ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
2349
+ ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2347
2350
; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
2348
2351
; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1
2349
2352
; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
@@ -2360,23 +2363,24 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
2360
2363
; GFX11-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
2361
2364
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2362
2365
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2363
- ; GFX11-NEXT: v_and_b32_e32 v6 , 1, v6
2364
- ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
2366
+ ; GFX11-NEXT: v_and_b32_e32 v7 , 1, v6
2367
+ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
2365
2368
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2366
- ; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
2367
2369
; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
2368
- ; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo
2369
- ; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2370
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2370
+ ; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
2371
2371
; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
2372
- ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, 0, s0
2372
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2373
+ ; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
2373
2374
; GFX11-NEXT: s_mov_b32 s0, 0x400000
2374
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2375
+ ; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
2376
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2377
+ ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
2378
+ ; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2375
2379
; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
2376
2380
; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
2381
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2377
2382
; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
2378
2383
; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v5, s0
2379
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2380
2384
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
2381
2385
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
2382
2386
; GFX11-NEXT: s_setpc_b64 s[30:31]
0 commit comments