@@ -2479,8 +2479,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) {
2479
2479
; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
2480
2480
; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1
2481
2481
; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1
2482
- ; GFX1032-NEXT: s_ff1_i32_b32 s1, 0x80000000
2483
- ; GFX1032-NEXT: s_add_i32 s1, s1, 32
2482
+ ; GFX1032-NEXT: s_brev_b32 s1, 1
2484
2483
; GFX1032-NEXT: v_mul_hi_u32 v2, v1, v2
2485
2484
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2
2486
2485
; GFX1032-NEXT: v_mul_hi_u32 v1, v0, v1
@@ -2494,8 +2493,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) {
2494
2493
; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
2495
2494
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
2496
2495
; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1
2497
- ; GFX1032-NEXT: s_ff1_i32_b32 s0, s0
2498
- ; GFX1032-NEXT: s_min_u32 s0, s0, s1
2496
+ ; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1]
2499
2497
; GFX1032-NEXT: s_cmp_gt_u32 s0, 9
2500
2498
; GFX1032-NEXT: s_cselect_b32 s0, -1, 0
2501
2499
; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -2529,10 +2527,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) {
2529
2527
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2530
2528
; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1
2531
2529
; GFX1064-NEXT: s_bitset1_b32 s1, 31
2532
- ; GFX1064-NEXT: s_ff1_i32_b32 s0, s0
2533
- ; GFX1064-NEXT: s_ff1_i32_b32 s1, s1
2534
- ; GFX1064-NEXT: s_add_i32 s1, s1, 32
2535
- ; GFX1064-NEXT: s_min_u32 s0, s0, s1
2530
+ ; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1]
2536
2531
; GFX1064-NEXT: s_cmp_gt_u32 s0, 9
2537
2532
; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0
2538
2533
; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
@@ -2576,9 +2571,8 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
2576
2571
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
2577
2572
; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0
2578
2573
; GFX1032-NEXT: v_div_scale_f32 v4, vcc_lo, v0, s0, v0
2579
- ; GFX1032-NEXT: s_ff1_i32_b32 s1, 0x80000000
2574
+ ; GFX1032-NEXT: s_brev_b32 s1, 1
2580
2575
; GFX1032-NEXT: v_rcp_f32_e32 v2, v1
2581
- ; GFX1032-NEXT: s_add_i32 s1, s1, 32
2582
2576
; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0
2583
2577
; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2
2584
2578
; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v2
@@ -2592,8 +2586,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
2592
2586
; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
2593
2587
; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1
2594
2588
; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0
2595
- ; GFX1032-NEXT: s_ff1_i32_b32 s0, s0
2596
- ; GFX1032-NEXT: s_min_u32 s0, s0, s1
2589
+ ; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1]
2597
2590
; GFX1032-NEXT: s_cmp_gt_u32 s0, 9
2598
2591
; GFX1032-NEXT: s_cselect_b32 s0, -1, 0
2599
2592
; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -2609,26 +2602,23 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
2609
2602
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0
2610
2603
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2611
2604
; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0
2612
- ; GFX1064-NEXT: v_div_scale_f32 v4, vcc, v0, s2, v0
2613
2605
; GFX1064-NEXT: v_rcp_f32_e32 v2, v1
2614
2606
; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0
2615
2607
; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2
2616
- ; GFX1064-NEXT: v_mul_f32_e32 v3, v4, v2
2617
- ; GFX1064-NEXT: v_fma_f32 v5, -v1, v3, v4
2618
- ; GFX1064-NEXT: v_fmac_f32_e32 v3, v5, v2
2619
- ; GFX1064-NEXT: v_fma_f32 v1, -v1, v3, v4
2620
- ; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3
2608
+ ; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0
2609
+ ; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2
2610
+ ; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3
2611
+ ; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2
2612
+ ; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3
2613
+ ; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4
2621
2614
; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0
2622
2615
; GFX1064-NEXT: v_trunc_f32_e32 v1, v1
2623
2616
; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0
2624
2617
; GFX1064-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
2625
2618
; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1
2626
2619
; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
2627
2620
; GFX1064-NEXT: s_bitset1_b32 s1, 31
2628
- ; GFX1064-NEXT: s_ff1_i32_b32 s0, s0
2629
- ; GFX1064-NEXT: s_ff1_i32_b32 s1, s1
2630
- ; GFX1064-NEXT: s_add_i32 s1, s1, 32
2631
- ; GFX1064-NEXT: s_min_u32 s0, s0, s1
2621
+ ; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1]
2632
2622
; GFX1064-NEXT: s_cmp_gt_u32 s0, 9
2633
2623
; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0
2634
2624
; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
0 commit comments