@@ -421,38 +421,38 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
421
421
; GFX7: ; %bb.0:
422
422
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
423
423
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
424
+ ; GFX7-NEXT: v_mov_b32_e32 v7, v1
424
425
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
425
- ; GFX7-NEXT: v_cndmask_b32_e64 v2, -v4, -v2, vcc
426
- ; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
427
- ; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
426
+ ; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
427
+ ; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
428
428
; GFX7-NEXT: flat_store_dwordx2 v[5:6], v[0:1]
429
- ; GFX7-NEXT: v_mov_b32_e32 v1, v2
429
+ ; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
430
430
; GFX7-NEXT: s_waitcnt vmcnt(0)
431
431
; GFX7-NEXT: s_setpc_b64 s[30:31]
432
432
;
433
433
; GFX9-LABEL: fneg_xor_select_f64_multi_user:
434
434
; GFX9: ; %bb.0:
435
435
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
436
436
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
437
+ ; GFX9-NEXT: v_mov_b32_e32 v7, v1
437
438
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
438
- ; GFX9-NEXT: v_cndmask_b32_e64 v2, -v4, -v2, vcc
439
- ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
440
- ; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
439
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
440
+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
441
441
; GFX9-NEXT: global_store_dwordx2 v[5:6], v[0:1], off
442
- ; GFX9-NEXT: v_mov_b32_e32 v1, v2
442
+ ; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
443
443
; GFX9-NEXT: s_waitcnt vmcnt(0)
444
444
; GFX9-NEXT: s_setpc_b64 s[30:31]
445
445
;
446
446
; GFX11-LABEL: fneg_xor_select_f64_multi_user:
447
447
; GFX11: ; %bb.0:
448
448
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
449
449
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
450
- ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
451
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2 ) | instid1(VALU_DEP_2)
450
+ ; GFX11-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v0, 1, v0
451
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT ) | instid1(VALU_DEP_2)
452
452
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
453
- ; GFX11-NEXT: v_cndmask_b32_e64 v2, - v4, -v2, vcc_lo
454
- ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
455
- ; GFX11-NEXT: v_xor_b32_e32 v1 , 0x80000000, v2
453
+ ; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v2 :: v_dual_cndmask_b32 v0, v3, v7
454
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
455
+ ; GFX11-NEXT: v_xor_b32_e32 v2 , 0x80000000, v1
456
456
; GFX11-NEXT: global_store_b64 v[5:6], v[0:1], off
457
457
; GFX11-NEXT: v_mov_b32_e32 v1, v2
458
458
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -496,13 +496,14 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
496
496
; GCN: ; %bb.0:
497
497
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
498
498
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
499
- ; GCN-NEXT: v_and_b32_e32 v1, 1, v1
500
499
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
501
500
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
501
+ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1
502
502
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
503
- ; GCN-NEXT: v_cndmask_b32_e64 v2, -v3, -v5, vcc
503
+ ; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
504
+ ; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
504
505
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
505
- ; GCN-NEXT: v_cndmask_b32_e64 v1, - v2, v2 , vcc
506
+ ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3 , vcc
506
507
; GCN-NEXT: s_setpc_b64 s[30:31]
507
508
;
508
509
; GFX11-LABEL: select_fneg_select_fneg_f64:
@@ -511,13 +512,16 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
511
512
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
512
513
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
513
514
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
514
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
515
+ ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
516
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
515
517
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
516
- ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1
517
- ; GFX11-NEXT: v_cndmask_b32_e64 v2, - v3, - v5, vcc_lo
518
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2 ) | instskip(NEXT) | instid1(VALU_DEP_2)
518
+ ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
519
+ ; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
520
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4 ) | instskip(NEXT) | instid1(VALU_DEP_2)
519
521
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
520
- ; GFX11-NEXT: v_cndmask_b32_e64 v1, -v2, v2, vcc_lo
522
+ ; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
523
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
524
+ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
521
525
; GFX11-NEXT: s_setpc_b64 s[30:31]
522
526
%fneg0 = fneg double %arg0
523
527
%select0 = select i1 %cond0 , double %arg1 , double %fneg0
@@ -889,9 +893,10 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
889
893
; GCN-NEXT: v_and_b32_e32 v5, 1, v0
890
894
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
891
895
; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
892
- ; GCN-NEXT: v_cndmask_b32_e64 v1, -v2, -v4, vcc
896
+ ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
897
+ ; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
893
898
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0
894
- ; GCN-NEXT: v_cndmask_b32_e64 v1, - v1, v1 , vcc
899
+ ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2 , vcc
895
900
; GCN-NEXT: v_mov_b32_e32 v0, v3
896
901
; GCN-NEXT: s_setpc_b64 s[30:31]
897
902
;
@@ -903,11 +908,12 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
903
908
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
904
909
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5
905
910
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
906
- ; GFX11-NEXT: v_cndmask_b32_e64 v1, - v2, - v4, vcc_lo
911
+ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
907
912
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 1, v0
908
913
; GFX11-NEXT: v_mov_b32_e32 v0, v3
909
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
910
- ; GFX11-NEXT: v_cndmask_b32_e64 v1, -v1, v1, vcc_lo
914
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
915
+ ; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
916
+ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
911
917
; GFX11-NEXT: s_setpc_b64 s[30:31]
912
918
%i = and i32 %arg , 1
913
919
%i3 = icmp eq i32 %i , 0
@@ -1513,3 +1519,133 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_fo
1513
1519
%ret.1 = insertvalue { double , double } %ret.0 , double %other.bitcast.user , 1
1514
1520
ret { double , double } %ret.1
1515
1521
}
1522
+
1523
+ ; Check for correct bitcasting back when there are multiple uses: %a below is
+ ; used by both the fneg (%b) and the second select (%c), and %c is negated
+ ; again (%d) before being stored to %dst.
1524
+ define amdgpu_kernel void @multiple_uses_fneg_select_f64 (double %x , double %y , i1 %z , ptr addrspace (1 ) %dst ) {
1525
+ ; GFX7-LABEL: multiple_uses_fneg_select_f64:
1526
+ ; GFX7: ; %bb.0:
1527
+ ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x4
1528
+ ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1529
+ ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x6
1530
+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1531
+ ; GFX7-NEXT: s_bitcmp1_b32 s6, 0
1532
+ ; GFX7-NEXT: s_cselect_b64 vcc, -1, 0
1533
+ ; GFX7-NEXT: s_and_b64 s[6:7], vcc, exec
1534
+ ; GFX7-NEXT: v_mov_b32_e32 v0, s3
1535
+ ; GFX7-NEXT: v_mov_b32_e32 v1, s1
1536
+ ; GFX7-NEXT: s_cselect_b32 s1, s1, s3
1537
+ ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1538
+ ; GFX7-NEXT: s_cselect_b32 s0, s0, s2
1539
+ ; GFX7-NEXT: v_mov_b32_e32 v1, s1
1540
+ ; GFX7-NEXT: v_mov_b32_e32 v2, s4
1541
+ ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
1542
+ ; GFX7-NEXT: v_mov_b32_e32 v0, s0
1543
+ ; GFX7-NEXT: v_mov_b32_e32 v3, s5
1544
+ ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1545
+ ; GFX7-NEXT: s_endpgm
1546
+ ;
1547
+ ; GFX9-LABEL: multiple_uses_fneg_select_f64:
1548
+ ; GFX9: ; %bb.0:
1549
+ ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
1550
+ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1551
+ ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1552
+ ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x18
1553
+ ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1554
+ ; GFX9-NEXT: s_bitcmp1_b32 s6, 0
1555
+ ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
1556
+ ; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
1557
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s3
1558
+ ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1559
+ ; GFX9-NEXT: s_cselect_b32 s1, s1, s3
1560
+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1561
+ ; GFX9-NEXT: s_cselect_b32 s0, s0, s2
1562
+ ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1563
+ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
1564
+ ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1565
+ ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1566
+ ; GFX9-NEXT: s_endpgm
1567
+ ;
1568
+ ; GFX11-LABEL: multiple_uses_fneg_select_f64:
1569
+ ; GFX11: ; %bb.0:
1570
+ ; GFX11-NEXT: s_clause 0x2
1571
+ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1572
+ ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10
1573
+ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x18
1574
+ ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1575
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1576
+ ; GFX11-NEXT: v_mov_b32_e32 v0, s5
1577
+ ; GFX11-NEXT: s_bitcmp1_b32 s2, 0
1578
+ ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
1579
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1580
+ ; GFX11-NEXT: v_cndmask_b32_e32 v0, s7, v0, vcc_lo
1581
+ ; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
1582
+ ; GFX11-NEXT: s_cselect_b32 s2, s5, s7
1583
+ ; GFX11-NEXT: s_cselect_b32 s3, s4, s6
1584
+ ; GFX11-NEXT: v_cndmask_b32_e64 v1, s2, -v0, vcc_lo
1585
+ ; GFX11-NEXT: v_mov_b32_e32 v0, s3
1586
+ ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1587
+ ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1588
+ ; GFX11-NEXT: s_endpgm
1589
+ %a = select i1 %z , double %x , double %y
1590
+ %b = fneg double %a
1591
+ %c = select i1 %z , double %a , double %b
1592
+ %d = fneg double %c
1593
+ store double %d , ptr addrspace (1 ) %dst
1594
+ ret void
1595
+ }
1596
+
1597
+ ; The i32 select %.i2379 has two users: the bitcast to float %.i0436 (which in
+ ; turn feeds both the fneg %.i0440 and the second fcmp %i1) and the select
+ ; %.i2495. The fneg result is only consumed by the fmul %.i0592.
+ ; NOTE(review): "fnge" in the function name looks like a typo for "fneg"; it is
+ ; left unchanged here because the -LABEL check lines below match on it.
+ define amdgpu_kernel void @fnge_select_f32_multi_use_regression (float %.i2369 ) {
1598
+ ; GCN-LABEL: fnge_select_f32_multi_use_regression:
1599
+ ; GCN: ; %bb.0: ; %.entry
1600
+ ; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
1601
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1602
+ ; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 0
1603
+ ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1604
+ ; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
1605
+ ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
1606
+ ; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
1607
+ ; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
1608
+ ; GCN-NEXT: s_and_b64 vcc, exec, vcc
1609
+ ; GCN-NEXT: s_endpgm
1610
+ ;
1611
+ ; GFX11-LABEL: fnge_select_f32_multi_use_regression:
1612
+ ; GFX11: ; %bb.0: ; %.entry
1613
+ ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
1614
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1615
+ ; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, s0, 0
1616
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1617
+ ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
1618
+ ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0
1619
+ ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc_lo
1620
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1621
+ ; GFX11-NEXT: v_mul_f32_e64 v0, -v0, v1
1622
+ ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0, v0
1623
+ ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
1624
+ ; GFX11-NEXT: s_endpgm
1625
+ .entry:
1626
+ %i = fcmp uge float %.i2369 , 0 .000000e+00
1627
+ %.i2379 = select i1 %i , i32 1 , i32 0
1628
+ %.i0436 = bitcast i32 %.i2379 to float
1629
+ %.i0440 = fneg float %.i0436
1630
+ %i1 = fcmp uge float %.i0436 , 0 .000000e+00
1631
+ %.i2495 = select i1 %i1 , i32 %.i2379 , i32 0
1632
+ %.i0552 = bitcast i32 %.i2495 to float
1633
+ %.i0592 = fmul float %.i0440 , %.i0552
1634
+ %.i0721 = fcmp ogt float %.i0592 , 0 .000000e+00
1635
+ br i1 %.i0721 , label %bb5 , label %bb
1636
+
1637
+ bb: ; preds = %.entry
1638
+ %i2 = call <2 x i32 > @llvm.amdgcn.s.buffer.load.v2i32 (<4 x i32 > zeroinitializer , i32 1 , i32 0 )
1639
+ %i3 = shufflevector <2 x i32 > %i2 , <2 x i32 > zeroinitializer , <4 x i32 > <i32 0 , i32 1 , i32 undef , i32 undef >
1640
+ %i4 = bitcast <4 x i32 > %i3 to <4 x float >
1641
+ %.i0753 = extractelement <4 x float > %i4 , i64 0
1642
+ br label %bb5
1643
+
1644
+ bb5: ; preds = %bb, %.entry
1645
+ ret void
1646
+ }
1647
+
1648
+
1649
+ ; Intrinsic called from block %bb of @fnge_select_f32_multi_use_regression;
+ ; attribute group #0 below is the one attached to this declaration.
+ declare <2 x i32 > @llvm.amdgcn.s.buffer.load.v2i32 (<4 x i32 >, i32 , i32 immarg) #0
1650
+
1651
+ attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
0 commit comments