Skip to content

Commit 2fce50e

Browse files
committed
AMDGPU: Fix assertion with multiple uses of f64 fneg of select
A bitcast needs to be inserted back to the original type. Just skip the multiple use case for a safer quick fix. Handling the multiple use case seems to be beneficial in some but not all cases.
1 parent b74aeac commit 2fce50e

File tree

2 files changed

+169
-33
lines changed

2 files changed

+169
-33
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4185,9 +4185,13 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
41854185
return Result;
41864186
}
41874187

4188-
if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32) {
4188+
if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4189+
BCSrc.hasOneUse()) {
41894190
// fneg (f32 (bitcast (select cond, i32:lhs, i32:rhs))) ->
41904191
// select cond, (fneg (bitcast i32:lhs to f32)), (fneg (bitcast i32:rhs to f32))
4192+
4193+
// TODO: Casting the result back for the multiple-use case is beneficial in some cases.
4194+
41914195
SDValue LHS =
41924196
DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
41934197
SDValue RHS =
@@ -4196,12 +4200,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
41964200
SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
41974201
SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
41984202

4199-
SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, MVT::f32,
4200-
BCSrc.getOperand(0), NegLHS, NegRHS);
4201-
if (!BCSrc.hasOneUse())
4202-
DAG.ReplaceAllUsesWith(BCSrc,
4203-
DAG.getNode(ISD::FNEG, SL, VT, NewSelect));
4204-
return NewSelect;
4203+
return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4204+
NegRHS);
42054205
}
42064206

42074207
return SDValue();

llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll

Lines changed: 162 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -421,38 +421,38 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
421421
; GFX7: ; %bb.0:
422422
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
423423
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
424+
; GFX7-NEXT: v_mov_b32_e32 v7, v1
424425
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
425-
; GFX7-NEXT: v_cndmask_b32_e64 v2, -v4, -v2, vcc
426-
; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
427-
; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
426+
; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
427+
; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
428428
; GFX7-NEXT: flat_store_dwordx2 v[5:6], v[0:1]
429-
; GFX7-NEXT: v_mov_b32_e32 v1, v2
429+
; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
430430
; GFX7-NEXT: s_waitcnt vmcnt(0)
431431
; GFX7-NEXT: s_setpc_b64 s[30:31]
432432
;
433433
; GFX9-LABEL: fneg_xor_select_f64_multi_user:
434434
; GFX9: ; %bb.0:
435435
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
436436
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
437+
; GFX9-NEXT: v_mov_b32_e32 v7, v1
437438
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
438-
; GFX9-NEXT: v_cndmask_b32_e64 v2, -v4, -v2, vcc
439-
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
440-
; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
439+
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
440+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
441441
; GFX9-NEXT: global_store_dwordx2 v[5:6], v[0:1], off
442-
; GFX9-NEXT: v_mov_b32_e32 v1, v2
442+
; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
443443
; GFX9-NEXT: s_waitcnt vmcnt(0)
444444
; GFX9-NEXT: s_setpc_b64 s[30:31]
445445
;
446446
; GFX11-LABEL: fneg_xor_select_f64_multi_user:
447447
; GFX11: ; %bb.0:
448448
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
449449
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
450-
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
451-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
450+
; GFX11-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v0, 1, v0
451+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
452452
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
453-
; GFX11-NEXT: v_cndmask_b32_e64 v2, -v4, -v2, vcc_lo
454-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
455-
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
453+
; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v2 :: v_dual_cndmask_b32 v0, v3, v7
454+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
455+
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
456456
; GFX11-NEXT: global_store_b64 v[5:6], v[0:1], off
457457
; GFX11-NEXT: v_mov_b32_e32 v1, v2
458458
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -496,13 +496,14 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
496496
; GCN: ; %bb.0:
497497
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
498498
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
499-
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
500499
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
501500
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
501+
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
502502
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
503-
; GCN-NEXT: v_cndmask_b32_e64 v2, -v3, -v5, vcc
503+
; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
504+
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
504505
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
505-
; GCN-NEXT: v_cndmask_b32_e64 v1, -v2, v2, vcc
506+
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
506507
; GCN-NEXT: s_setpc_b64 s[30:31]
507508
;
508509
; GFX11-LABEL: select_fneg_select_fneg_f64:
@@ -511,13 +512,16 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
511512
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
512513
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
513514
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
514-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
515+
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
516+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
515517
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
516-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1
517-
; GFX11-NEXT: v_cndmask_b32_e64 v2, -v3, -v5, vcc_lo
518-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
518+
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
519+
; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
520+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
519521
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
520-
; GFX11-NEXT: v_cndmask_b32_e64 v1, -v2, v2, vcc_lo
522+
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
523+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
524+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
521525
; GFX11-NEXT: s_setpc_b64 s[30:31]
522526
%fneg0 = fneg double %arg0
523527
%select0 = select i1 %cond0, double %arg1, double %fneg0
@@ -889,9 +893,10 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
889893
; GCN-NEXT: v_and_b32_e32 v5, 1, v0
890894
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
891895
; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
892-
; GCN-NEXT: v_cndmask_b32_e64 v1, -v2, -v4, vcc
896+
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
897+
; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
893898
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0
894-
; GCN-NEXT: v_cndmask_b32_e64 v1, -v1, v1, vcc
899+
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
895900
; GCN-NEXT: v_mov_b32_e32 v0, v3
896901
; GCN-NEXT: s_setpc_b64 s[30:31]
897902
;
@@ -903,11 +908,12 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
903908
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
904909
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5
905910
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
906-
; GFX11-NEXT: v_cndmask_b32_e64 v1, -v2, -v4, vcc_lo
911+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
907912
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 1, v0
908913
; GFX11-NEXT: v_mov_b32_e32 v0, v3
909-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
910-
; GFX11-NEXT: v_cndmask_b32_e64 v1, -v1, v1, vcc_lo
914+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
915+
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
916+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
911917
; GFX11-NEXT: s_setpc_b64 s[30:31]
912918
%i = and i32 %arg, 1
913919
%i3 = icmp eq i32 %i, 0
@@ -1513,3 +1519,133 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_fo
15131519
%ret.1 = insertvalue { double, double } %ret.0, double %other.bitcast.user, 1
15141520
ret { double, double } %ret.1
15151521
}
1522+
1523+
; Check for correct bitcasting back when there are multiple uses
1524+
define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i1 %z, ptr addrspace(1) %dst) {
1525+
; GFX7-LABEL: multiple_uses_fneg_select_f64:
1526+
; GFX7: ; %bb.0:
1527+
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x4
1528+
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1529+
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x6
1530+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1531+
; GFX7-NEXT: s_bitcmp1_b32 s6, 0
1532+
; GFX7-NEXT: s_cselect_b64 vcc, -1, 0
1533+
; GFX7-NEXT: s_and_b64 s[6:7], vcc, exec
1534+
; GFX7-NEXT: v_mov_b32_e32 v0, s3
1535+
; GFX7-NEXT: v_mov_b32_e32 v1, s1
1536+
; GFX7-NEXT: s_cselect_b32 s1, s1, s3
1537+
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1538+
; GFX7-NEXT: s_cselect_b32 s0, s0, s2
1539+
; GFX7-NEXT: v_mov_b32_e32 v1, s1
1540+
; GFX7-NEXT: v_mov_b32_e32 v2, s4
1541+
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
1542+
; GFX7-NEXT: v_mov_b32_e32 v0, s0
1543+
; GFX7-NEXT: v_mov_b32_e32 v3, s5
1544+
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1545+
; GFX7-NEXT: s_endpgm
1546+
;
1547+
; GFX9-LABEL: multiple_uses_fneg_select_f64:
1548+
; GFX9: ; %bb.0:
1549+
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
1550+
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1551+
; GFX9-NEXT: v_mov_b32_e32 v2, 0
1552+
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x18
1553+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1554+
; GFX9-NEXT: s_bitcmp1_b32 s6, 0
1555+
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
1556+
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
1557+
; GFX9-NEXT: v_mov_b32_e32 v0, s3
1558+
; GFX9-NEXT: v_mov_b32_e32 v1, s1
1559+
; GFX9-NEXT: s_cselect_b32 s1, s1, s3
1560+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1561+
; GFX9-NEXT: s_cselect_b32 s0, s0, s2
1562+
; GFX9-NEXT: v_mov_b32_e32 v1, s1
1563+
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
1564+
; GFX9-NEXT: v_mov_b32_e32 v0, s0
1565+
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
1566+
; GFX9-NEXT: s_endpgm
1567+
;
1568+
; GFX11-LABEL: multiple_uses_fneg_select_f64:
1569+
; GFX11: ; %bb.0:
1570+
; GFX11-NEXT: s_clause 0x2
1571+
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1572+
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10
1573+
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x18
1574+
; GFX11-NEXT: v_mov_b32_e32 v2, 0
1575+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1576+
; GFX11-NEXT: v_mov_b32_e32 v0, s5
1577+
; GFX11-NEXT: s_bitcmp1_b32 s2, 0
1578+
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
1579+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
1580+
; GFX11-NEXT: v_cndmask_b32_e32 v0, s7, v0, vcc_lo
1581+
; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
1582+
; GFX11-NEXT: s_cselect_b32 s2, s5, s7
1583+
; GFX11-NEXT: s_cselect_b32 s3, s4, s6
1584+
; GFX11-NEXT: v_cndmask_b32_e64 v1, s2, -v0, vcc_lo
1585+
; GFX11-NEXT: v_mov_b32_e32 v0, s3
1586+
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1587+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1588+
; GFX11-NEXT: s_endpgm
1589+
%a = select i1 %z, double %x, double %y
1590+
%b = fneg double %a
1591+
%c = select i1 %z, double %a, double %b
1592+
%d = fneg double %c
1593+
store double %d, ptr addrspace(1) %dst
1594+
ret void
1595+
}
1596+
1597+
define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) {
1598+
; GCN-LABEL: fnge_select_f32_multi_use_regression:
1599+
; GCN: ; %bb.0: ; %.entry
1600+
; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
1601+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
1602+
; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 0
1603+
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1604+
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
1605+
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
1606+
; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1
1607+
; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
1608+
; GCN-NEXT: s_and_b64 vcc, exec, vcc
1609+
; GCN-NEXT: s_endpgm
1610+
;
1611+
; GFX11-LABEL: fnge_select_f32_multi_use_regression:
1612+
; GFX11: ; %bb.0: ; %.entry
1613+
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
1614+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1615+
; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, s0, 0
1616+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1617+
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
1618+
; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0
1619+
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc_lo
1620+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1621+
; GFX11-NEXT: v_mul_f32_e64 v0, -v0, v1
1622+
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0, v0
1623+
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
1624+
; GFX11-NEXT: s_endpgm
1625+
.entry:
1626+
%i = fcmp uge float %.i2369, 0.000000e+00
1627+
%.i2379 = select i1 %i, i32 1, i32 0
1628+
%.i0436 = bitcast i32 %.i2379 to float
1629+
%.i0440 = fneg float %.i0436
1630+
%i1 = fcmp uge float %.i0436, 0.000000e+00
1631+
%.i2495 = select i1 %i1, i32 %.i2379, i32 0
1632+
%.i0552 = bitcast i32 %.i2495 to float
1633+
%.i0592 = fmul float %.i0440, %.i0552
1634+
%.i0721 = fcmp ogt float %.i0592, 0.000000e+00
1635+
br i1 %.i0721, label %bb5, label %bb
1636+
1637+
bb: ; preds = %.entry
1638+
%i2 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> zeroinitializer, i32 1, i32 0)
1639+
%i3 = shufflevector <2 x i32> %i2, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1640+
%i4 = bitcast <4 x i32> %i3 to <4 x float>
1641+
%.i0753 = extractelement <4 x float> %i4, i64 0
1642+
br label %bb5
1643+
1644+
bb5: ; preds = %bb, %.entry
1645+
ret void
1646+
}
1647+
1648+
1649+
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #0
1650+
1651+
attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }

0 commit comments

Comments
 (0)