Skip to content

Commit 8bdc32c

Browse files
committed
Tighten up the code for performFNegCombine to limit the applicable types
1 parent 9550d6a commit 8bdc32c

File tree

3 files changed

+36
-28
lines changed

3 files changed

+36
-28
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5066,15 +5066,19 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
50665066
}
50675067
case ISD::SELECT: {
50685068
// fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5069-
// This combine became necessary recently to prevent a regression after v2i32 xor was made legal.
5070-
// When adding this combine a case was added to performFNEGCombine to prevent this combine from
5071-
// being undone under certain conditions.
5069+
// This combine became necessary recently to prevent a regression caused by
5070+
// this patch legalising v2i32 xor. When adding this combine a case was
5071+
// added to performFNEGCombine to prevent this combine from being undone
5072+
// under certain conditions.
50725073
// TODO: Invert conditions of foldFreeOpFromSelect
50735074
SDValue Cond = N0.getOperand(0);
50745075
SDValue LHS = N0.getOperand(1);
50755076
SDValue RHS = N0.getOperand(2);
50765077
EVT LHVT = LHS.getValueType();
50775078
EVT RHVT = RHS.getValueType();
5079+
// The regression was limited to the i32 and v2i32 types.
5080+
if(RHVT != MVT::i32 && LHVT != MVT::i32)
5081+
return SDValue();
50785082

50795083
SDValue LFNeg = DAG.getNode(ISD::FNEG, SL, LHVT, LHS);
50805084
SDValue RFNeg = DAG.getNode(ISD::FNEG, SL, RHVT, RHS);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5940,7 +5940,6 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
59405940
// Enable lowering of ROTR for vxi32 types. This is a workaround for a
59415941
// regression caused by legalising v2i32 or.
59425942
SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
5943-
unsigned Opc = Op.getOpcode();
59445943
EVT VT = Op.getValueType();
59455944

59465945
assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||

llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -416,22 +416,19 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
416416
; GCN: ; %bb.0:
417417
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
418418
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
419-
; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
420-
; GCN-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
421419
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
422420
; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
423-
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
421+
; GCN-NEXT: v_cndmask_b32_e64 v1, -v4, -v2, vcc
424422
; GCN-NEXT: s_setpc_b64 s[30:31]
425423
;
426424
; GFX11-LABEL: fneg_xor_select_f64:
427425
; GFX11: ; %bb.0:
428426
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
429-
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
430-
; GFX11-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
431427
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
432-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
428+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
433429
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
434-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
430+
; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
431+
; GFX11-NEXT: v_cndmask_b32_e64 v1, -v4, -v2, vcc_lo
435432
; GFX11-NEXT: s_setpc_b64 s[30:31]
436433
%select = select i1 %cond, double %arg0, double %arg1
437434
%fneg = fneg double %select
@@ -1642,16 +1639,19 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
16421639
; GFX7-NEXT: s_add_i32 s12, s12, s17
16431640
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
16441641
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1645-
; GFX7-NEXT: s_and_b32 s6, 1, s6
1642+
; GFX7-NEXT: s_bitcmp1_b32 s6, 0
1643+
; GFX7-NEXT: s_cselect_b64 vcc, -1, 0
1644+
; GFX7-NEXT: s_and_b64 s[6:7], vcc, exec
1645+
; GFX7-NEXT: v_mov_b32_e32 v0, s3
1646+
; GFX7-NEXT: v_mov_b32_e32 v1, s1
16461647
; GFX7-NEXT: s_cselect_b32 s1, s1, s3
1647-
; GFX7-NEXT: s_xor_b32 s3, s1, 0x80000000
1648-
; GFX7-NEXT: s_cmp_eq_u32 s6, 1
1648+
; GFX7-NEXT: v_cndmask_b32_e64 v0, -v0, -v1, vcc
16491649
; GFX7-NEXT: s_cselect_b32 s0, s0, s2
1650-
; GFX7-NEXT: s_cselect_b32 s1, s3, s1
1650+
; GFX7-NEXT: v_mov_b32_e32 v1, s1
16511651
; GFX7-NEXT: v_mov_b32_e32 v2, s4
16521652
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
1653+
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
16531654
; GFX7-NEXT: v_mov_b32_e32 v0, s0
1654-
; GFX7-NEXT: v_mov_b32_e32 v1, s1
16551655
; GFX7-NEXT: v_mov_b32_e32 v3, s5
16561656
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
16571657
; GFX7-NEXT: s_endpgm
@@ -1663,32 +1663,37 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
16631663
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18
16641664
; GFX9-NEXT: v_mov_b32_e32 v2, 0
16651665
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1666-
; GFX9-NEXT: s_and_b32 s6, 1, s6
1666+
; GFX9-NEXT: s_bitcmp1_b32 s6, 0
1667+
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
1668+
; GFX9-NEXT: s_and_b64 s[6:7], vcc, exec
1669+
; GFX9-NEXT: v_mov_b32_e32 v0, s3
1670+
; GFX9-NEXT: v_mov_b32_e32 v1, s1
16671671
; GFX9-NEXT: s_cselect_b32 s1, s1, s3
1668-
; GFX9-NEXT: s_xor_b32 s3, s1, 0x80000000
1669-
; GFX9-NEXT: s_cmp_eq_u32 s6, 1
1672+
; GFX9-NEXT: v_cndmask_b32_e64 v0, -v0, -v1, vcc
16701673
; GFX9-NEXT: s_cselect_b32 s0, s0, s2
1671-
; GFX9-NEXT: s_cselect_b32 s1, s3, s1
1672-
; GFX9-NEXT: v_mov_b32_e32 v0, s0
16731674
; GFX9-NEXT: v_mov_b32_e32 v1, s1
1675+
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
1676+
; GFX9-NEXT: v_mov_b32_e32 v0, s0
16741677
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
16751678
; GFX9-NEXT: s_endpgm
16761679
;
16771680
; GFX11-LABEL: multiple_uses_fneg_select_f64:
16781681
; GFX11: ; %bb.0:
16791682
; GFX11-NEXT: s_clause 0x2
1680-
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x10
16811683
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1684+
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x10
16821685
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x18
16831686
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1684-
; GFX11-NEXT: s_and_b32 s6, 1, s6
1687+
; GFX11-NEXT: v_mov_b32_e32 v0, s1
1688+
; GFX11-NEXT: s_bitcmp1_b32 s6, 0
1689+
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
1690+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1691+
; GFX11-NEXT: v_cndmask_b32_e64 v0, -s3, -v0, vcc_lo
1692+
; GFX11-NEXT: s_and_b32 s6, vcc_lo, exec_lo
16851693
; GFX11-NEXT: s_cselect_b32 s1, s1, s3
1686-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
1687-
; GFX11-NEXT: s_xor_b32 s3, s1, 0x80000000
1688-
; GFX11-NEXT: s_cmp_eq_u32 s6, 1
16891694
; GFX11-NEXT: s_cselect_b32 s0, s0, s2
1690-
; GFX11-NEXT: s_cselect_b32 s1, s3, s1
1691-
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
1695+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1696+
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, s1, v0
16921697
; GFX11-NEXT: v_mov_b32_e32 v0, s0
16931698
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
16941699
; GFX11-NEXT: s_endpgm

0 commit comments

Comments
 (0)