Skip to content

[X86][StrictFP] Combine fcmp + select to fmin/fmax for some predicates #109512

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34176,10 +34176,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMAXS)
NODE_NAME_CASE(FMAX_SAE)
NODE_NAME_CASE(FMAXS_SAE)
NODE_NAME_CASE(STRICT_FMAX)
NODE_NAME_CASE(FMIN)
NODE_NAME_CASE(FMINS)
NODE_NAME_CASE(FMIN_SAE)
NODE_NAME_CASE(FMINS_SAE)
NODE_NAME_CASE(STRICT_FMIN)
NODE_NAME_CASE(FMAXC)
NODE_NAME_CASE(FMINC)
NODE_NAME_CASE(FRSQRT)
Expand Down Expand Up @@ -46494,17 +46496,21 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) &&
(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
if ((Cond.getOpcode() == ISD::SETCC ||
Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you can do this with STRICT_FSETCC because the SSE min/max instructions signal on quiet NaNs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are correct, removed.

VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
!isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget.hasSSE2() ||
(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
bool IsStrict = Cond->isStrictFPOpcode();
ISD::CondCode CC =
cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);

unsigned Opcode = 0;
// Check for x CC y ? x : y.
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
switch (CC) {
default: break;
case ISD::SETULT:
Expand Down Expand Up @@ -46572,8 +46578,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
break;
}
// Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
DAG.isEqualTo(RHS, Cond.getOperand(0))) {
} else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
switch (CC) {
default: break;
case ISD::SETOGE:
Expand Down Expand Up @@ -46638,8 +46643,17 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}

if (Opcode)
if (Opcode) {
if (IsStrict) {
SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
: X86ISD::STRICT_FMAX,
DL, {N->getValueType(0), MVT::Other},
{Cond.getOperand(0), LHS, RHS});
DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
return Ret;
}
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
}

// Some mask scalar intrinsics rely on checking if only one bit is set
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -850,6 +850,10 @@ namespace llvm {
// Perform an FP80 add after changing precision control in FPCW.
STRICT_FP80_ADD,

/// Floating point max and min.
STRICT_FMAX,
STRICT_FMIN,

// WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
// non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.

Expand Down
8 changes: 4 additions & 4 deletions llvm/lib/Target/X86/X86InstrAVX512.td
Original file line number Diff line number Diff line change
Expand Up @@ -5395,7 +5395,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode, SDNode VecNode, SDNode SaeNode,
SDPatternOperator OpNode, SDNode VecNode, SDNode SaeNode,
X86FoldableSchedWrite sched, bit IsCommutable> {
let ExeDomain = _.ExeDomain in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
Expand Down Expand Up @@ -5458,7 +5458,7 @@ multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDPatternOperator
T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>;
}

multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode VecNode, SDNode SaeNode,
X86SchedWriteSizes sched, bit IsCommutable> {
defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
Expand All @@ -5481,9 +5481,9 @@ defm VSUB : avx512_binop_s_round<0x5C, "vsub", any_fsub, X86fsubs, X86fsubRnds,
SchedWriteFAddSizes, 0>;
defm VDIV : avx512_binop_s_round<0x5E, "vdiv", any_fdiv, X86fdivs, X86fdivRnds,
SchedWriteFDivSizes, 0>;
defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminSAEs,
defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86any_fmin, X86fmins, X86fminSAEs,
SchedWriteFCmpSizes, 0>;
defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs,
defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86any_fmax, X86fmaxs, X86fmaxSAEs,
SchedWriteFCmpSizes, 0>;

// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,18 @@ def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp,
def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;

def X86strict_fmin : SDNode<"X86ISD::STRICT_FMIN", SDTFPBinOp,
[SDNPHasChain]>;
def X86strict_fmax : SDNode<"X86ISD::STRICT_FMAX", SDTFPBinOp,
[SDNPHasChain]>;

def X86any_fmin : PatFrags<(ops node:$src1, node:$src2),
[(X86strict_fmin node:$src1, node:$src2),
(X86fmin node:$src1, node:$src2)]>;
def X86any_fmax : PatFrags<(ops node:$src1, node:$src2),
[(X86strict_fmax node:$src1, node:$src2),
(X86fmax node:$src1, node:$src2)]>;

def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
Expand Down
8 changes: 4 additions & 4 deletions llvm/lib/Target/X86/X86InstrSSE.td
Original file line number Diff line number Diff line change
Expand Up @@ -2730,11 +2730,11 @@ let isCommutable = 0 in {
defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86any_fmax, SchedWriteFCmpSizes>,
basic_sse12_fp_binop_s<0x5F, "max", X86any_fmax, SchedWriteFCmpSizes>,
basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86any_fmin, SchedWriteFCmpSizes>,
basic_sse12_fp_binop_s<0x5D, "min", X86any_fmin, SchedWriteFCmpSizes>,
basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}

Expand Down
149 changes: 148 additions & 1 deletion llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4202,7 +4202,154 @@ define void @foo(float %0, float %1) #0 {
}
declare dso_local void @bar()

attributes #0 = { strictfp }
define float @fcmp_select_ogt(float %f1, float %f2) #0 {
; SSE-32-LABEL: fcmp_select_ogt:
; SSE-32: # %bb.0:
; SSE-32-NEXT: pushl %eax
; SSE-32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-32-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
; SSE-32-NEXT: movss %xmm0, (%esp)
; SSE-32-NEXT: flds (%esp)
; SSE-32-NEXT: wait
; SSE-32-NEXT: popl %eax
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: fcmp_select_ogt:
; SSE-64: # %bb.0:
; SSE-64-NEXT: maxss %xmm1, %xmm0
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: fcmp_select_ogt:
; AVX-32: # %bb.0:
; AVX-32-NEXT: pushl %eax
; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-32-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vmovss %xmm0, (%esp)
; AVX-32-NEXT: flds (%esp)
; AVX-32-NEXT: wait
; AVX-32-NEXT: popl %eax
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: fcmp_select_ogt:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; X87-LABEL: fcmp_select_ogt:
; X87: # %bb.0:
; X87-NEXT: flds {{[0-9]+}}(%esp)
; X87-NEXT: flds {{[0-9]+}}(%esp)
; X87-NEXT: fcom %st(1)
; X87-NEXT: wait
; X87-NEXT: fnstsw %ax
; X87-NEXT: # kill: def $ah killed $ah killed $ax
; X87-NEXT: sahf
; X87-NEXT: ja .LBB57_2
; X87-NEXT: # %bb.1:
; X87-NEXT: fstp %st(0)
; X87-NEXT: fldz
; X87-NEXT: fxch %st(1)
; X87-NEXT: .LBB57_2:
; X87-NEXT: fstp %st(1)
; X87-NEXT: wait
; X87-NEXT: retl
;
; X87-CMOV-LABEL: fcmp_select_ogt:
; X87-CMOV: # %bb.0:
; X87-CMOV-NEXT: flds {{[0-9]+}}(%esp)
; X87-CMOV-NEXT: flds {{[0-9]+}}(%esp)
; X87-CMOV-NEXT: fcomi %st(1), %st
; X87-CMOV-NEXT: fxch %st(1)
; X87-CMOV-NEXT: fcmovnbe %st(1), %st
; X87-CMOV-NEXT: fstp %st(1)
; X87-CMOV-NEXT: wait
; X87-CMOV-NEXT: retl
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(
float %f1, float %f2, metadata !"ogt",
metadata !"fpexcept.strict")
%res = select i1 %cond, float %f1, float %f2
ret float %res
}

define double @fcmp_select_ule(double %f1, double %f2) #0 {
; SSE-32-LABEL: fcmp_select_ule:
; SSE-32: # %bb.0:
; SSE-32-NEXT: pushl %ebp
; SSE-32-NEXT: movl %esp, %ebp
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $8, %esp
; SSE-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-32-NEXT: minsd 8(%ebp), %xmm0
; SSE-32-NEXT: movsd %xmm0, (%esp)
; SSE-32-NEXT: fldl (%esp)
; SSE-32-NEXT: wait
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: fcmp_select_ule:
; SSE-64: # %bb.0:
; SSE-64-NEXT: minsd %xmm0, %xmm1
; SSE-64-NEXT: movapd %xmm1, %xmm0
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: fcmp_select_ule:
; AVX-32: # %bb.0:
; AVX-32-NEXT: pushl %ebp
; AVX-32-NEXT: movl %esp, %ebp
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $8, %esp
; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-32-NEXT: vminsd 8(%ebp), %xmm0, %xmm0
; AVX-32-NEXT: vmovsd %xmm0, (%esp)
; AVX-32-NEXT: fldl (%esp)
; AVX-32-NEXT: wait
; AVX-32-NEXT: movl %ebp, %esp
; AVX-32-NEXT: popl %ebp
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: fcmp_select_ule:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vminsd %xmm0, %xmm1, %xmm0
; AVX-64-NEXT: retq
;
; X87-LABEL: fcmp_select_ule:
; X87: # %bb.0:
; X87-NEXT: fldl {{[0-9]+}}(%esp)
; X87-NEXT: fldl {{[0-9]+}}(%esp)
; X87-NEXT: fcom %st(1)
; X87-NEXT: wait
; X87-NEXT: fnstsw %ax
; X87-NEXT: # kill: def $ah killed $ah killed $ax
; X87-NEXT: sahf
; X87-NEXT: jbe .LBB58_2
; X87-NEXT: # %bb.1:
; X87-NEXT: fstp %st(0)
; X87-NEXT: fldz
; X87-NEXT: fxch %st(1)
; X87-NEXT: .LBB58_2:
; X87-NEXT: fstp %st(1)
; X87-NEXT: wait
; X87-NEXT: retl
;
; X87-CMOV-LABEL: fcmp_select_ule:
; X87-CMOV: # %bb.0:
; X87-CMOV-NEXT: fldl {{[0-9]+}}(%esp)
; X87-CMOV-NEXT: fldl {{[0-9]+}}(%esp)
; X87-CMOV-NEXT: fcomi %st(1), %st
; X87-CMOV-NEXT: fxch %st(1)
; X87-CMOV-NEXT: fcmovbe %st(1), %st
; X87-CMOV-NEXT: fstp %st(1)
; X87-CMOV-NEXT: wait
; X87-CMOV-NEXT: retl
%cond = call i1 @llvm.experimental.constrained.fcmps.f64(
double %f1, double %f2, metadata !"ule",
metadata !"fpexcept.strict")
%res = select i1 %cond, double %f1, double %f2
ret double %res
}

attributes #0 = { nounwind strictfp }

declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata)
declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata)
Expand Down
Loading