Skip to content

Commit 2df7aa0

Browse files
phoebewang authored and DanielCChen committed
[X86][StrictFP] Combine fcmp + select to fmin/fmax for some predicates (llvm#109512)
The X86 maxss/minss family of instructions does not convert an SNaN into a QNaN, so an fcmp + select pair can be combined into one of these instructions for some predicates.
1 parent 2824ef7 commit 2df7aa0

File tree

6 files changed

+195
-18
lines changed

6 files changed

+195
-18
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34219,10 +34219,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
3421934219
NODE_NAME_CASE(FMAXS)
3422034220
NODE_NAME_CASE(FMAX_SAE)
3422134221
NODE_NAME_CASE(FMAXS_SAE)
34222+
NODE_NAME_CASE(STRICT_FMAX)
3422234223
NODE_NAME_CASE(FMIN)
3422334224
NODE_NAME_CASE(FMINS)
3422434225
NODE_NAME_CASE(FMIN_SAE)
3422534226
NODE_NAME_CASE(FMINS_SAE)
34227+
NODE_NAME_CASE(STRICT_FMIN)
3422634228
NODE_NAME_CASE(FMAXC)
3422734229
NODE_NAME_CASE(FMINC)
3422834230
NODE_NAME_CASE(FRSQRT)
@@ -46461,17 +46463,21 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
4646146463
// x<=y?x:y, because of how they handle negative zero (which can be
4646246464
// ignored in unsafe-math mode).
4646346465
// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
46464-
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
46465-
VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) &&
46466-
(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
46466+
if ((Cond.getOpcode() == ISD::SETCC ||
46467+
Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
46468+
VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
46469+
!isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
4646746470
(Subtarget.hasSSE2() ||
4646846471
(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
46469-
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46472+
bool IsStrict = Cond->isStrictFPOpcode();
46473+
ISD::CondCode CC =
46474+
cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
46475+
SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
46476+
SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
4647046477

4647146478
unsigned Opcode = 0;
4647246479
// Check for x CC y ? x : y.
46473-
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
46474-
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
46480+
if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
4647546481
switch (CC) {
4647646482
default: break;
4647746483
case ISD::SETULT:
@@ -46539,8 +46545,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
4653946545
break;
4654046546
}
4654146547
// Check for x CC y ? y : x -- a min/max with reversed arms.
46542-
} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
46543-
DAG.isEqualTo(RHS, Cond.getOperand(0))) {
46548+
} else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
4654446549
switch (CC) {
4654546550
default: break;
4654646551
case ISD::SETOGE:
@@ -46605,8 +46610,17 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
4660546610
}
4660646611
}
4660746612

46608-
if (Opcode)
46613+
if (Opcode) {
46614+
if (IsStrict) {
46615+
SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
46616+
: X86ISD::STRICT_FMAX,
46617+
DL, {N->getValueType(0), MVT::Other},
46618+
{Cond.getOperand(0), LHS, RHS});
46619+
DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
46620+
return Ret;
46621+
}
4660946622
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
46623+
}
4661046624
}
4661146625

4661246626
// Some mask scalar intrinsics rely on checking if only one bit is set

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -850,6 +850,10 @@ namespace llvm {
850850
// Perform an FP80 add after changing precision control in FPCW.
851851
STRICT_FP80_ADD,
852852

853+
/// Strict floating point max and min (chained counterparts of FMAX/FMIN).
854+
STRICT_FMAX,
855+
STRICT_FMIN,
856+
853857
// WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
854858
// non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.
855859

llvm/lib/Target/X86/X86InstrAVX512.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5395,7 +5395,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
53955395
EVEX_B, EVEX_RC, Sched<[sched]>;
53965396
}
53975397
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
5398-
SDNode OpNode, SDNode VecNode, SDNode SaeNode,
5398+
SDPatternOperator OpNode, SDNode VecNode, SDNode SaeNode,
53995399
X86FoldableSchedWrite sched, bit IsCommutable> {
54005400
let ExeDomain = _.ExeDomain in {
54015401
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -5458,7 +5458,7 @@ multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDPatternOperator
54585458
T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>;
54595459
}
54605460

5461-
multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
5461+
multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
54625462
SDNode VecNode, SDNode SaeNode,
54635463
X86SchedWriteSizes sched, bit IsCommutable> {
54645464
defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
@@ -5481,9 +5481,9 @@ defm VSUB : avx512_binop_s_round<0x5C, "vsub", any_fsub, X86fsubs, X86fsubRnds,
54815481
SchedWriteFAddSizes, 0>;
54825482
defm VDIV : avx512_binop_s_round<0x5E, "vdiv", any_fdiv, X86fdivs, X86fdivRnds,
54835483
SchedWriteFDivSizes, 0>;
5484-
defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminSAEs,
5484+
defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86any_fmin, X86fmins, X86fminSAEs,
54855485
SchedWriteFCmpSizes, 0>;
5486-
defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs,
5486+
defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86any_fmax, X86fmaxs, X86fmaxSAEs,
54875487
SchedWriteFCmpSizes, 0>;
54885488

54895489
// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use

llvm/lib/Target/X86/X86InstrFragmentsSIMD.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,18 @@ def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp,
4646
def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp,
4747
[SDNPCommutative, SDNPAssociative]>;
4848

49+
def X86strict_fmin : SDNode<"X86ISD::STRICT_FMIN", SDTFPBinOp,
50+
[SDNPHasChain]>;
51+
def X86strict_fmax : SDNode<"X86ISD::STRICT_FMAX", SDTFPBinOp,
52+
[SDNPHasChain]>;
53+
54+
def X86any_fmin : PatFrags<(ops node:$src1, node:$src2),
55+
[(X86strict_fmin node:$src1, node:$src2),
56+
(X86fmin node:$src1, node:$src2)]>;
57+
def X86any_fmax : PatFrags<(ops node:$src1, node:$src2),
58+
[(X86strict_fmax node:$src1, node:$src2),
59+
(X86fmax node:$src1, node:$src2)]>;
60+
4961
def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
5062
[SDNPCommutative, SDNPAssociative]>;
5163
def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2730,11 +2730,11 @@ let isCommutable = 0 in {
27302730
defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
27312731
basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
27322732
basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2733-
defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2734-
basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2733+
defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86any_fmax, SchedWriteFCmpSizes>,
2734+
basic_sse12_fp_binop_s<0x5F, "max", X86any_fmax, SchedWriteFCmpSizes>,
27352735
basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2736-
defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2737-
basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2736+
defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86any_fmin, SchedWriteFCmpSizes>,
2737+
basic_sse12_fp_binop_s<0x5D, "min", X86any_fmin, SchedWriteFCmpSizes>,
27382738
basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
27392739
}
27402740

llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll

Lines changed: 148 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4202,7 +4202,154 @@ define void @foo(float %0, float %1) #0 {
42024202
}
42034203
declare dso_local void @bar()
42044204

4205-
attributes #0 = { strictfp }
4205+
define float @fcmp_select_ogt(float %f1, float %f2) #0 {
4206+
; SSE-32-LABEL: fcmp_select_ogt:
4207+
; SSE-32: # %bb.0:
4208+
; SSE-32-NEXT: pushl %eax
4209+
; SSE-32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
4210+
; SSE-32-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
4211+
; SSE-32-NEXT: movss %xmm0, (%esp)
4212+
; SSE-32-NEXT: flds (%esp)
4213+
; SSE-32-NEXT: wait
4214+
; SSE-32-NEXT: popl %eax
4215+
; SSE-32-NEXT: retl
4216+
;
4217+
; SSE-64-LABEL: fcmp_select_ogt:
4218+
; SSE-64: # %bb.0:
4219+
; SSE-64-NEXT: maxss %xmm1, %xmm0
4220+
; SSE-64-NEXT: retq
4221+
;
4222+
; AVX-32-LABEL: fcmp_select_ogt:
4223+
; AVX-32: # %bb.0:
4224+
; AVX-32-NEXT: pushl %eax
4225+
; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
4226+
; AVX-32-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm0
4227+
; AVX-32-NEXT: vmovss %xmm0, (%esp)
4228+
; AVX-32-NEXT: flds (%esp)
4229+
; AVX-32-NEXT: wait
4230+
; AVX-32-NEXT: popl %eax
4231+
; AVX-32-NEXT: retl
4232+
;
4233+
; AVX-64-LABEL: fcmp_select_ogt:
4234+
; AVX-64: # %bb.0:
4235+
; AVX-64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
4236+
; AVX-64-NEXT: retq
4237+
;
4238+
; X87-LABEL: fcmp_select_ogt:
4239+
; X87: # %bb.0:
4240+
; X87-NEXT: flds {{[0-9]+}}(%esp)
4241+
; X87-NEXT: flds {{[0-9]+}}(%esp)
4242+
; X87-NEXT: fcom %st(1)
4243+
; X87-NEXT: wait
4244+
; X87-NEXT: fnstsw %ax
4245+
; X87-NEXT: # kill: def $ah killed $ah killed $ax
4246+
; X87-NEXT: sahf
4247+
; X87-NEXT: ja .LBB57_2
4248+
; X87-NEXT: # %bb.1:
4249+
; X87-NEXT: fstp %st(0)
4250+
; X87-NEXT: fldz
4251+
; X87-NEXT: fxch %st(1)
4252+
; X87-NEXT: .LBB57_2:
4253+
; X87-NEXT: fstp %st(1)
4254+
; X87-NEXT: wait
4255+
; X87-NEXT: retl
4256+
;
4257+
; X87-CMOV-LABEL: fcmp_select_ogt:
4258+
; X87-CMOV: # %bb.0:
4259+
; X87-CMOV-NEXT: flds {{[0-9]+}}(%esp)
4260+
; X87-CMOV-NEXT: flds {{[0-9]+}}(%esp)
4261+
; X87-CMOV-NEXT: fcomi %st(1), %st
4262+
; X87-CMOV-NEXT: fxch %st(1)
4263+
; X87-CMOV-NEXT: fcmovnbe %st(1), %st
4264+
; X87-CMOV-NEXT: fstp %st(1)
4265+
; X87-CMOV-NEXT: wait
4266+
; X87-CMOV-NEXT: retl
4267+
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(
4268+
float %f1, float %f2, metadata !"ogt",
4269+
metadata !"fpexcept.strict")
4270+
%res = select i1 %cond, float %f1, float %f2
4271+
ret float %res
4272+
}
4273+
4274+
define double @fcmp_select_ule(double %f1, double %f2) #0 {
4275+
; SSE-32-LABEL: fcmp_select_ule:
4276+
; SSE-32: # %bb.0:
4277+
; SSE-32-NEXT: pushl %ebp
4278+
; SSE-32-NEXT: movl %esp, %ebp
4279+
; SSE-32-NEXT: andl $-8, %esp
4280+
; SSE-32-NEXT: subl $8, %esp
4281+
; SSE-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
4282+
; SSE-32-NEXT: minsd 8(%ebp), %xmm0
4283+
; SSE-32-NEXT: movsd %xmm0, (%esp)
4284+
; SSE-32-NEXT: fldl (%esp)
4285+
; SSE-32-NEXT: wait
4286+
; SSE-32-NEXT: movl %ebp, %esp
4287+
; SSE-32-NEXT: popl %ebp
4288+
; SSE-32-NEXT: retl
4289+
;
4290+
; SSE-64-LABEL: fcmp_select_ule:
4291+
; SSE-64: # %bb.0:
4292+
; SSE-64-NEXT: minsd %xmm0, %xmm1
4293+
; SSE-64-NEXT: movapd %xmm1, %xmm0
4294+
; SSE-64-NEXT: retq
4295+
;
4296+
; AVX-32-LABEL: fcmp_select_ule:
4297+
; AVX-32: # %bb.0:
4298+
; AVX-32-NEXT: pushl %ebp
4299+
; AVX-32-NEXT: movl %esp, %ebp
4300+
; AVX-32-NEXT: andl $-8, %esp
4301+
; AVX-32-NEXT: subl $8, %esp
4302+
; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
4303+
; AVX-32-NEXT: vminsd 8(%ebp), %xmm0, %xmm0
4304+
; AVX-32-NEXT: vmovsd %xmm0, (%esp)
4305+
; AVX-32-NEXT: fldl (%esp)
4306+
; AVX-32-NEXT: wait
4307+
; AVX-32-NEXT: movl %ebp, %esp
4308+
; AVX-32-NEXT: popl %ebp
4309+
; AVX-32-NEXT: retl
4310+
;
4311+
; AVX-64-LABEL: fcmp_select_ule:
4312+
; AVX-64: # %bb.0:
4313+
; AVX-64-NEXT: vminsd %xmm0, %xmm1, %xmm0
4314+
; AVX-64-NEXT: retq
4315+
;
4316+
; X87-LABEL: fcmp_select_ule:
4317+
; X87: # %bb.0:
4318+
; X87-NEXT: fldl {{[0-9]+}}(%esp)
4319+
; X87-NEXT: fldl {{[0-9]+}}(%esp)
4320+
; X87-NEXT: fcom %st(1)
4321+
; X87-NEXT: wait
4322+
; X87-NEXT: fnstsw %ax
4323+
; X87-NEXT: # kill: def $ah killed $ah killed $ax
4324+
; X87-NEXT: sahf
4325+
; X87-NEXT: jbe .LBB58_2
4326+
; X87-NEXT: # %bb.1:
4327+
; X87-NEXT: fstp %st(0)
4328+
; X87-NEXT: fldz
4329+
; X87-NEXT: fxch %st(1)
4330+
; X87-NEXT: .LBB58_2:
4331+
; X87-NEXT: fstp %st(1)
4332+
; X87-NEXT: wait
4333+
; X87-NEXT: retl
4334+
;
4335+
; X87-CMOV-LABEL: fcmp_select_ule:
4336+
; X87-CMOV: # %bb.0:
4337+
; X87-CMOV-NEXT: fldl {{[0-9]+}}(%esp)
4338+
; X87-CMOV-NEXT: fldl {{[0-9]+}}(%esp)
4339+
; X87-CMOV-NEXT: fcomi %st(1), %st
4340+
; X87-CMOV-NEXT: fxch %st(1)
4341+
; X87-CMOV-NEXT: fcmovbe %st(1), %st
4342+
; X87-CMOV-NEXT: fstp %st(1)
4343+
; X87-CMOV-NEXT: wait
4344+
; X87-CMOV-NEXT: retl
4345+
%cond = call i1 @llvm.experimental.constrained.fcmps.f64(
4346+
double %f1, double %f2, metadata !"ule",
4347+
metadata !"fpexcept.strict")
4348+
%res = select i1 %cond, double %f1, double %f2
4349+
ret double %res
4350+
}
4351+
4352+
attributes #0 = { nounwind strictfp }
42064353

42074354
declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata)
42084355
declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata)

0 commit comments

Comments
 (0)