-
Notifications
You must be signed in to change notification settings - Fork 14.2k
[X86] Support lowering of FMINIMUMNUM/FMAXIMUMNUM #121464
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-backend-x86 Author: Phoebe Wang (phoebewang) ChangesPatch is 103.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121464.diff 4 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index db21e708970648..de1bec1c130d20 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -402,6 +402,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::FMAXNUM_IEEE:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
+ case ISD::FMINIMUMNUM:
+ case ISD::FMAXIMUMNUM:
case ISD::FCOPYSIGN:
case ISD::FSQRT:
case ISD::FSIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 107454a92e356c..780eba16c9c498 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -149,6 +149,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FMAXNUM_IEEE:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
+ case ISD::FMINIMUMNUM:
+ case ISD::FMAXIMUMNUM:
case ISD::FLDEXP:
case ISD::ABDS:
case ISD::ABDU:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a0514e93d6598b..219674b1e84131 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -623,6 +623,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMAXNUM, VT, Action);
setOperationAction(ISD::FMINIMUM, VT, Action);
setOperationAction(ISD::FMAXIMUM, VT, Action);
+ setOperationAction(ISD::FMINIMUMNUM, VT, Action);
+ setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
setOperationAction(ISD::FSIN, VT, Action);
setOperationAction(ISD::FCOS, VT, Action);
setOperationAction(ISD::FSINCOS, VT, Action);
@@ -1066,6 +1068,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
@@ -1108,6 +1112,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::FMAXIMUM, VT, Custom);
setOperationAction(ISD::FMINIMUM, VT, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
}
for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
@@ -1473,6 +1479,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMAXIMUM, VT, Custom);
setOperationAction(ISD::FMINIMUM, VT, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
setOperationAction(ISD::FCANONICALIZE, VT, Custom);
}
@@ -1818,6 +1826,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FMAXIMUM, VT, Custom);
setOperationAction(ISD::FMINIMUM, VT, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
@@ -2289,6 +2299,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
@@ -2336,6 +2348,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
}
if (Subtarget.hasVLX()) {
@@ -2383,9 +2397,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
}
}
@@ -2444,6 +2462,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::FMINIMUM, VT, Custom);
setOperationAction(ISD::FMAXIMUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
}
if (Subtarget.hasAVX10_2_512()) {
setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
@@ -2455,6 +2475,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
}
for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
setCondCodeAction(ISD::SETOEQ, VT, Custom);
@@ -28839,13 +28861,15 @@ static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
- "Expected FMAXIMUM or FMINIMUM opcode");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = Op.getValueType();
SDValue X = Op.getOperand(0);
SDValue Y = Op.getOperand(1);
SDLoc DL(Op);
+ bool IsMaxOp =
+ Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
+ bool IsNum =
+ Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
unsigned Opc = 0;
if (VT.isVector())
@@ -28855,7 +28879,7 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
if (Opc) {
SDValue Imm =
- DAG.getTargetConstant(Op.getOpcode() == ISD::FMAXIMUM, DL, MVT::i32);
+ DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
}
}
@@ -28865,7 +28889,7 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
APInt OppositeZero = PreferredZero;
EVT IVT = VT.changeTypeToInteger();
X86ISD::NodeType MinMaxOp;
- if (Op.getOpcode() == ISD::FMAXIMUM) {
+ if (IsMaxOp) {
MinMaxOp = X86ISD::FMAX;
OppositeZero.setSignBit();
} else {
@@ -28995,7 +29019,9 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
return MinMax;
- SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
+ SDValue IsNaN =
+ DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
+
return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
}
@@ -33253,6 +33279,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
+ case ISD::FMINIMUMNUM:
+ case ISD::FMAXIMUMNUM:
return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
case ISD::ABDS:
@@ -45994,6 +46022,8 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
case ISD::FMAXNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
+ case ISD::FMAXIMUMNUM:
+ case ISD::FMINIMUMNUM:
case X86ISD::FMAX:
case X86ISD::FMIN:
case ISD::FABS: // Begin 1 operand
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
new file mode 100644
index 00000000000000..b183994280a474
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -0,0 +1,2553 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86
+
+declare float @llvm.maximumnum.f32(float, float)
+declare double @llvm.maximumnum.f64(double, double)
+declare float @llvm.minimumnum.f32(float, float)
+declare double @llvm.minimumnum.f64(double, double)
+declare <2 x double> @llvm.minimumnum.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>)
+declare <4 x half> @llvm.maximumnum.v4f16(<4 x half>, <4 x half>)
+declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+
+;
+; fmaximumnum
+;
+
+define float @test_fmaximumnum(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: js .LBB0_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: .LBB0_2:
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: cmpordss %xmm3, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: andps %xmm3, %xmm4
+; SSE2-NEXT: js .LBB0_4
+; SSE2-NEXT: # %bb.3:
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: .LBB0_4:
+; SSE2-NEXT: maxss %xmm1, %xmm3
+; SSE2-NEXT: andnps %xmm3, %xmm0
+; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: test_fmaximumnum:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: testl %eax, %eax
+; AVX1-NEXT: js .LBB0_1
+; AVX1-NEXT: # %bb.2:
+; AVX1-NEXT: vmovdqa %xmm0, %xmm2
+; AVX1-NEXT: jmp .LBB0_3
+; AVX1-NEXT: .LBB0_1:
+; AVX1-NEXT: vmovdqa %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa %xmm0, %xmm1
+; AVX1-NEXT: .LBB0_3:
+; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test_fmaximumnum:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: testl %eax, %eax
+; AVX512-NEXT: sets %al
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovdqa %xmm0, %xmm2
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vminmaxss $17, %xmm1, %xmm0
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum:
+; X86: # %bb.0:
+; X86-NEXT: pushl %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd %xmm2, %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: js .LBB0_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: vmovdqa %xmm2, %xmm1
+; X86-NEXT: jmp .LBB0_3
+; X86-NEXT: .LBB0_1:
+; X86-NEXT: vmovdqa %xmm0, %xmm1
+; X86-NEXT: vmovdqa %xmm2, %xmm0
+; X86-NEXT: .LBB0_3:
+; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: flds (%esp)
+; X86-NEXT: popl %eax
+; X86-NEXT: retl
+ %1 = tail call float @llvm.maximumnum.f32(float %x, float %y)
+ ret float %1
+}
+
+define <4 x float> @test_fmaximumnum_scalarize(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; SSE2-LABEL: test_fmaximumnum_scalarize:
+; SSE2: # %bb.0:
+; SSE2-NEXT: maxps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fmaximumnum_scalarize:
+; AVX: # %bb.0:
+; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_scalarize:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vminmaxps $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_scalarize:
+; X86: # %bb.0:
+; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; X86-NEXT: retl
+ %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y)
+ ret <4 x float> %r
+}
+
+define float @test_fmaximumnum_nan0(float %x, float %y) {
+; SSE2-LABEL: test_fmaximumnum_nan0:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fmaximumnum_nan0:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nan0:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_nan0:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: retl
+ %1 = tail call float @llvm.maximumnum.f32(float 0x7fff000000000000, float %y)
+ ret float %1
+}
+
+define float @test_fmaximumnum_nan1(float %x, float %y) {
+; SSE2-LABEL: test_fmaximumnum_nan1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fmaximumnum_nan1:
+; AVX: # %bb.0:
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nan1:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_nan1:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: retl
+ %1 = tail call float @llvm.maximumnum.f32(float %x, float 0x7fff000000000000)
+ ret float %1
+}
+
+define float @test_fmaximumnum_nnan(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_nnan:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: addss %xmm1, %xmm2
+; SSE2-NEXT: subss %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: js .LBB4_1
+; SSE2-NEXT: # %bb.2:
+; SSE2-NEXT: maxss %xmm2, %xmm0
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB4_1:
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: maxss %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: test_fmaximumnum_nnan:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: testl %eax, %eax
+; AVX1-NEXT: js .LBB4_1
+; AVX1-NEXT: # %bb.2:
+; AVX1-NEXT: vmaxss %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB4_1:
+; AVX1-NEXT: vmovaps %xmm0, %xmm1
+; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX512F-LABEL: test_fmaximumnum_nnan:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: testl %eax, %eax
+; AVX512F-NEXT: sets %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovaps %xmm2, %xmm1
+; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512F-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_fmaximumnum_nnan:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vaddss %xmm1, %xmm0, %xmm2
+; AVX512DQ-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0)
+; AVX512DQ-NEXT: kmovw %k0, %k1
+; AVX512DQ-NEXT: vmovaps %xmm2, %xmm1
+; AVX512DQ-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512DQ-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512DQ-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nnan:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vaddss %xmm1, %xmm0, %xmm2
+; AVX10_2-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT: vminmaxss $17, %xmm0, %xmm2
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_nnan:
+; X86: # %bb.0:
+; X86-NEXT: pushl %eax
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vaddss %xmm0, %xmm2, %xmm1
+; X86-NEXT: vsubss %xmm0, %xmm2, %xmm0
+; X86-NEXT: vmovd %xmm1, %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: js .LBB4_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: vmovaps %xmm1, %xmm2
+; X86-NEXT: jmp .LBB4_3
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: vmovaps %xmm0, %xmm2
+; X86-NEXT: vmovaps %xmm1, %xmm0
+; X86-NEXT: .LBB4_3:
+; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: flds (%esp)
+; X86-NEXT: popl %eax
+; X86-NEXT: retl
+ %1 = fadd nnan float %x, %y
+ %2 = fsub nnan float %x, %y
+ %3 = tail call float @llvm.maximumnum.f32(float %1, float %2)
+ ret float %3
+}
+
+define double @test_fmaximumnum_zero0(double %x, double %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_zero0:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: cmpordsd %xmm1, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: andpd %xmm1, %xmm2
+; SSE2-NEXT: xorpd %xmm3, %xmm3
+; SSE2-NEXT: maxsd %xmm3, %xmm1
+; SSE2-NEXT: andnpd %xmm1, %xmm0
+; SSE2-NEXT: orpd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: test_fmaximumnum_zero0:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vcmpordsd %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test_fmaximumnum_zero0:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vcmpordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_zero0:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; AVX10_2-NEXT: vminmaxsd $17, %xmm0, %xmm1
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_zero0:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
+; X86-NEXT: vcmpordsd %xmm0, %xmm0, %xmm2
+; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovlpd %xmm0, (%esp)
+; X86-NEXT: fldl (%esp)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %1 = tail call double @llvm.maximumnum.f64(double 0.0, double %y)
+ ret double %1
+}
+
+define double @test_fmaximumnum_zero1(double %x, double %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_zero1:
+; SSE2: ...
[truncated]
|
@llvm/pr-subscribers-llvm-selectiondag Author: Phoebe Wang (phoebewang) ChangesPatch is 103.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121464.diff 4 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index db21e708970648..de1bec1c130d20 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -402,6 +402,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::FMAXNUM_IEEE:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
+ case ISD::FMINIMUMNUM:
+ case ISD::FMAXIMUMNUM:
case ISD::FCOPYSIGN:
case ISD::FSQRT:
case ISD::FSIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 107454a92e356c..780eba16c9c498 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -149,6 +149,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FMAXNUM_IEEE:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
+ case ISD::FMINIMUMNUM:
+ case ISD::FMAXIMUMNUM:
case ISD::FLDEXP:
case ISD::ABDS:
case ISD::ABDU:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a0514e93d6598b..219674b1e84131 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -623,6 +623,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMAXNUM, VT, Action);
setOperationAction(ISD::FMINIMUM, VT, Action);
setOperationAction(ISD::FMAXIMUM, VT, Action);
+ setOperationAction(ISD::FMINIMUMNUM, VT, Action);
+ setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
setOperationAction(ISD::FSIN, VT, Action);
setOperationAction(ISD::FCOS, VT, Action);
setOperationAction(ISD::FSINCOS, VT, Action);
@@ -1066,6 +1068,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
@@ -1108,6 +1112,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::FMAXIMUM, VT, Custom);
setOperationAction(ISD::FMINIMUM, VT, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
}
for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
@@ -1473,6 +1479,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMAXIMUM, VT, Custom);
setOperationAction(ISD::FMINIMUM, VT, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
setOperationAction(ISD::FCANONICALIZE, VT, Custom);
}
@@ -1818,6 +1826,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FMAXIMUM, VT, Custom);
setOperationAction(ISD::FMINIMUM, VT, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
@@ -2289,6 +2299,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
@@ -2336,6 +2348,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
}
if (Subtarget.hasVLX()) {
@@ -2383,9 +2397,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
}
}
@@ -2444,6 +2462,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::FMINIMUM, VT, Custom);
setOperationAction(ISD::FMAXIMUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
}
if (Subtarget.hasAVX10_2_512()) {
setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
@@ -2455,6 +2475,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
}
for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
setCondCodeAction(ISD::SETOEQ, VT, Custom);
@@ -28839,13 +28861,15 @@ static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
- "Expected FMAXIMUM or FMINIMUM opcode");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = Op.getValueType();
SDValue X = Op.getOperand(0);
SDValue Y = Op.getOperand(1);
SDLoc DL(Op);
+ bool IsMaxOp =
+ Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
+ bool IsNum =
+ Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
unsigned Opc = 0;
if (VT.isVector())
@@ -28855,7 +28879,7 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
if (Opc) {
SDValue Imm =
- DAG.getTargetConstant(Op.getOpcode() == ISD::FMAXIMUM, DL, MVT::i32);
+ DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
}
}
@@ -28865,7 +28889,7 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
APInt OppositeZero = PreferredZero;
EVT IVT = VT.changeTypeToInteger();
X86ISD::NodeType MinMaxOp;
- if (Op.getOpcode() == ISD::FMAXIMUM) {
+ if (IsMaxOp) {
MinMaxOp = X86ISD::FMAX;
OppositeZero.setSignBit();
} else {
@@ -28995,7 +29019,9 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
return MinMax;
- SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
+ SDValue IsNaN =
+ DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
+
return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
}
@@ -33253,6 +33279,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
+ case ISD::FMINIMUMNUM:
+ case ISD::FMAXIMUMNUM:
return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
case ISD::ABDS:
@@ -45994,6 +46022,8 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
case ISD::FMAXNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
+ case ISD::FMAXIMUMNUM:
+ case ISD::FMINIMUMNUM:
case X86ISD::FMAX:
case X86ISD::FMIN:
case ISD::FABS: // Begin 1 operand
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
new file mode 100644
index 00000000000000..b183994280a474
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -0,0 +1,2553 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86
+
+declare float @llvm.maximumnum.f32(float, float)
+declare double @llvm.maximumnum.f64(double, double)
+declare float @llvm.minimumnum.f32(float, float)
+declare double @llvm.minimumnum.f64(double, double)
+declare <2 x double> @llvm.minimumnum.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>)
+declare <4 x half> @llvm.maximumnum.v4f16(<4 x half>, <4 x half>)
+declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+
+;
+; fmaximumnum
+;
+
+define float @test_fmaximumnum(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: js .LBB0_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: .LBB0_2:
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: cmpordss %xmm3, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: andps %xmm3, %xmm4
+; SSE2-NEXT: js .LBB0_4
+; SSE2-NEXT: # %bb.3:
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: .LBB0_4:
+; SSE2-NEXT: maxss %xmm1, %xmm3
+; SSE2-NEXT: andnps %xmm3, %xmm0
+; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: test_fmaximumnum:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: testl %eax, %eax
+; AVX1-NEXT: js .LBB0_1
+; AVX1-NEXT: # %bb.2:
+; AVX1-NEXT: vmovdqa %xmm0, %xmm2
+; AVX1-NEXT: jmp .LBB0_3
+; AVX1-NEXT: .LBB0_1:
+; AVX1-NEXT: vmovdqa %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa %xmm0, %xmm1
+; AVX1-NEXT: .LBB0_3:
+; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test_fmaximumnum:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: testl %eax, %eax
+; AVX512-NEXT: sets %al
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovdqa %xmm0, %xmm2
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vminmaxss $17, %xmm1, %xmm0
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum:
+; X86: # %bb.0:
+; X86-NEXT: pushl %eax
+; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vmovd %xmm2, %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: js .LBB0_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: vmovdqa %xmm2, %xmm1
+; X86-NEXT: jmp .LBB0_3
+; X86-NEXT: .LBB0_1:
+; X86-NEXT: vmovdqa %xmm0, %xmm1
+; X86-NEXT: vmovdqa %xmm2, %xmm0
+; X86-NEXT: .LBB0_3:
+; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
+; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: flds (%esp)
+; X86-NEXT: popl %eax
+; X86-NEXT: retl
+ %1 = tail call float @llvm.maximumnum.f32(float %x, float %y)
+ ret float %1
+}
+
+define <4 x float> @test_fmaximumnum_scalarize(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
+; SSE2-LABEL: test_fmaximumnum_scalarize:
+; SSE2: # %bb.0:
+; SSE2-NEXT: maxps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fmaximumnum_scalarize:
+; AVX: # %bb.0:
+; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_scalarize:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vminmaxps $17, %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_scalarize:
+; X86: # %bb.0:
+; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; X86-NEXT: retl
+ %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y)
+ ret <4 x float> %r
+}
+
+define float @test_fmaximumnum_nan0(float %x, float %y) {
+; SSE2-LABEL: test_fmaximumnum_nan0:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fmaximumnum_nan0:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nan0:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_nan0:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: retl
+ %1 = tail call float @llvm.maximumnum.f32(float 0x7fff000000000000, float %y)
+ ret float %1
+}
+
+define float @test_fmaximumnum_nan1(float %x, float %y) {
+; SSE2-LABEL: test_fmaximumnum_nan1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fmaximumnum_nan1:
+; AVX: # %bb.0:
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nan1:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_nan1:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: retl
+ %1 = tail call float @llvm.maximumnum.f32(float %x, float 0x7fff000000000000)
+ ret float %1
+}
+
+define float @test_fmaximumnum_nnan(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_nnan:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: addss %xmm1, %xmm2
+; SSE2-NEXT: subss %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: js .LBB4_1
+; SSE2-NEXT: # %bb.2:
+; SSE2-NEXT: maxss %xmm2, %xmm0
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB4_1:
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: maxss %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: test_fmaximumnum_nnan:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: testl %eax, %eax
+; AVX1-NEXT: js .LBB4_1
+; AVX1-NEXT: # %bb.2:
+; AVX1-NEXT: vmaxss %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB4_1:
+; AVX1-NEXT: vmovaps %xmm0, %xmm1
+; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX512F-LABEL: test_fmaximumnum_nnan:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: testl %eax, %eax
+; AVX512F-NEXT: sets %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovaps %xmm2, %xmm1
+; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512F-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_fmaximumnum_nnan:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vaddss %xmm1, %xmm0, %xmm2
+; AVX512DQ-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0)
+; AVX512DQ-NEXT: kmovw %k0, %k1
+; AVX512DQ-NEXT: vmovaps %xmm2, %xmm1
+; AVX512DQ-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512DQ-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512DQ-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_nnan:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vaddss %xmm1, %xmm0, %xmm2
+; AVX10_2-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX10_2-NEXT: vminmaxss $17, %xmm0, %xmm2
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_nnan:
+; X86: # %bb.0:
+; X86-NEXT: pushl %eax
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vaddss %xmm0, %xmm2, %xmm1
+; X86-NEXT: vsubss %xmm0, %xmm2, %xmm0
+; X86-NEXT: vmovd %xmm1, %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: js .LBB4_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: vmovaps %xmm1, %xmm2
+; X86-NEXT: jmp .LBB4_3
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: vmovaps %xmm0, %xmm2
+; X86-NEXT: vmovaps %xmm1, %xmm0
+; X86-NEXT: .LBB4_3:
+; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm0
+; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: flds (%esp)
+; X86-NEXT: popl %eax
+; X86-NEXT: retl
+ %1 = fadd nnan float %x, %y
+ %2 = fsub nnan float %x, %y
+ %3 = tail call float @llvm.maximumnum.f32(float %1, float %2)
+ ret float %3
+}
+
+define double @test_fmaximumnum_zero0(double %x, double %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_zero0:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: cmpordsd %xmm1, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: andpd %xmm1, %xmm2
+; SSE2-NEXT: xorpd %xmm3, %xmm3
+; SSE2-NEXT: maxsd %xmm3, %xmm1
+; SSE2-NEXT: andnpd %xmm1, %xmm0
+; SSE2-NEXT: orpd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: test_fmaximumnum_zero0:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vcmpordsd %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test_fmaximumnum_zero0:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vcmpordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_zero0:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; AVX10_2-NEXT: vminmaxsd $17, %xmm0, %xmm1
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_zero0:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
+; X86-NEXT: vcmpordsd %xmm0, %xmm0, %xmm2
+; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovlpd %xmm0, (%esp)
+; X86-NEXT: fldl (%esp)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %1 = tail call double @llvm.maximumnum.f64(double 0.0, double %y)
+ ret double %1
+}
+
+define double @test_fmaximumnum_zero1(double %x, double %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_zero1:
+; SSE2: ...
[truncated]
|
@arsenm I saw 3 failures in check-all, could you please take a look? Thanks!
|
You can test this locally with the following command:git-clang-format --diff 096551537b2a747a3387726ca618ceeb3950e9bc 204186cadc67b9df6b79cab6ab71d012a8416584 --extensions cpp -- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp llvm/lib/Target/X86/X86ISelLowering.cpp View the diff from clang-format here.diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 219674b1e8..c981fadd9f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1068,8 +1068,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
- setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
- setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
@@ -1479,8 +1479,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMAXIMUM, VT, Custom);
setOperationAction(ISD::FMINIMUM, VT, Custom);
- setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
- setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
setOperationAction(ISD::FCANONICALIZE, VT, Custom);
}
@@ -2299,8 +2299,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
- setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
- setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
|
What happens to the codegen if you regenerate the checks? |
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 | ||
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 | ||
; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 | ||
; GFX9-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is worse, the expansion is now fully scalarized. Previously it decomposed into the legal vector ops
; AARCH64-NEXT: fmaxnm v0.2d, v0.2d, v1.2d | ||
; AARCH64-NEXT: mov d2, v1.d[1] | ||
; AARCH64-NEXT: mov d3, v0.d[1] | ||
; AARCH64-NEXT: fmaxnm d0, d0, d1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks similar to the AMDGPU regression, this is now full scalarization
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@arsenm No diff on AArch64 now, please take a second look at changes on AMDGPU. Thanks!
@@ -1081,6 +1083,13 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { | |||
case ISD::FMAXIMUM: | |||
Results.push_back(TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)); | |||
return; | |||
case ISD::FMINIMUMNUM: | |||
case ISD::FMAXIMUMNUM: | |||
if (SDValue Expanded = TLI.expandFMINIMUMNUM_FMAXIMUMNUM(Node, DAG)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
expandFMINIMUMNUM_FMAXIMUMNUM cannot fail, it's unnecessary to check this here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
@phoebewang can you have a check of this patch about #123263 |
Sure. I don't think #123263 is a real problem. |
No description provided.