[RISCV] Use vnclip(u) to handle fp_to_(s/u)int_sat that needs additional narrowing. #100071
Conversation
…narrowing. If vncvt doesn't produce the destination type directly, use vnclip to do additional narrowing with saturation.
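As a rough illustration of what changes, here is a minimal standalone IR reproducer for the f64-to-i8 case (the function name is made up for this sketch; the intrinsic is the same one exercised by the tests below). Narrowing from 64-bit to 8-bit elements takes three halvings, so after the single vfncvt.rtz.x.f.w the lowering can now emit two vnclip.wi instructions instead of giving up and scalarizing:
; Hypothetical standalone reproducer (same intrinsic as the tests in
; fixed-vectors-fp2i-sat.ll); f64 -> i8 needs 64 -> 32 -> 16 -> 8 narrowing.
define <2 x i8> @fptosi_sat_v2f64_to_v2i8(<2 x double> %a) {
  ; With this patch, expected to lower to: vfncvt.rtz.x.f.w (f64 -> i32),
  ; vnclip.wi twice (i32 -> i16 -> i8), and a vmerge to zero the NaN lanes.
  %r = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> %a)
  ret <2 x i8> %r
}
declare <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double>)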
@llvm/pr-subscribers-backend-risc-v
Author: Craig Topper (topperc)
Changes: If vncvt doesn't produce the destination type directly, use vnclip to do additional narrowing with saturation.
Patch is 29.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100071.diff
5 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 37b1131d2f8a3..89dbef1bbee50 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3752,6 +3752,7 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
}
break;
case SPF_FMAXNUM:
+ dbgs() << "ctopper " << SPR.NaNBehavior << "\n";
switch (SPR.NaNBehavior) {
case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
case SPNB_RETURNS_NAN: break;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 823fb428472ef..66c28dafa2abf 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2955,10 +2955,6 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
if (SatVT != DstEltVT)
return SDValue();
- // FIXME: Don't support narrowing by more than 1 steps for now.
- if (SrcEltSize > (2 * DstEltSize))
- return SDValue();
-
MVT DstContainerVT = DstVT;
MVT SrcContainerVT = SrcVT;
if (DstVT.isFixedLengthVector()) {
@@ -2986,9 +2982,29 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
Src = DAG.getNode(RISCVISD::FP_EXTEND_VL, DL, InterVT, Src, Mask, VL);
}
+ MVT CvtContainerVT = DstContainerVT;
+ MVT CvtEltVT = DstEltVT;
+ if (SrcEltSize > (2 * DstEltSize)) {
+ CvtEltVT = MVT::getIntegerVT(SrcEltVT.getSizeInBits() / 2);
+ CvtContainerVT = CvtContainerVT.changeVectorElementType(CvtEltVT);
+ }
+
unsigned RVVOpc =
IsSigned ? RISCVISD::VFCVT_RTZ_X_F_VL : RISCVISD::VFCVT_RTZ_XU_F_VL;
- SDValue Res = DAG.getNode(RVVOpc, DL, DstContainerVT, Src, Mask, VL);
+ SDValue Res = DAG.getNode(RVVOpc, DL, CvtContainerVT, Src, Mask, VL);
+
+ while (CvtContainerVT != DstContainerVT) {
+ CvtEltVT = MVT::getIntegerVT(CvtEltVT.getSizeInBits() / 2);
+ CvtContainerVT = CvtContainerVT.changeVectorElementType(CvtEltVT);
+ // Rounding mode here is arbitrary since we aren't shifting out any bits.
+ unsigned ClipOpc = IsSigned ? RISCVISD::VNCLIP_VL : RISCVISD::VNCLIPU_VL;
+ Res = DAG.getNode(
+ ClipOpc, DL, CvtContainerVT,
+ {Res, DAG.getConstant(0, DL, CvtContainerVT),
+ DAG.getUNDEF(CvtContainerVT), Mask,
+ DAG.getTargetConstant(RISCVVXRndMode::RNU, DL, Subtarget.getXLenVT()),
+ VL});
+ }
SDValue SplatZero = DAG.getNode(
RISCVISD::VMV_V_X_VL, DL, DstContainerVT, DAG.getUNDEF(DstContainerVT),
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
index bc46e7d264bc0..d92dc3edecb0b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
@@ -187,63 +187,20 @@ define void @fp2ui_v2f16_v2i64(ptr %x, ptr %y) {
declare <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half>)
define void @fp2si_v2f64_v2i8(ptr %x, ptr %y) {
-; RV32-LABEL: fp2si_v2f64_v2i8:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: vslidedown.vi v9, v8, 1
-; RV32-NEXT: vfmv.f.s fa5, v9
-; RV32-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32-NEXT: fld fa4, %lo(.LCPI10_0)(a0)
-; RV32-NEXT: lui a0, %hi(.LCPI10_1)
-; RV32-NEXT: fld fa3, %lo(.LCPI10_1)(a0)
-; RV32-NEXT: feq.d a0, fa5, fa5
-; RV32-NEXT: neg a0, a0
-; RV32-NEXT: fmax.d fa5, fa5, fa4
-; RV32-NEXT: fmin.d fa5, fa5, fa3
-; RV32-NEXT: fcvt.w.d a2, fa5, rtz
-; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: feq.d a2, fa5, fa5
-; RV32-NEXT: neg a2, a2
-; RV32-NEXT: fmax.d fa5, fa5, fa4
-; RV32-NEXT: fmin.d fa5, fa5, fa3
-; RV32-NEXT: fcvt.w.d a3, fa5, rtz
-; RV32-NEXT: and a2, a2, a3
-; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vse8.v v8, (a1)
-; RV32-NEXT: ret
-;
-; RV64-LABEL: fp2si_v2f64_v2i8:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vslidedown.vi v9, v8, 1
-; RV64-NEXT: vfmv.f.s fa5, v9
-; RV64-NEXT: lui a0, %hi(.LCPI10_0)
-; RV64-NEXT: fld fa4, %lo(.LCPI10_0)(a0)
-; RV64-NEXT: lui a0, %hi(.LCPI10_1)
-; RV64-NEXT: fld fa3, %lo(.LCPI10_1)(a0)
-; RV64-NEXT: feq.d a0, fa5, fa5
-; RV64-NEXT: neg a0, a0
-; RV64-NEXT: fmax.d fa5, fa5, fa4
-; RV64-NEXT: fmin.d fa5, fa5, fa3
-; RV64-NEXT: fcvt.l.d a2, fa5, rtz
-; RV64-NEXT: and a0, a0, a2
-; RV64-NEXT: vfmv.f.s fa5, v8
-; RV64-NEXT: feq.d a2, fa5, fa5
-; RV64-NEXT: negw a2, a2
-; RV64-NEXT: fmax.d fa5, fa5, fa4
-; RV64-NEXT: fmin.d fa5, fa5, fa3
-; RV64-NEXT: fcvt.l.d a3, fa5, rtz
-; RV64-NEXT: and a2, a2, a3
-; RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
-; RV64-NEXT: vmv.v.x v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vse8.v v8, (a1)
-; RV64-NEXT: ret
+; CHECK-LABEL: fp2si_v2f64_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vmfne.vv v0, v8, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnclip.wi v8, v9, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vnclip.wi v8, v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT: vse8.v v8, (a1)
+; CHECK-NEXT: ret
%a = load <2 x double>, ptr %x
%d = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> %a)
store <2 x i8> %d, ptr %y
@@ -252,49 +209,20 @@ define void @fp2si_v2f64_v2i8(ptr %x, ptr %y) {
declare <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double>)
define void @fp2ui_v2f64_v2i8(ptr %x, ptr %y) {
-; RV32-LABEL: fp2ui_v2f64_v2i8:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: vslidedown.vi v9, v8, 1
-; RV32-NEXT: lui a0, %hi(.LCPI11_0)
-; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0)
-; RV32-NEXT: vfmv.f.s fa4, v9
-; RV32-NEXT: fcvt.d.w fa3, zero
-; RV32-NEXT: fmax.d fa4, fa4, fa3
-; RV32-NEXT: fmin.d fa4, fa4, fa5
-; RV32-NEXT: fcvt.wu.d a0, fa4, rtz
-; RV32-NEXT: vfmv.f.s fa4, v8
-; RV32-NEXT: fmax.d fa4, fa4, fa3
-; RV32-NEXT: fmin.d fa5, fa4, fa5
-; RV32-NEXT: fcvt.wu.d a2, fa5, rtz
-; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vse8.v v8, (a1)
-; RV32-NEXT: ret
-;
-; RV64-LABEL: fp2ui_v2f64_v2i8:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vslidedown.vi v9, v8, 1
-; RV64-NEXT: lui a0, %hi(.LCPI11_0)
-; RV64-NEXT: fld fa5, %lo(.LCPI11_0)(a0)
-; RV64-NEXT: vfmv.f.s fa4, v9
-; RV64-NEXT: fmv.d.x fa3, zero
-; RV64-NEXT: fmax.d fa4, fa4, fa3
-; RV64-NEXT: fmin.d fa4, fa4, fa5
-; RV64-NEXT: fcvt.lu.d a0, fa4, rtz
-; RV64-NEXT: vfmv.f.s fa4, v8
-; RV64-NEXT: fmax.d fa4, fa4, fa3
-; RV64-NEXT: fmin.d fa5, fa4, fa5
-; RV64-NEXT: fcvt.lu.d a2, fa5, rtz
-; RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
-; RV64-NEXT: vmv.v.x v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vse8.v v8, (a1)
-; RV64-NEXT: ret
+; CHECK-LABEL: fp2ui_v2f64_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vmfne.vv v0, v8, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v9, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vnclipu.wi v8, v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT: vse8.v v8, (a1)
+; CHECK-NEXT: ret
%a = load <2 x double>, ptr %x
%d = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> %a)
store <2 x i8> %d, ptr %y
@@ -304,203 +232,20 @@ declare <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double>)
define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
;
-; RV32-LABEL: fp2si_v8f64_v8i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -128
-; RV32-NEXT: .cfi_def_cfa_offset 128
-; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 128
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: mv a0, sp
-; RV32-NEXT: vse64.v v8, (a0)
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 1
-; RV32-NEXT: vfmv.f.s fa3, v10
-; RV32-NEXT: lui a0, %hi(.LCPI12_0)
-; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0)
-; RV32-NEXT: lui a0, %hi(.LCPI12_1)
-; RV32-NEXT: fld fa4, %lo(.LCPI12_1)(a0)
-; RV32-NEXT: feq.d a0, fa3, fa3
-; RV32-NEXT: neg a0, a0
-; RV32-NEXT: fmax.d fa3, fa3, fa5
-; RV32-NEXT: fmin.d fa3, fa3, fa4
-; RV32-NEXT: fcvt.w.d a2, fa3, rtz
-; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: vfmv.f.s fa3, v8
-; RV32-NEXT: feq.d a2, fa3, fa3
-; RV32-NEXT: neg a2, a2
-; RV32-NEXT: fmax.d fa3, fa3, fa5
-; RV32-NEXT: fmin.d fa3, fa3, fa4
-; RV32-NEXT: fcvt.w.d a3, fa3, rtz
-; RV32-NEXT: and a2, a2, a3
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a2
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 2
-; RV32-NEXT: vfmv.f.s fa3, v12
-; RV32-NEXT: feq.d a0, fa3, fa3
-; RV32-NEXT: neg a0, a0
-; RV32-NEXT: fmax.d fa3, fa3, fa5
-; RV32-NEXT: fmin.d fa3, fa3, fa4
-; RV32-NEXT: fcvt.w.d a2, fa3, rtz
-; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: vslidedown.vi v8, v8, 3
-; RV32-NEXT: vfmv.f.s fa3, v8
-; RV32-NEXT: feq.d a2, fa3, fa3
-; RV32-NEXT: fmax.d fa3, fa3, fa5
-; RV32-NEXT: fmin.d fa3, fa3, fa4
-; RV32-NEXT: fcvt.w.d a3, fa3, rtz
-; RV32-NEXT: fld fa3, 40(sp)
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vslide1down.vx v8, v10, a0
-; RV32-NEXT: neg a0, a2
-; RV32-NEXT: and a0, a0, a3
-; RV32-NEXT: feq.d a2, fa3, fa3
-; RV32-NEXT: fmax.d fa3, fa3, fa5
-; RV32-NEXT: fmin.d fa3, fa3, fa4
-; RV32-NEXT: fcvt.w.d a3, fa3, rtz
-; RV32-NEXT: fld fa3, 32(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: neg a0, a2
-; RV32-NEXT: and a0, a0, a3
-; RV32-NEXT: feq.d a2, fa3, fa3
-; RV32-NEXT: neg a2, a2
-; RV32-NEXT: fmax.d fa3, fa3, fa5
-; RV32-NEXT: fmin.d fa3, fa3, fa4
-; RV32-NEXT: fcvt.w.d a3, fa3, rtz
-; RV32-NEXT: fld fa3, 48(sp)
-; RV32-NEXT: and a2, a2, a3
-; RV32-NEXT: vmv.v.x v9, a2
-; RV32-NEXT: vslide1down.vx v9, v9, a0
-; RV32-NEXT: feq.d a0, fa3, fa3
-; RV32-NEXT: fmax.d fa3, fa3, fa5
-; RV32-NEXT: fmin.d fa3, fa3, fa4
-; RV32-NEXT: fcvt.w.d a2, fa3, rtz
-; RV32-NEXT: fld fa3, 56(sp)
-; RV32-NEXT: neg a0, a0
-; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: vslide1down.vx v9, v9, a0
-; RV32-NEXT: feq.d a0, fa3, fa3
-; RV32-NEXT: neg a0, a0
-; RV32-NEXT: fmax.d fa5, fa3, fa5
-; RV32-NEXT: fmin.d fa5, fa5, fa4
-; RV32-NEXT: fcvt.w.d a2, fa5, rtz
-; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: vmv.v.i v0, 15
-; RV32-NEXT: vslide1down.vx v9, v9, a0
-; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; RV32-NEXT: vse8.v v9, (a1)
-; RV32-NEXT: addi sp, s0, -128
-; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 128
-; RV32-NEXT: ret
-;
-; RV64-LABEL: fp2si_v8f64_v8i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -128
-; RV64-NEXT: .cfi_def_cfa_offset 128
-; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 128
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: mv a0, sp
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 1
-; RV64-NEXT: vfmv.f.s fa3, v10
-; RV64-NEXT: lui a0, %hi(.LCPI12_0)
-; RV64-NEXT: fld fa5, %lo(.LCPI12_0)(a0)
-; RV64-NEXT: lui a0, %hi(.LCPI12_1)
-; RV64-NEXT: fld fa4, %lo(.LCPI12_1)(a0)
-; RV64-NEXT: feq.d a0, fa3, fa3
-; RV64-NEXT: neg a0, a0
-; RV64-NEXT: fmax.d fa3, fa3, fa5
-; RV64-NEXT: fmin.d fa3, fa3, fa4
-; RV64-NEXT: fcvt.l.d a2, fa3, rtz
-; RV64-NEXT: and a0, a0, a2
-; RV64-NEXT: vfmv.f.s fa3, v8
-; RV64-NEXT: feq.d a2, fa3, fa3
-; RV64-NEXT: negw a2, a2
-; RV64-NEXT: fmax.d fa3, fa3, fa5
-; RV64-NEXT: fmin.d fa3, fa3, fa4
-; RV64-NEXT: fcvt.l.d a3, fa3, rtz
-; RV64-NEXT: and a2, a2, a3
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT: vmv.v.x v10, a2
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 2
-; RV64-NEXT: vfmv.f.s fa3, v12
-; RV64-NEXT: feq.d a0, fa3, fa3
-; RV64-NEXT: neg a0, a0
-; RV64-NEXT: fmax.d fa3, fa3, fa5
-; RV64-NEXT: fmin.d fa3, fa3, fa4
-; RV64-NEXT: fcvt.l.d a2, fa3, rtz
-; RV64-NEXT: and a0, a0, a2
-; RV64-NEXT: vslidedown.vi v8, v8, 3
-; RV64-NEXT: vfmv.f.s fa3, v8
-; RV64-NEXT: feq.d a2, fa3, fa3
-; RV64-NEXT: fmax.d fa3, fa3, fa5
-; RV64-NEXT: fmin.d fa3, fa3, fa4
-; RV64-NEXT: fcvt.l.d a3, fa3, rtz
-; RV64-NEXT: fld fa3, 40(sp)
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV64-NEXT: vslide1down.vx v8, v10, a0
-; RV64-NEXT: neg a0, a2
-; RV64-NEXT: and a0, a0, a3
-; RV64-NEXT: feq.d a2, fa3, fa3
-; RV64-NEXT: fmax.d fa3, fa3, fa5
-; RV64-NEXT: fmin.d fa3, fa3, fa4
-; RV64-NEXT: fcvt.l.d a3, fa3, rtz
-; RV64-NEXT: fld fa3, 32(sp)
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: neg a0, a2
-; RV64-NEXT: and a0, a0, a3
-; RV64-NEXT: feq.d a2, fa3, fa3
-; RV64-NEXT: negw a2, a2
-; RV64-NEXT: fmax.d fa3, fa3, fa5
-; RV64-NEXT: fmin.d fa3, fa3, fa4
-; RV64-NEXT: fcvt.l.d a3, fa3, rtz
-; RV64-NEXT: fld fa3, 48(sp)
-; RV64-NEXT: and a2, a2, a3
-; RV64-NEXT: vmv.v.x v9, a2
-; RV64-NEXT: vslide1down.vx v9, v9, a0
-; RV64-NEXT: feq.d a0, fa3, fa3
-; RV64-NEXT: fmax.d fa3, fa3, fa5
-; RV64-NEXT: fmin.d fa3, fa3, fa4
-; RV64-NEXT: fcvt.l.d a2, fa3, rtz
-; RV64-NEXT: fld fa3, 56(sp)
-; RV64-NEXT: neg a0, a0
-; RV64-NEXT: and a0, a0, a2
-; RV64-NEXT: vslide1down.vx v9, v9, a0
-; RV64-NEXT: feq.d a0, fa3, fa3
-; RV64-NEXT: neg a0, a0
-; RV64-NEXT: fmax.d fa5, fa3, fa5
-; RV64-NEXT: fmin.d fa5, fa5, fa4
-; RV64-NEXT: fcvt.l.d a2, fa5, rtz
-; RV64-NEXT: and a0, a0, a2
-; RV64-NEXT: vmv.v.i v0, 15
-; RV64-NEXT: vslide1down.vx v9, v9, a0
-; RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; RV64-NEXT: vse8.v v9, (a1)
-; RV64-NEXT: addi sp, s0, -128
-; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 128
-; RV64-NEXT: ret
+; CHECK-LABEL: fp2si_v8f64_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vmfne.vv v0, v8, v8
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v8
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vnclip.wi v8, v12, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vnclip.wi v8, v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
+; CHECK-NEXT: vse8.v v8, (a1)
+; CHECK-NEXT: ret
%a = load <8 x double>, ptr %x
%d = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> %a)
store <8 x i8> %d, ptr %y
@@ -510,151 +255,20 @@ declare <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double>)
define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
;
-; RV32-LABEL: fp2ui_v8f64_v8i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -128
-; RV32-NEXT: .cfi_def_cfa_offset 128
-; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 128
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: mv a0, sp
-; RV32-NEXT: vse64.v v8, (a0)
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 1
-; RV32-NEXT: lui a0, %hi(.LCPI13_0)
-; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0)
-; RV32-NEXT: vfmv.f.s fa4, v10
-; RV32-NEXT: fcvt.d.w fa3, zero
-; RV32-NEXT: fmax.d fa4, fa4, fa3
-; RV32-NEXT: fmin.d fa4, fa4, fa5
-; RV32-NEXT: fcvt.wu.d a0, fa4, rtz
-; RV32-NEXT: vfmv.f.s fa4, v8
-; RV32-NEXT: fmax.d fa4, fa4, fa3
-; RV32-NEXT: fmin.d fa4, fa4, fa5
-; RV32-NEXT: fcvt.wu.d a2, fa4, rtz
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 2
-; RV32-NEXT: vfmv.f.s fa4, v10
-; RV32-NEXT: fmax.d fa4, fa4, fa3
-; RV32-NEXT: fmin.d fa4, fa4, fa5
-; RV32-NEXT: fcvt.wu.d a3, fa4, rtz
-; RV32-NEXT: vslidedown.vi v8, v8, 3
-; RV32-NEXT: vfmv.f.s fa4, v8
-; RV32-NEXT: fld fa2, 40(sp)
-; RV32-NEXT: fmax.d fa4, fa4, fa3
-; RV32-NEXT: fmin.d fa4, fa4, fa5
-; RV32-NEXT: fcvt.wu.d a4, fa4, rtz
-; RV32-NEXT: fmax.d fa4, fa2, fa3
-; RV32-NEXT: fld fa2, 32(sp)
-; RV32-NEXT: fmin.d fa4, fa4, fa5
-; RV32-NEXT: fcvt.wu.d a5, fa4, rtz
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; RV32-NEXT: vmv.v.x v8, a2
-; RV32-NEXT: fmax.d fa4, fa2, fa3
-; RV32-NEXT: fmin.d fa4, fa4, fa5
-; RV32-NEXT: fcvt.wu.d a2, fa4, rtz
-; RV32-NEXT: fld fa4, 48(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: vslide1down.vx v8, v8, a4
-; RV32-NEXT: fmax.d fa4, fa4, fa3
-; RV32-NEXT: fmin.d fa4, fa4, fa5
-; RV32-NEXT: fcvt.wu.d a0, fa4, rtz
-; RV32-NEXT: fld fa4, 56(sp)
-; RV32-NEXT: vmv.v.x v9, a2
-; RV32-NEXT: vslide1down.vx v9, v9, a5
-; RV32-NEXT: vslide1down.vx v9, v9, a0
-; RV32-NEXT: fmax.d fa4, fa4, fa3
-; RV32-NEXT: fmin.d fa5, fa4, fa5
-; RV32-NEXT: fcvt.wu.d a0, fa5, rtz
-; RV32-NEXT: vmv.v.i v0, 15
-; RV32-NEXT: vslide1down.vx v9, v9, a0
-; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
-; RV32-NEXT: vse8.v v9, (a1)
-; RV32-NEXT: addi sp, s0, -128
-; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 128
-; RV32-NEXT: ret
-;
-; RV64-LABEL: fp2ui_v8f64_v8i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -128
-; RV64-NEXT: .cfi_def_cfa_offset 128
-; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 128
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: mv a0, sp
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 1
-; RV64-NEXT: lui a0, %hi(.LCPI13_0)
-; RV64-NEXT: fld fa5, %lo(.LCPI13_0)(a0)
-; RV64-NEXT: vfmv.f.s fa4, v10
-; RV64-NEXT: fmv.d.x fa3, zero
-; RV64-NEXT: fmax.d fa4, fa4, fa3
-; RV64-NEXT: fmin.d fa4, fa4, fa5
-; RV64-NEXT: fcvt.lu.d a0, fa4, rtz
-; RV64-NEXT: vfmv.f.s fa4, v8
-; RV64-NEXT: fmax.d fa4, fa4, fa3
-; RV64-NEXT: fmin.d fa4, fa4, fa5
-; RV64-NEXT: fcvt.lu.d a2, fa4, rtz
-; RV64-NEXT: vsetivli zero, 1, e64...
[truncated]
@llvm/pr-subscribers-llvm-selectiondag
Author: Craig Topper (topperc)
Changes: If vncvt doesn't produce the destination type directly, use vnclip to do additional narrowing with saturation.
Full diff: https://github.com/llvm/llvm-project/pull/100071.diff
LGTM aside from that debug line
…nal narrowing. (#100071)
Summary: If vncvt doesn't produce the destination type directly, use vnclip to do additional narrowing with saturation.
Differential Revision: https://phabricator.intern.facebook.com/D60251157
If vncvt doesn't produce the destination type directly, use vnclip to do additional narrowing with saturation.