[LegalizeVectorOps][RISCV] Use VP_FP_EXTEND/ROUND when promoting VP_FP* operations. #122784
Conversation
@llvm/pr-subscribers-llvm-selectiondag

Author: Craig Topper (topperc)

Changes

This preserves the original VL, leading to more reuse of VL for vsetvli. The VLOptimizer can also clean up a lot of this, but I'm not sure if it gets all of it.

There are some regressions in here from propagating the mask too, but I'm not sure if that's a concern.

Patch is 1.50 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/122784.diff

33 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 89a00c5a4f0439..a6d1b1cb7b104e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -737,7 +737,17 @@ void VectorLegalizer::Promote(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
.getVectorElementType()
.isFloatingPoint() &&
NVT.isVector() && NVT.getVectorElementType().isFloatingPoint())
- Operands[j] = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(j));
+ if (ISD::isVPOpcode(Node->getOpcode())) {
+ unsigned EVLIdx =
+ *ISD::getVPExplicitVectorLengthIdx(Node->getOpcode());
+ unsigned MaskIdx = *ISD::getVPMaskIdx(Node->getOpcode());
+ Operands[j] =
+ DAG.getNode(ISD::VP_FP_EXTEND, dl, NVT, Node->getOperand(j),
+ Node->getOperand(MaskIdx), Node->getOperand(EVLIdx));
+ } else {
+ Operands[j] =
+ DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(j));
+ }
else
Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(j));
else
@@ -750,8 +760,15 @@ void VectorLegalizer::Promote(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
if ((VT.isFloatingPoint() && NVT.isFloatingPoint()) ||
(VT.isVector() && VT.getVectorElementType().isFloatingPoint() &&
NVT.isVector() && NVT.getVectorElementType().isFloatingPoint()))
- Res = DAG.getNode(ISD::FP_ROUND, dl, VT, Res,
- DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
+ if (ISD::isVPOpcode(Node->getOpcode())) {
+ unsigned EVLIdx = *ISD::getVPExplicitVectorLengthIdx(Node->getOpcode());
+ unsigned MaskIdx = *ISD::getVPMaskIdx(Node->getOpcode());
+ Res = DAG.getNode(ISD::VP_FP_ROUND, dl, VT, Res,
+ Node->getOperand(MaskIdx), Node->getOperand(EVLIdx));
+ } else {
+ Res = DAG.getNode(ISD::FP_ROUND, dl, VT, Res,
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
+ }
else
Res = DAG.getNode(ISD::BITCAST, dl, VT, Res);
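In plain terms: when the node being promoted is a VP opcode, the converts the legalizer wraps around it now carry the node's own mask and EVL operands (VP_FP_EXTEND / VP_FP_ROUND) instead of plain FP_EXTEND / FP_ROUND, which would otherwise be selected at VLMAX. A minimal sketch of the operand-widening half, assuming the VectorLegalizer::Promote context from the hunk above (the standalone helper name is illustrative; in the patch this logic is inline):

```cpp
// Illustrative only: the patch performs this inline inside VectorLegalizer::Promote.
static SDValue widenFPOperandForPromotion(SelectionDAG &DAG, SDNode *Node,
                                          unsigned OpIdx, EVT NVT,
                                          const SDLoc &dl) {
  SDValue Op = Node->getOperand(OpIdx);
  if (ISD::isVPOpcode(Node->getOpcode())) {
    unsigned MaskIdx = *ISD::getVPMaskIdx(Node->getOpcode());
    unsigned EVLIdx = *ISD::getVPExplicitVectorLengthIdx(Node->getOpcode());
    // VP_FP_EXTEND reuses the VP node's mask and explicit vector length, so the
    // widening convert runs under the same VL as the operation being promoted.
    return DAG.getNode(ISD::VP_FP_EXTEND, dl, NVT, Op,
                       Node->getOperand(MaskIdx), Node->getOperand(EVLIdx));
  }
  // Non-VP nodes keep the old behavior.
  return DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op);
}
```

The result-narrowing half mirrors this with VP_FP_ROUND, which takes the mask and EVL in place of the rounding-control constant that FP_ROUND requires.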
diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index a81f0ac982f569..1b9c78a20ec3b9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -17,23 +17,27 @@ declare <vscale x 1 x bfloat> @llvm.vp.ceil.nxv1bf16(<vscale x 1 x bfloat>, <vsc
define <vscale x 1 x bfloat> @vp_ceil_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv1bf16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vfabs.v v8, v9, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vfabs.v v11, v10, v0.t
+; CHECK-NEXT: fmv.w.x fa5, a0
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
-; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t
; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t
; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
-; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
; CHECK-NEXT: ret
%v = call <vscale x 1 x bfloat> @llvm.vp.ceil.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> %m, i32 %evl)
ret <vscale x 1 x bfloat> %v
@@ -42,12 +46,12 @@ define <vscale x 1 x bfloat> @vp_ceil_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vs
define <vscale x 1 x bfloat> @vp_ceil_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv1bf16_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vfabs.v v8, v9
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
; CHECK-NEXT: vmflt.vf v0, v8, fa5
; CHECK-NEXT: fsrmi a0, 3
; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
@@ -55,7 +59,7 @@ define <vscale x 1 x bfloat> @vp_ceil_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat>
; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
; CHECK-NEXT: ret
%v = call <vscale x 1 x bfloat> @llvm.vp.ceil.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
@@ -67,23 +71,27 @@ declare <vscale x 2 x bfloat> @llvm.vp.ceil.nxv2bf16(<vscale x 2 x bfloat>, <vsc
define <vscale x 2 x bfloat> @vp_ceil_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv2bf16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vfabs.v v8, v9, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vfabs.v v11, v10, v0.t
+; CHECK-NEXT: fmv.w.x fa5, a0
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
-; CHECK-NEXT: vmflt.vf v0, v8, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v11, fa5, v0.t
; CHECK-NEXT: fsrmi a0, 3
+; CHECK-NEXT: vmv.v.v v0, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
+; CHECK-NEXT: vfcvt.x.f.v v11, v10, v0.t
; CHECK-NEXT: fsrm a0
-; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v11, v11, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
-; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT: vfsgnj.vv v10, v11, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
; CHECK-NEXT: ret
%v = call <vscale x 2 x bfloat> @llvm.vp.ceil.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> %m, i32 %evl)
ret <vscale x 2 x bfloat> %v
@@ -92,12 +100,12 @@ define <vscale x 2 x bfloat> @vp_ceil_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vs
define <vscale x 2 x bfloat> @vp_ceil_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv2bf16_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vfabs.v v8, v9
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
; CHECK-NEXT: vmflt.vf v0, v8, fa5
; CHECK-NEXT: fsrmi a0, 3
; CHECK-NEXT: vfcvt.x.f.v v8, v9, v0.t
@@ -105,7 +113,7 @@ define <vscale x 2 x bfloat> @vp_ceil_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat>
; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
; CHECK-NEXT: vfsgnj.vv v9, v8, v9, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9
; CHECK-NEXT: ret
%v = call <vscale x 2 x bfloat> @llvm.vp.ceil.nxv2bf16(<vscale x 2 x bfloat> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -117,25 +125,27 @@ declare <vscale x 4 x bfloat> @llvm.vp.ceil.nxv4bf16(<vscale x 4 x bfloat>, <vsc
define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv4bf16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vfabs.v v12, v10, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT: vmflt.vf v9, v12, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v12, fa5, v0.t
; CHECK-NEXT: fsrmi a0, 3
-; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vfcvt.x.f.v v12, v10, v0.t
; CHECK-NEXT: fsrm a0
; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; CHECK-NEXT: vfsgnj.vv v10, v12, v10, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
; CHECK-NEXT: ret
%v = call <vscale x 4 x bfloat> @llvm.vp.ceil.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> %m, i32 %evl)
ret <vscale x 4 x bfloat> %v
@@ -144,12 +154,12 @@ define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vs
define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv4bf16_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vfabs.v v8, v10
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
; CHECK-NEXT: vmflt.vf v0, v8, fa5
; CHECK-NEXT: fsrmi a0, 3
; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t
@@ -157,7 +167,7 @@ define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat>
; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10
; CHECK-NEXT: ret
%v = call <vscale x 4 x bfloat> @llvm.vp.ceil.nxv4bf16(<vscale x 4 x bfloat> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -169,25 +179,27 @@ declare <vscale x 8 x bfloat> @llvm.vp.ceil.nxv8bf16(<vscale x 8 x bfloat>, <vsc
define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv8bf16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-NEXT: vmv1r.v v10, v0
-; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vfabs.v v16, v12, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT: vmflt.vf v10, v16, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v16, fa5, v0.t
; CHECK-NEXT: fsrmi a0, 3
-; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vfcvt.x.f.v v16, v12, v0.t
; CHECK-NEXT: fsrm a0
; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
; CHECK-NEXT: vfsgnj.vv v12, v16, v12, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
; CHECK-NEXT: ret
%v = call <vscale x 8 x bfloat> @llvm.vp.ceil.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> %m, i32 %evl)
ret <vscale x 8 x bfloat> %v
@@ -196,12 +208,12 @@ define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vs
define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv8bf16_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vfabs.v v8, v12
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
; CHECK-NEXT: vmflt.vf v0, v8, fa5
; CHECK-NEXT: fsrmi a0, 3
; CHECK-NEXT: vfcvt.x.f.v v8, v12, v0.t
@@ -209,7 +221,7 @@ define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat>
; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
; CHECK-NEXT: vfsgnj.vv v12, v8, v12, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12
; CHECK-NEXT: ret
%v = call <vscale x 8 x bfloat> @llvm.vp.ceil.nxv8bf16(<vscale x 8 x bfloat> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
@@ -221,25 +233,27 @@ declare <vscale x 16 x bfloat> @llvm.vp.ceil.nxv16bf16(<vscale x 16 x bfloat>, <
define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv16bf16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-NEXT: vmv1r.v v12, v0
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v24, v16, v0.t
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v8, v24, fa5, v0.t
; CHECK-NEXT: fsrmi a0, 3
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
; CHECK-NEXT: fsrm a0
; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16, v0.t
; CHECK-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.vp.ceil.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> %m, i32 %evl)
ret <vscale x 16 x bfloat> %v
@@ -248,12 +262,12 @@ define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16(<vscale x 16 x bfloat> %va,
define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_ceil_vv_nxv16bf16_unmasked:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: lui a1, 307200
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: lui a0, 307200
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vfabs.v v8, v16
-; CHECK-NEXT: fmv.w.x fa5, a1
+; CHECK-NEXT: fmv.w.x fa5, a0
; CHECK-NEXT: vmflt.vf v0, v8, fa5
; CHECK-NEXT: fsrmi a0, 3
; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t
@@ -261,7 +275,7 @@ define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16_unmasked(<vscale x 16 x bflo
; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
; CHECK-NEXT: ret
%v = call <vscale x 16 x bfloat> @llvm.vp.ceil.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
@@ -279,59 +293,64 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vmv1r.v v7, v0
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
; CHECK-NEXT: lui a3, 307200
; CHECK-NEXT: slli a1, a2, 1
; CHECK-NEXT: srli a2, a2, 2
; CHECK-NEXT: fmv.w.x fa5, a3
; CHECK-NEXT: sub a3, a0, a1
-; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v12, v0, a2
+; CHECK-NEXT: vslidedown.vx v17, v0, a2
; CHECK-NEXT: sltu a2, a0, a3
+; CHECK-NEXT: vmv1r.v v18, v17
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a3
-; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vfabs.v v16, v24, v0.t
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfabs.v v8, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v18, v8, fa5, v0.t
; CHECK-NEXT: fsrmi a2, 3
-; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmv1r.v v0, v18
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT: addi a3, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: vfcvt.x.f.v v8, v24, v0.t
; CHECK-NEXT: fsrm a2
-; CHECK-NEXT: addi a2, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t
-; C...
[truncated]
The branch was force-pushed from 8e1f6c3 to 36968c5, and then from 36968c5 to 63a7d93.
LGTM
Have you looked into why vlopt isn't getting some of these cases? On first glance, it feels like it should.
For the fixed vector tests, I think it's because we can't tell that the register VL from the argument is smaller than the immediate we got from the vector type. For FMA, it's because we can't propagate VL from the passthru operand of the FMA. Those are the only ones I've looked at.
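For the fixed-vector case, a hedged sketch of the comparison the VL optimizer would need (the helper and its signature are illustrative, not the actual RISCVVLOptimizer code): the producer's VL is an immediate derived from the fixed vector type, the consumer's VL is the zeroext %evl register argument, and nothing at the MIR level proves the register is no larger than the immediate, so the reduction has to be skipped.

```cpp
#include "llvm/CodeGen/MachineOperand.h"
using namespace llvm;

// Hypothetical helper, only to illustrate the limitation described above.
// Reducing a producer's VL to what a consumer demands is only safe when the
// consumer's VL is provably <= the producer's VL.
static bool isVLKnownLE(const MachineOperand &ConsumerVL,
                        const MachineOperand &ProducerVL) {
  if (ConsumerVL.isImm() && ProducerVL.isImm())
    return ConsumerVL.getImm() <= ProducerVL.getImm();
  if (ConsumerVL.isReg() && ProducerVL.isReg())
    return ConsumerVL.getReg() == ProducerVL.getReg(); // same register: equal
  // Register vs. immediate (e.g. an %evl argument vs. a fixed vector's element
  // count): the ordering is not visible in MIR, so conservatively give up.
  return false;
}
```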
LLVM Buildbot has detected new failures on several builders. Full details are available at:

- https://lab.llvm.org/buildbot/#/builders/95/builds/8339
- https://lab.llvm.org/buildbot/#/builders/193/builds/4825
- https://lab.llvm.org/buildbot/#/builders/54/builds/5494
- https://lab.llvm.org/buildbot/#/builders/195/builds/3426
- https://lab.llvm.org/buildbot/#/builders/81/builds/3613
- https://lab.llvm.org/buildbot/#/builders/40/builds/3694
- https://lab.llvm.org/buildbot/#/builders/72/builds/7149
- https://lab.llvm.org/buildbot/#/builders/77/builds/7711
- https://lab.llvm.org/buildbot/#/builders/53/builds/10701