[RISCV] Add reductions to list of roots in tryToReduceVL #107595
This allows us to reduce the VLs feeding reduction instructions. In particular, <3 x Ty> reduce(load)-like sequences no longer require a VL toggle. This was waiting on 3d72957; now that the latent correctness issue is fixed, we can expand this transform.
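For context on the mechanism: tryToReduceVL walks backwards from a root instruction that only demands its first VL elements and shrinks the VL of the instruction producing the root's vector source to match. Below is a minimal, self-contained sketch of that idea; the types and names (VInst, tryToReduceVLSketch) are hypothetical stand-ins for the real MachineInstr-based logic, and the legality checks the actual pass performs are omitted:

```cpp
// Sketch only: models the demand-driven VL shrinking behind tryToReduceVL.
// A reduction root reads just the first VL elements of its vector source,
// so the producer's VL can be lowered to match, e.g. a vle32 with VL=4
// feeding a <3 x i32> reduce can drop to VL=3, removing the VL toggle.
struct VInst {
  enum Kind { VLE, VREDSUM } K;
  unsigned VL = 0;         // element count this instruction operates on
  VInst *VecSrc = nullptr; // producer of the vector source operand, if any
};

// Returns true if the producer's VL was shrunk to the root's demand.
static bool tryToReduceVLSketch(VInst &Root) {
  if (Root.K != VInst::VREDSUM || !Root.VecSrc)
    return false; // only reduction roots are modeled here
  VInst &Src = *Root.VecSrc;
  if (Src.VL <= Root.VL)
    return false; // producer already computes no more than demanded
  Src.VL = Root.VL; // shrink; the real pass's legality checks are omitted
  return true;
}

// Usage: VInst Load{VInst::VLE, 4}; VInst Red{VInst::VREDSUM, 3, &Load};
// tryToReduceVLSketch(Red) shrinks Load.VL from 4 to 3.
```

With the reduction opcodes added as roots, the vle32 in a <3 x Ty> reduce(load) sequence becomes such a producer, which is exactly the VL-toggle removal visible in the test diffs below.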
@llvm/pr-subscribers-backend-risc-v

Author: Philip Reames (preames)

Changes:

This allows us to reduce the VLs feeding reduction instructions. In particular, <3 x Ty> reduce(load)-like sequences no longer require a VL toggle. This was waiting on 3d72957; now that the latent correctness issue is fixed, we can expand this transform.

Full diff: https://github.com/llvm/llvm-project/pull/107595.diff

4 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index db8e496493c417..00ffebb914098e 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -144,6 +144,24 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) {
case RISCV::VMERGE_VVM:
SrcIdx = 3; // TODO: We can also handle the false operand.
break;
+ case RISCV::VREDSUM_VS:
+ case RISCV::VREDMAXU_VS:
+ case RISCV::VREDMAX_VS:
+ case RISCV::VREDMINU_VS:
+ case RISCV::VREDMIN_VS:
+ case RISCV::VREDAND_VS:
+ case RISCV::VREDOR_VS:
+ case RISCV::VREDXOR_VS:
+ case RISCV::VWREDSUM_VS:
+ case RISCV::VWREDSUMU_VS:
+ case RISCV::VFREDUSUM_VS:
+ case RISCV::VFREDOSUM_VS:
+ case RISCV::VFREDMAX_VS:
+ case RISCV::VFREDMIN_VS:
+ case RISCV::VFWREDUSUM_VS:
+ case RISCV::VFWREDOSUM_VS:
+ SrcIdx = 2;
+ break;
}
MachineOperand &VL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
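A note on the SrcIdx choice above: for the .vs reduction pseudos, operand 2 is the vector source (vs2), i.e. the value actually being reduced, and the only operand whose producer's VL the peephole may shrink. The index map below is inferred from the switch (compare the neighboring VMERGE_VVM case, where SrcIdx = 3 selects the true operand); treat the exact indices as an assumption to verify against the pseudo definitions:

```cpp
// Assumed operand layout for the VRED*_VS / VFRED*_VS pseudos, inferred
// from the SrcIdx choices in the switch above (not taken verbatim from
// the .td definitions, so verify before relying on it):
enum ReductionOperandIdx {
  Dest = 0,      // vd: the destination register
  Passthru = 1,  // merge/passthru operand
  VectorSrc = 2, // vs2: the vector being reduced; this is SrcIdx above
  ScalarSrc = 3, // vs1: the scalar accumulator (element 0 of the result)
  // VL and SEW follow; the pass looks their positions up dynamically via
  // RISCVII::getVLOpNum(MI.getDesc()) rather than hard-coding them.
};
```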
diff --git a/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll b/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll
index 3d367ddc59bca7..5d588ad66b9ca9 100644
--- a/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll
+++ b/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll
@@ -19,7 +19,7 @@ define signext i32 @sum(ptr %a, i32 signext %n, i1 %prof.min.iters.check, <vscal
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_4: # %vector.ph
-; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v8, zero
; CHECK-NEXT: vmv.v.i v12, 0
; CHECK-NEXT: vsetivli zero, 1, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
index 6e5ab436fc02d0..a8798474d669ae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -121,10 +121,9 @@ define i32 @reduce_sum_16xi32_prefix2(ptr %p) {
define i32 @reduce_sum_16xi32_prefix3(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma
; CHECK-NEXT: vredsum.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -160,10 +159,9 @@ define i32 @reduce_sum_16xi32_prefix4(ptr %p) {
define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
-; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vredsum.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -183,10 +181,9 @@ define i32 @reduce_sum_16xi32_prefix5(ptr %p) {
define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma
; CHECK-NEXT: vredsum.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -208,10 +205,9 @@ define i32 @reduce_sum_16xi32_prefix6(ptr %p) {
define i32 @reduce_sum_16xi32_prefix7(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
-; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma
; CHECK-NEXT: vredsum.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -263,10 +259,9 @@ define i32 @reduce_sum_16xi32_prefix8(ptr %p) {
define i32 @reduce_sum_16xi32_prefix9(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix9:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vsetivli zero, 9, e32, m4, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v12, zero
-; CHECK-NEXT: vsetivli zero, 9, e32, m4, ta, ma
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -294,10 +289,9 @@ define i32 @reduce_sum_16xi32_prefix9(ptr %p) {
define i32 @reduce_sum_16xi32_prefix13(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix13:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vsetivli zero, 13, e32, m4, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v12, zero
-; CHECK-NEXT: vsetivli zero, 13, e32, m4, ta, ma
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -334,10 +328,9 @@ define i32 @reduce_sum_16xi32_prefix13(ptr %p) {
define i32 @reduce_sum_16xi32_prefix14(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix14:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vsetivli zero, 14, e32, m4, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v12, zero
-; CHECK-NEXT: vsetivli zero, 14, e32, m4, ta, ma
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -375,10 +368,9 @@ define i32 @reduce_sum_16xi32_prefix14(ptr %p) {
define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
; CHECK-LABEL: reduce_sum_16xi32_prefix15:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vsetivli zero, 15, e32, m4, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v12, zero
-; CHECK-NEXT: vsetivli zero, 15, e32, m4, ta, ma
; CHECK-NEXT: vredsum.vs v8, v8, v12
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -499,10 +491,9 @@ define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_xor_16xi32_prefix5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
-; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vredxor.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -537,7 +528,7 @@ define i32 @reduce_and_16xi32_prefix2(ptr %p) {
define i32 @reduce_and_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_and_16xi32_prefix5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.i v10, -1
@@ -576,10 +567,9 @@ define i32 @reduce_or_16xi32_prefix2(ptr %p) {
define i32 @reduce_or_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_or_16xi32_prefix5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
-; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vredor.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -619,11 +609,10 @@ define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smax_16xi32_prefix5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: vmv.s.x v10, a0
-; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vredmax.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -658,12 +647,11 @@ define i32 @reduce_smin_16xi32_prefix2(ptr %p) {
define i32 @reduce_smin_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_smin_16xi32_prefix5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vmv.s.x v10, a0
-; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vredmin.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -698,10 +686,9 @@ define i32 @reduce_umax_16xi32_prefix2(ptr %p) {
define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_umax_16xi32_prefix5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vmv.s.x v10, zero
-; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vredmaxu.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
@@ -736,7 +723,7 @@ define i32 @reduce_umin_16xi32_prefix2(ptr %p) {
define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
; RV32-LABEL: reduce_umin_16xi32_prefix5:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; RV32-NEXT: vle32.v v8, (a0)
; RV32-NEXT: vsetivli zero, 5, e32, m1, ta, ma
; RV32-NEXT: vmv.v.i v10, -1
@@ -747,11 +734,10 @@ define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
;
; RV64-LABEL: reduce_umin_16xi32_prefix5:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; RV64-NEXT: vle32.v v8, (a0)
; RV64-NEXT: li a0, -1
; RV64-NEXT: vmv.s.x v10, a0
-; RV64-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; RV64-NEXT: vredminu.vs v8, v8, v10
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: ret
@@ -787,11 +773,10 @@ define float @reduce_fadd_16xf32_prefix2(ptr %p) {
define float @reduce_fadd_16xi32_prefix5(ptr %p) {
; CHECK-LABEL: reduce_fadd_16xi32_prefix5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: vmv.s.x v10, a0
-; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma
; CHECK-NEXT: vfredusum.vs v8, v8, v10
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
index 7f2e3cdbfd0e3c..7d78fa5a8f3ef2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
@@ -22,10 +22,10 @@ define half @vpreduce_fadd_nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1
;
; ZVFHMIN-LABEL: vpreduce_fadd_nxv1f16:
; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vfredusum.vs v8, v9, v8, v0.t
@@ -48,10 +48,10 @@ define half @vpreduce_ord_fadd_nxv1f16(half %s, <vscale x 1 x half> %v, <vscale
;
; ZVFHMIN-LABEL: vpreduce_ord_fadd_nxv1f16:
; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; ZVFHMIN-NEXT: vfredosum.vs v8, v9, v8, v0.t
@@ -76,10 +76,10 @@ define half @vpreduce_fadd_nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2
;
; ZVFHMIN-LABEL: vpreduce_fadd_nxv2f16:
; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; ZVFHMIN-NEXT: vfredusum.vs v8, v9, v8, v0.t
@@ -102,10 +102,10 @@ define half @vpreduce_ord_fadd_nxv2f16(half %s, <vscale x 2 x half> %v, <vscale
;
; ZVFHMIN-LABEL: vpreduce_ord_fadd_nxv2f16:
; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; ZVFHMIN-NEXT: vfredosum.vs v8, v9, v8, v0.t
@@ -130,10 +130,10 @@ define half @vpreduce_fadd_nxv4f16(half %s, <vscale x 4 x half> %v, <vscale x 4
;
; ZVFHMIN-LABEL: vpreduce_fadd_nxv4f16:
; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; ZVFHMIN-NEXT: vfredusum.vs v8, v10, v8, v0.t
@@ -156,10 +156,10 @@ define half @vpreduce_ord_fadd_nxv4f16(half %s, <vscale x 4 x half> %v, <vscale
;
; ZVFHMIN-LABEL: vpreduce_ord_fadd_nxv4f16:
; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; ZVFHMIN-NEXT: vfredosum.vs v8, v10, v8, v0.t
@@ -233,10 +233,10 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
; ZVFHMIN-NEXT: # %bb.5:
; ZVFHMIN-NEXT: mv a0, a4
; ZVFHMIN-NEXT: .LBB6_6:
-; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vsetivli zero, 1, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfredusum.vs v8, v24, v8, v0.t
@@ -245,20 +245,20 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa5
; ZVFHMIN-NEXT: vsetivli zero, 1, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a5, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfredusum.vs v8, v24, v8, v0.t
; ZVFHMIN-NEXT: vfmv.f.s fa5, v8
; ZVFHMIN-NEXT: fcvt.h.s fa5, fa5
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa5
; ZVFHMIN-NEXT: vsetivli zero, 1, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfredusum.vs v8, v24, v8, v0.t
; ZVFHMIN-NEXT: vfmv.f.s fa5, v8
; ZVFHMIN-NEXT: fcvt.h.s fa5, fa5
@@ -267,9 +267,9 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
; ZVFHMIN-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfredusum.vs v8, v24, v8, v0.t
; ZVFHMIN-NEXT: vfmv.f.s fa5, v8
; ZVFHMIN-NEXT: fcvt.h.s fa0, fa5
@@ -339,10 +339,10 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
; ZVFHMIN-NEXT: # %bb.5:
; ZVFHMIN-NEXT: mv a0, a4
; ZVFHMIN-NEXT: .LBB7_6:
-; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vsetivli zero, 1, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfredosum.vs v8, v24, v8, v0.t
@@ -351,20 +351,20 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa5
; ZVFHMIN-NEXT: vsetivli zero, 1, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a5, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: vmv1r.v v0, v6
-; ZVFHMIN-NEXT: vsetvli zero, a5, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfredosum.vs v8, v24, v8, v0.t
; ZVFHMIN-NEXT: vfmv.f.s fa5, v8
; ZVFHMIN-NEXT: fcvt.h.s fa5, fa5
; ZVFHMIN-NEXT: fcvt.s.h fa5, fa5
; ZVFHMIN-NEXT: vsetivli zero, 1, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
; ZVFHMIN-NEXT: vmv1r.v v0, v7
-; ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfredosum.vs v8, v24, v8, v0.t
; ZVFHMIN-NEXT: vfmv.f.s fa5, v8
; ZVFHMIN-NEXT: fcvt.h.s fa5, fa5
@@ -373,9 +373,9 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
; ZVFHMIN-NEXT: vfmv.s.f v8, fa5
; ZVFHMIN-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfredosum.vs v8, v24, v8, v0.t
; ZVFHMIN-NEXT: vfmv.f.s fa5, v8
; ZVFHMIN-NEXT: fcvt.h.s fa0, fa5
LLVM Buildbot has detected a new failure on a builder. Full details are available at https://lab.llvm.org/buildbot/#/builders/53/builds/4356. Here is the relevant piece of the build log for reference: