Skip to content

Commit cab6f07

Browse files
committed
[VPlan] Support VPWidenIntOrFpInductionRecipes with EVL tail folding
Following on from llvm#118638, this handles widened induction variables with EVL tail folding by setting the VF operand to be EVL, calculated in the vector body. We need to do this for correctness: with EVL tail folding, the number of elements processed in the penultimate iteration may not be VF but the runtime EVL, so induction variables must be incremented accordingly.

- Because the VF may now not be a live-in, we need to move the builder to just after its definition.
- We also need to avoid truncating it when it's the same size as the step type; previously this wasn't a problem for live-ins.
- Because the VF may be smaller than the IV type (the EVL is always i32), we may need to zext it.

On -march=rva23u64 -O3 we get 87.1% more loops vectorized on TSVC, and 42.8% more loops vectorized on SPEC CPU 2017.
1 parent a981134 commit cab6f07

11 files changed

+576
-146
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2006,6 +2006,7 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
20062006

20072007
VPValue *getVFValue() { return getOperand(2); }
20082008
const VPValue *getVFValue() const { return getOperand(2); }
2009+
void setVFValue(VPValue *New) { return setOperand(2, New); }
20092010

20102011
VPValue *getSplatVFValue() {
20112012
// If the recipe has been unrolled return the VPValue for the induction

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2196,6 +2196,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
21962196
for (VPUser *U : to_vector(Plan.getVF().users())) {
21972197
if (auto *R = dyn_cast<VPVectorEndPointerRecipe>(U))
21982198
R->setOperand(1, &EVL);
2199+
if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(U))
2200+
R->setVFValue(&EVL);
21992201
}
22002202

22012203
SmallVector<VPRecipeBase *> ToErase;
@@ -2277,11 +2279,10 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
22772279
VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
22782280
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
22792281
// The transform updates all users of inductions to work based on EVL, instead
2280-
// of the VF directly. At the moment, widened inductions cannot be updated, so
2281-
// bail out if the plan contains any.
2282-
bool ContainsWidenInductions = any_of(
2283-
Header->phis(),
2284-
IsaPred<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>);
2282+
// of the VF directly. At the moment, widened pointer inductions cannot be
2283+
// updated, so bail out if the plan contains any.
2284+
bool ContainsWidenInductions =
2285+
any_of(Header->phis(), IsaPred<VPWidenPointerInductionRecipe>);
22852286
if (ContainsWidenInductions)
22862287
return false;
22872288

@@ -2604,14 +2605,19 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
26042605
Inc = SplatVF;
26052606
Prev = WidenIVR->getLastUnrolledPartOperand();
26062607
} else {
2608+
if (VPRecipeBase *R = VF->getDefiningRecipe())
2609+
Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
2610+
Type *VFTy = TypeInfo.inferScalarType(VF);
26072611
// Multiply the vectorization factor by the step using integer or
26082612
// floating-point arithmetic as appropriate.
26092613
if (StepTy->isFloatingPointTy())
26102614
VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
26112615
DL);
2612-
else
2616+
else if (VFTy->getScalarSizeInBits() > StepTy->getScalarSizeInBits())
26132617
VF =
26142618
Builder.createScalarCast(Instruction::CastOps::Trunc, VF, StepTy, DL);
2619+
else if (VFTy->getScalarSizeInBits() < StepTy->getScalarSizeInBits())
2620+
VF = Builder.createScalarCast(Instruction::CastOps::ZExt, VF, StepTy, DL);
26152621

26162622
Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
26172623
Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);

llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
156156
.Case<VPWidenIntrinsicRecipe>([&](const VPWidenIntrinsicRecipe *S) {
157157
return VerifyEVLUse(*S, S->getNumOperands() - 1);
158158
})
159-
.Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe>(
159+
.Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
160+
VPWidenIntOrFpInductionRecipe>(
160161
[&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
161162
.Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>(
162163
[&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
@@ -165,18 +166,30 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
165166
.Case<VPInstruction>([&](const VPInstruction *I) {
166167
if (I->getOpcode() == Instruction::PHI)
167168
return VerifyEVLUse(*I, 1);
168-
if (I->getOpcode() != Instruction::Add) {
169-
errs() << "EVL is used as an operand in non-VPInstruction::Add\n";
169+
switch (I->getOpcode()) {
170+
case Instruction::Add:
171+
break;
172+
case Instruction::UIToFP:
173+
case Instruction::Trunc:
174+
case Instruction::ZExt:
175+
case Instruction::Mul:
176+
case Instruction::FMul:
177+
if (!VerifyLate) {
178+
errs() << "EVL used by unexpected VPInstruction\n";
179+
return false;
180+
}
181+
break;
182+
default:
183+
errs() << "EVL used by unexpected VPInstruction\n";
170184
return false;
171185
}
172186
if (I->getNumUsers() != 1) {
173-
errs() << "EVL is used in VPInstruction:Add with multiple "
174-
"users\n";
187+
errs() << "EVL is used in VPInstruction with multiple users\n";
175188
return false;
176189
}
177190
if (!VerifyLate && !isa<VPEVLBasedIVPHIRecipe>(*I->users().begin())) {
178-
errs() << "Result of VPInstruction::Add with EVL operand is "
179-
"not used by VPEVLBasedIVPHIRecipe\n";
191+
errs() << "Result of VPInstruction with EVL operand is not used by "
192+
"VPEVLBasedIVPHIRecipe\n";
180193
return false;
181194
}
182195
return true;

llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,55 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
88
; CHECK-LABEL: define void @test_wide_integer_induction(
99
; CHECK-SAME: ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
1010
; CHECK-NEXT: entry:
11+
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]]
12+
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
13+
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
14+
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
15+
; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
16+
; CHECK: vector.ph:
17+
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
18+
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
19+
; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
20+
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
21+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
22+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
23+
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
24+
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
25+
; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
26+
; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 2 x i64> [[TMP9]], splat (i64 1)
27+
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP10]]
1128
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
29+
; CHECK: vector.body:
30+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
31+
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
32+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ]
33+
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
34+
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
35+
; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
36+
; CHECK-NEXT: [[TMP13:%.*]] = mul i64 1, [[TMP12]]
37+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP13]], i64 0
38+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
39+
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
40+
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0
41+
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VEC_IND]], ptr align 8 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
42+
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP11]] to i64
43+
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
44+
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP8]]
45+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
46+
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
47+
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
48+
; CHECK: middle.block:
49+
; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
50+
; CHECK: scalar.ph:
51+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ]
52+
; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
1253
; CHECK: for.body:
13-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
14-
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
15-
; CHECK-NEXT: store i64 [[IV]], ptr [[ARRAYIDX]], align 8
16-
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
17-
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
18-
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
54+
; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
55+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
56+
; CHECK-NEXT: store i64 [[IV1]], ptr [[ARRAYIDX]], align 8
57+
; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
58+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
59+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]]
1960
; CHECK: for.cond.cleanup:
2061
; CHECK-NEXT: ret void
2162
;
@@ -68,3 +109,10 @@ for.body:
68109
for.cond.cleanup:
69110
ret void
70111
}
112+
;.
113+
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
114+
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
115+
; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
116+
; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
117+
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
118+
;.

0 commit comments

Comments
 (0)