Skip to content

Commit d7fc1fd

Browse files
committed
[VPlan] Use VPWidenIntrinsicRecipe to create vp.select.
Use VPWidenIntrinsicRecipe (#110486) to create vp.select intrinsics. This potentially offers an alternative to duplicating EVL recipes for all existing recipes. There are some recipes that will need duplicates (at least at the moment), due to extra code-gen needs (e.g. widening loads and stores). But in cases where the intrinsic can be used directly, creating the widened intrinsic directly would reduce the need to duplicate some recipes. NOTE: this PR contains the changes from #110486. The relevant changes are in 47135542e2ada3f21b215bf237d8442a56c8456c.
1 parent 8ae59c7 commit d7fc1fd

File tree

7 files changed

+104
-4
lines changed

7 files changed

+104
-4
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1624,13 +1624,22 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
16241624
DebugLoc DL = {})
16251625
: VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI),
16261626
VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {}
1627+
/// Construct a widened intrinsic recipe that is not backed by a CallInst.
/// \p VectorIntrinsicID selects the intrinsic, \p CallArguments become the
/// recipe's operands, \p Ty is stored as the result type, and \p DL is the
/// debug location attached to the recipe.
template <typename IterT>
1628+
VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
1629+
iterator_range<IterT> CallArguments, Type *Ty,
1630+
DebugLoc DL = {})
1631+
// Forward DL to the base recipe so the caller-supplied debug location is
// kept; clone() passes getDebugLoc() through this parameter.
: VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL),
1632+
VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) {}
16271633

16281634
~VPWidenIntrinsicRecipe() override = default;
16291635

16301636
VPWidenIntrinsicRecipe *clone() override {
1631-
return new VPWidenIntrinsicRecipe(*cast<CallInst>(getUnderlyingValue()),
1632-
VectorIntrinsicID, operands(), ResultTy,
1633-
getDebugLoc());
1637+
return isa_and_nonnull<CallInst>(getUnderlyingValue())
1638+
? new VPWidenIntrinsicRecipe(
1639+
*cast<CallInst>(getUnderlyingValue()), VectorIntrinsicID,
1640+
operands(), ResultTy, getDebugLoc())
1641+
: new VPWidenIntrinsicRecipe(VectorIntrinsicID, operands(),
1642+
ResultTy, getDebugLoc());
16341643
}
16351644

16361645
VP_CLASSOF_IMPL(VPDef::VPWidenIntrinsicSC)
@@ -1652,6 +1661,8 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
16521661
void print(raw_ostream &O, const Twine &Indent,
16531662
VPSlotTracker &SlotTracker) const override;
16541663
#endif
1664+
1665+
bool onlyFirstLaneUsed(const VPValue *Op) const override;
16551666
};
16561667

16571668
/// A recipe for widening Call instructions using library calls.

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
6161
case Instruction::ICmp:
6262
case VPInstruction::ActiveLaneMask:
6363
return inferScalarType(R->getOperand(1));
64+
case VPInstruction::ExplicitVectorLength:
65+
return Type::getIntNTy(Ctx, 32);
6466
case VPInstruction::FirstOrderRecurrenceSplice:
6567
case VPInstruction::Not:
6668
return SetResultTyFromOp();

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,6 +1039,14 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
10391039
return Intrinsic::getBaseName(VectorIntrinsicID);
10401040
}
10411041

1042+
// Returns true only if this recipe's intrinsic is a vector-predication
// intrinsic and \p Op is the recipe's last operand.
bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
1043+
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1044+
// Vector predication intrinsics only demand the first lane of the last
1045+
// operand (the EVL operand).
1046+
return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) &&
1047+
Op == getOperand(getNumOperands() - 1);
1048+
}
1049+
10421050
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
10431051
void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
10441052
VPSlotTracker &SlotTracker) const {

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1353,6 +1353,7 @@ void VPlanTransforms::addActiveLaneMask(
13531353
/// Replace recipes with their EVL variants.
13541354
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
13551355
SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
1356+
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
13561357
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
13571358
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
13581359
auto *CurRecipe = dyn_cast<VPRecipeBase>(U);
@@ -1384,6 +1385,15 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
13841385
VPValue *NewMask = GetNewMask(Red->getCondOp());
13851386
return new VPReductionEVLRecipe(*Red, EVL, NewMask);
13861387
})
1388+
.Case<VPWidenSelectRecipe>([&](VPWidenSelectRecipe *Sel) {
1389+
SmallVector<VPValue *> Ops(Sel->operands());
1390+
Ops.push_back(&EVL);
1391+
return new VPWidenIntrinsicRecipe(
1392+
Intrinsic::vp_select, make_range(Ops.begin(), Ops.end()),
1393+
1394+
TypeInfo.inferScalarType(Sel));
1395+
})
1396+
13871397
.Default([&](VPRecipeBase *R) { return nullptr; });
13881398

13891399
if (!NewRecipe)

llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
138138
};
139139
for (const VPUser *U : EVL.users()) {
140140
if (!TypeSwitch<const VPUser *, bool>(U)
141+
.Case<VPWidenIntrinsicRecipe>(
142+
[&](const VPWidenIntrinsicRecipe *S) {
143+
return VerifyEVLUse(*S, S->getNumOperands() - 1);
144+
})
141145
.Case<VPWidenStoreEVLRecipe>([&](const VPWidenStoreEVLRecipe *S) {
142146
return VerifyEVLUse(*S, 2);
143147
})

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
7070
; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
7171
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
7272
; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
73-
; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
73+
; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP12]])
7474
; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
7575
; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
7676
; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; REQUIRES: asserts
2+
3+
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
4+
; RUN: -force-tail-folding-style=data-with-evl \
5+
; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
6+
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
7+
8+
define void @vp_select(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
9+
; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
10+
; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
11+
; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
12+
; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
13+
14+
; IF-EVL: vector.ph:
15+
; IF-EVL-NEXT: Successor(s): vector loop
16+
17+
; IF-EVL: <x1> vector loop: {
18+
; IF-EVL-NEXT: vector.body:
19+
; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
20+
; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]>
21+
; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
22+
; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
23+
; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
24+
; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
25+
; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
26+
; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
27+
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
28+
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
29+
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
30+
; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
31+
; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = vp.sub ir<0>, ir<[[LD2]]>, vp<[[EVL]]>
32+
; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<%1>, ir<%2>, vp<[[EVL]]>)
33+
; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = vp.add vp<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]>
34+
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
35+
; IF-EVL-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]>
36+
; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
37+
; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
38+
; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
39+
; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
40+
; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
41+
; IF-EVL-NEXT: No successors
42+
; IF-EVL-NEXT: }
43+
44+
entry:
45+
br label %for.body
46+
47+
for.body:
48+
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
49+
%arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
50+
%0 = load i32, ptr %arrayidx, align 4
51+
%arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
52+
%1 = load i32, ptr %arrayidx3, align 4
53+
%cmp4 = icmp sgt i32 %0, %1
54+
%2 = sub i32 0, %1
55+
%cond.p = select i1 %cmp4, i32 %1, i32 %2
56+
%cond = add i32 %cond.p, %0
57+
%arrayidx15 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
58+
store i32 %cond, ptr %arrayidx15, align 4
59+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
60+
%exitcond.not = icmp eq i64 %indvars.iv.next, %N
61+
br i1 %exitcond.not, label %exit, label %for.body
62+
63+
exit:
64+
ret void
65+
}

0 commit comments

Comments
 (0)