Skip to content

Commit 89d4f13

Browse files
committed
[VPlan] Add transformation to narrow interleave groups to wide loads/stores.
1 parent b9b4fc2 commit 89d4f13

File tree

6 files changed

+154
-29
lines changed

6 files changed

+154
-29
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7697,6 +7697,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
76977697
OrigLoop->getHeader()->getContext());
76987698
VPlanTransforms::materializeBroadcasts(BestVPlan);
76997699
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7700+
7701+
VPlanTransforms::narrowInterleaveGroups(
7702+
BestVPlan, BestVF,
7703+
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
77007704
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
77017705
VPlanTransforms::removeDeadRecipes(BestVPlan);
77027706
VPlanTransforms::convertToConcreteRecipes(BestVPlan);

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -900,8 +900,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
900900

901901
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
902902
// FIXME: Model VF * UF computation completely in VPlan.
903-
assert((!getVectorLoopRegion() || VFxUF.getNumUsers()) &&
904-
"VFxUF expected to always have users");
905903
unsigned UF = getUF();
906904
if (VF.getNumUsers()) {
907905
Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,11 +72,11 @@ bool VPRecipeBase::mayWriteToMemory() const {
7272
case VPBranchOnMaskSC:
7373
case VPScalarIVStepsSC:
7474
case VPPredInstPHISC:
75+
case VPVectorPointerSC:
7576
return false;
7677
case VPBlendSC:
7778
case VPReductionEVLSC:
7879
case VPReductionSC:
79-
case VPVectorPointerSC:
8080
case VPWidenCanonicalIVSC:
8181
case VPWidenCastSC:
8282
case VPWidenGEPSC:

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@
3030
#include "llvm/IR/Intrinsics.h"
3131
#include "llvm/IR/PatternMatch.h"
3232

33+
#define LV_NAME "loop-vectorize"
34+
#define DEBUG_TYPE LV_NAME
35+
3336
using namespace llvm;
3437

3538
void VPlanTransforms::VPInstructionsToVPRecipes(
@@ -2229,3 +2232,130 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
22292232
});
22302233
}
22312234
}
2235+
2236+
/// Returns true if \p IR is a full interleave group with factor and number of
2237+
/// members both equal to \p VF. The interleave group must also access the full
2238+
/// vector width \p VectorRegWidth.
2239+
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
2240+
unsigned VF, VPTypeAnalysis &TypeInfo,
2241+
unsigned VectorRegWidth) {
2242+
if (!InterleaveR)
2243+
return false;
2244+
Type *GroupElementTy = nullptr;
2245+
if (InterleaveR->getStoredValues().empty()) {
2246+
GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
2247+
if (!all_of(InterleaveR->definedValues(),
2248+
[&TypeInfo, GroupElementTy](VPValue *Op) {
2249+
return TypeInfo.inferScalarType(Op) == GroupElementTy;
2250+
}))
2251+
return false;
2252+
} else {
2253+
GroupElementTy =
2254+
TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
2255+
if (!all_of(InterleaveR->getStoredValues(),
2256+
[&TypeInfo, GroupElementTy](VPValue *Op) {
2257+
return TypeInfo.inferScalarType(Op) == GroupElementTy;
2258+
}))
2259+
return false;
2260+
}
2261+
2262+
unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
2263+
2264+
auto IG = InterleaveR->getInterleaveGroup();
2265+
return IG->getFactor() == VF && IG->getNumMembers() == VF &&
2266+
GroupSize == VectorRegWidth;
2267+
}
2268+
2269+
// Try to replace the interleave groups in \p Plan's vector loop with single
// wide loads and stores, and adjust the canonical induction so each vector
// iteration processes one original scalar iteration. Bails out (leaving the
// plan unchanged) if any recipe in the loop is not supported.
void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                             unsigned VectorRegWidth) {
  using namespace llvm::VPlanPatternMatch;
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
  // Only fixed VFs and plans with a vector loop region are supported.
  if (VF.isScalable() || !VectorLoop)
    return;

  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  Type *CanonicalIVType = CanonicalIV->getScalarType();
  VPTypeAnalysis TypeInfo(CanonicalIVType);

  unsigned FixedVF = VF.getFixedValue();
  SmallVector<VPInterleaveRecipe *> StoreGroups;
  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
    if (isa<VPCanonicalIVPHIRecipe>(&R) ||
        match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
      continue;

    // Bail out on recipes not supported at the moment:
    // * phi recipes other than the canonical induction
    // * recipes writing to memory except interleave groups
    if (R.isPhi())
      return;

    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
    if (R.mayWriteToMemory() && !InterleaveR)
      return;

    if (!InterleaveR)
      continue;

    // Bail out on non-consecutive interleave groups.
    if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo,
                                      VectorRegWidth))
      return;

    // Skip read interleave groups; they are narrowed on demand when a store
    // group using them is processed below.
    if (InterleaveR->getStoredValues().empty())
      continue;

    // Every stored value must be the matching member (same index) of a full
    // load interleave group, so load and store members pair up 1:1.
    if (!all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
          if (!DefR)
            return false;
          auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
          return IR &&
                 IR->getInterleaveGroup()->getFactor() ==
                     IR->getInterleaveGroup()->getNumMembers() &&
                 IR->getVPValue(Op.index()) == Op.value();
        })) {
      return;
    }
    StoreGroups.push_back(InterleaveR);
  }

  if (StoreGroups.empty())
    return;

  // Convert a load InterleaveGroup \p R to a single VPWidenLoadRecipe.
  auto Narrow = [](VPRecipeBase *R) -> VPValue * {
    auto *LoadGroup = cast<VPInterleaveRecipe>(R);
    // Narrow interleave group to wide load, as transformed VPlan will only
    // process one original iteration.
    auto *L = new VPWidenLoadRecipe(
        *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
        LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
        /*Reverse=*/false, LoadGroup->getDebugLoc());
    L->insertBefore(LoadGroup);
    return L;
  };

  // Narrow operation tree rooted at store groups.
  for (auto *StoreGroup : StoreGroups) {
    VPValue *Res =
        Narrow(StoreGroup->getStoredValues()[0]->getDefiningRecipe());

    auto *S = new VPWidenStoreRecipe(
        *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
        StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
        /*Reverse=*/false, StoreGroup->getDebugLoc());
    S->insertBefore(StoreGroup);
    StoreGroup->eraseFromParent();
  }

  // Adjust induction to reflect that the transformed plan only processes one
  // original iteration: step the canonical IV by 1 instead of VF * UF.
  auto *Inc = cast<VPInstruction>(CanonicalIV->getBackedgeValue());
  Inc->setOperand(
      1, Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIV->getScalarType(), 1)));
  removeDeadRecipes(Plan);
}

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,15 @@ struct VPlanTransforms {
191191

192192
/// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
193193
static void materializeBroadcasts(VPlan &Plan);
194+
195+
/// Try to convert a plan with interleave groups with VF elements to a plan
/// with the interleave groups replaced by wide loads and stores processing VF
/// elements, if all transformed interleave groups access the full vector
/// width (checked via \p VectorRegWidth). This effectively is a very simple
/// form of loop-aware SLP, where we use interleave groups to identify
/// candidates.
201+
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
202+
unsigned VectorRegWidth);
194203
};
195204

196205
} // namespace llvm

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,11 @@ define void @load_store_interleave_group(ptr noalias %data) {
2020
; VF2-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP6]], 1
2121
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
2222
; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
23-
; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
24-
; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
25-
; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
26-
; VF2-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
27-
; VF2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
28-
; VF2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
29-
; VF2-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
30-
; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
31-
; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
32-
; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
33-
; VF2-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
34-
; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP5]], align 8
35-
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
23+
; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
24+
; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
25+
; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP2]], align 8
26+
; VF2-NEXT: store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP5]], align 8
27+
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
3628
; VF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
3729
; VF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
3830
; VF2: [[MIDDLE_BLOCK]]:
@@ -133,21 +125,13 @@ define void @load_store_interleave_group_different_objecs(ptr noalias %src, ptr
133125
; VF2-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP6]], 1
134126
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
135127
; VF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP8]]
136-
; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
137-
; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
138-
; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
139-
; VF2-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
140-
; VF2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
141-
; VF2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
128+
; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
129+
; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8
142130
; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
143131
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP8]]
144-
; VF2-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
145-
; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
146-
; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
147-
; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
148-
; VF2-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
149-
; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP7]], align 8
150-
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
132+
; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP3]], align 8
133+
; VF2-NEXT: store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP7]], align 8
134+
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
151135
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
152136
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
153137
; VF2: [[MIDDLE_BLOCK]]:

0 commit comments

Comments
 (0)