Skip to content

Commit 166ec78

Browse files
committed
[VPlan] Unroll by VF
1 parent 7f53c1c commit 166ec78

17 files changed

+261
-84
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7328,6 +7328,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
73287328
// cost model is complete for better cost estimates.
73297329
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
73307330
OrigLoop->getHeader()->getContext());
7331+
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
73317332
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
73327333
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
73337334
VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,13 @@ Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) {
261261
return Data.VPV2Scalars[Def][0];
262262
}
263263

264+
// Look through BuildVector to avoid redundant extracts.
265+
// TODO: Remove once replicate regions are unrolled explicitly.
266+
if (Lane.getKind() == VPLane::Kind::First && match(Def, m_BuildVector())) {
267+
auto *BuildVector = cast<VPInstruction>(Def);
268+
return get(BuildVector->getOperand(Lane.getKnownLane()), true);
269+
}
270+
264271
assert(hasVectorValue(Def));
265272
auto *VecPart = Data.VPV2Vector[Def];
266273
if (!VecPart->getType()->isVectorTy()) {

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -936,6 +936,13 @@ class VPInstruction : public VPRecipeWithIRFlags,
936936
BranchOnCount,
937937
BranchOnCond,
938938
Broadcast,
939+
/// Given operands of (the same) struct type, creates a struct of fixed-
940+
/// width vectors each containing a struct field of all operands. The
941+
/// number of operands matches the element count of every vector.
942+
BuildStructVector,
943+
/// Creates a fixed-width vector containing all operands. The number of
944+
/// operands matches the vector element count.
945+
BuildVector,
939946
ComputeAnyOfResult,
940947
ComputeFindLastIVResult,
941948
ComputeReductionResult,

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
110110
case VPInstruction::CalculateTripCountMinusVF:
111111
case VPInstruction::CanonicalIVIncrementForPart:
112112
case VPInstruction::AnyOf:
113+
case VPInstruction::BuildStructVector:
114+
case VPInstruction::BuildVector:
113115
return SetResultTyFromOp();
114116
case VPInstruction::FirstActiveLane:
115117
return Type::getIntNTy(Ctx, 64);

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,9 @@ struct Recipe_match {
224224
if ((!matchRecipeAndOpcode<RecipeTys>(R) && ...))
225225
return false;
226226

227+
auto *VPI = dyn_cast<VPInstruction>(R);
228+
if (VPI && VPI->getOpcode() == VPInstruction::BuildVector)
229+
return true;
227230
assert(R->getNumOperands() == std::tuple_size<Ops_t>::value &&
228231
"recipe with matched opcode does not have the expected number of "
229232
"operands");
@@ -263,6 +266,10 @@ struct Recipe_match {
263266
}
264267
};
265268

269+
template <unsigned Opcode, typename... RecipeTys>
270+
using ZeroOpRecipe_match =
271+
Recipe_match<std::tuple<>, Opcode, false, RecipeTys...>;
272+
266273
template <typename Op0_t, unsigned Opcode, typename... RecipeTys>
267274
using UnaryRecipe_match =
268275
Recipe_match<std::tuple<Op0_t>, Opcode, false, RecipeTys...>;
@@ -271,6 +278,9 @@ template <typename Op0_t, unsigned Opcode>
271278
using UnaryVPInstruction_match =
272279
UnaryRecipe_match<Op0_t, Opcode, VPInstruction>;
273280

281+
template <unsigned Opcode>
282+
using ZeroOpVPInstruction_match = ZeroOpRecipe_match<Opcode, VPInstruction>;
283+
274284
template <typename Op0_t, unsigned Opcode>
275285
using AllUnaryRecipe_match =
276286
UnaryRecipe_match<Op0_t, Opcode, VPWidenRecipe, VPReplicateRecipe,
@@ -302,6 +312,10 @@ using AllBinaryRecipe_match =
302312
BinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative, VPWidenRecipe,
303313
VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>;
304314

315+
inline ZeroOpVPInstruction_match<VPInstruction::BuildVector> m_BuildVector() {
316+
return ZeroOpVPInstruction_match<VPInstruction::BuildVector>();
317+
}
318+
305319
template <unsigned Opcode, typename Op0_t>
306320
inline UnaryVPInstruction_match<Op0_t, Opcode>
307321
m_VPInstruction(const Op0_t &Op0) {

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 55 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -504,9 +504,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
504504
}
505505
case Instruction::ExtractElement: {
506506
assert(State.VF.isVector() && "Only extract elements from vectors");
507-
Value *Vec = State.get(getOperand(0));
508-
Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
509-
return Builder.CreateExtractElement(Vec, Idx, Name);
507+
unsigned IdxToExtract =
508+
cast<ConstantInt>(getOperand(1)->getLiveInIRValue())->getZExtValue();
509+
return State.get(getOperand(0), VPLane(IdxToExtract));
510510
}
511511
case Instruction::Freeze: {
512512
Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
@@ -617,6 +617,32 @@ Value *VPInstruction::generate(VPTransformState &State) {
617617
return Builder.CreateVectorSplat(
618618
State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
619619
}
620+
case VPInstruction::BuildStructVector: {
621+
// For struct types, we need to build a new 'wide' struct type, where each
622+
// element is widened, i.e. we create a struct of vectors.
623+
auto *StructTy =
624+
cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0)));
625+
Value *Res = PoisonValue::get(toVectorizedTy(StructTy, State.VF));
626+
for (const auto &[Idx, Op] : enumerate(operands())) {
627+
for (unsigned I = 0; I != StructTy->getNumElements(); I++) {
628+
Value *ScalarValue = Builder.CreateExtractValue(State.get(Op, true), I);
629+
Value *VectorValue = Builder.CreateExtractValue(Res, I);
630+
VectorValue =
631+
Builder.CreateInsertElement(VectorValue, ScalarValue, Idx);
632+
Res = Builder.CreateInsertValue(Res, VectorValue, I);
633+
}
634+
}
635+
return Res;
636+
}
637+
case VPInstruction::BuildVector: {
638+
auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
639+
auto NumOfElements = ElementCount::getFixed(getNumOperands());
640+
Value *Res = PoisonValue::get(toVectorizedTy(ScalarTy, NumOfElements));
641+
for (const auto &[Idx, Op] : enumerate(operands()))
642+
Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
643+
State.Builder.getInt32(Idx));
644+
return Res;
645+
}
620646
case VPInstruction::ReductionStartVector: {
621647
if (State.VF.isScalar())
622648
return State.get(getOperand(0), true);
@@ -935,6 +961,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
935961
case Instruction::ICmp:
936962
case Instruction::Select:
937963
case VPInstruction::AnyOf:
964+
case VPInstruction::BuildStructVector:
965+
case VPInstruction::BuildVector:
938966
case VPInstruction::CalculateTripCountMinusVF:
939967
case VPInstruction::CanonicalIVIncrementForPart:
940968
case VPInstruction::ExtractLastElement:
@@ -1059,6 +1087,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
10591087
case VPInstruction::Broadcast:
10601088
O << "broadcast";
10611089
break;
1090+
case VPInstruction::BuildStructVector:
1091+
O << "buildstructvector";
1092+
break;
1093+
case VPInstruction::BuildVector:
1094+
O << "buildvector";
1095+
break;
10621096
case VPInstruction::ExtractLastElement:
10631097
O << "extract-last-element";
10641098
break;
@@ -2660,45 +2694,30 @@ static void scalarizeInstruction(const Instruction *Instr,
26602694

26612695
void VPReplicateRecipe::execute(VPTransformState &State) {
26622696
Instruction *UI = getUnderlyingInstr();
2663-
if (State.Lane) { // Generate a single instance.
2664-
assert((State.VF.isScalar() || !isSingleScalar()) &&
2665-
"uniform recipe shouldn't be predicated");
2666-
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
2667-
scalarizeInstruction(UI, this, *State.Lane, State);
2668-
// Insert scalar instance packing it into a vector.
2669-
if (State.VF.isVector() && shouldPack()) {
2670-
Value *WideValue;
2671-
// If we're constructing lane 0, initialize to start from poison.
2672-
if (State.Lane->isFirstLane()) {
2673-
assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
2674-
WideValue = PoisonValue::get(VectorType::get(UI->getType(), State.VF));
2675-
} else {
2676-
WideValue = State.get(this);
2677-
}
2678-
State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
2679-
*State.Lane));
2680-
}
2681-
return;
2682-
}
26832697

2684-
if (IsSingleScalar) {
2685-
// Uniform within VL means we need to generate lane 0.
2698+
if (!State.Lane) {
2699+
assert(IsSingleScalar &&
2700+
"VPReplicateRecipes outside replicate regions must be unrolled");
26862701
scalarizeInstruction(UI, this, VPLane(0), State);
26872702
return;
26882703
}
26892704

2690-
// A store of a loop varying value to a uniform address only needs the last
2691-
// copy of the store.
2692-
if (isa<StoreInst>(UI) && vputils::isSingleScalar(getOperand(1))) {
2693-
auto Lane = VPLane::getLastLaneForVF(State.VF);
2694-
scalarizeInstruction(UI, this, VPLane(Lane), State);
2695-
return;
2705+
assert((State.VF.isScalar() || !isSingleScalar()) &&
2706+
"uniform recipe shouldn't be predicated");
2707+
scalarizeInstruction(UI, this, *State.Lane, State);
2708+
// Insert scalar instance packing it into a vector.
2709+
if (State.VF.isVector() && shouldPack()) {
2710+
Value *WideValue;
2711+
// If we're constructing lane 0, initialize to start from poison.
2712+
if (State.Lane->isFirstLane()) {
2713+
assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
2714+
WideValue = PoisonValue::get(VectorType::get(UI->getType(), State.VF));
2715+
} else {
2716+
WideValue = State.get(this);
2717+
}
2718+
State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
2719+
*State.Lane));
26962720
}
2697-
2698-
// Generate scalar instances for all VF lanes.
2699-
const unsigned EndLane = State.VF.getFixedValue();
2700-
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
2701-
scalarizeInstruction(UI, this, VPLane(Lane), State);
27022721
}
27032722

27042723
bool VPReplicateRecipe::shouldPack() const {

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1140,6 +1140,24 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
11401140
return;
11411141
}
11421142

1143+
// Look through ExtractLastElement (BuildVector ....).
1144+
if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>(
1145+
m_BuildVector()))) {
1146+
auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1147+
Def->replaceAllUsesWith(
1148+
BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1149+
return;
1150+
}
1151+
1152+
// Look through ExtractPenultimateElement (BuildVector ....).
1153+
if (match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1154+
m_BuildVector()))) {
1155+
auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
1156+
Def->replaceAllUsesWith(
1157+
BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1158+
return;
1159+
}
1160+
11431161
// Some simplifications can only be applied after unrolling. Perform them
11441162
// below.
11451163
if (!Plan->isUnrolled())

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,12 @@ struct VPlanTransforms {
9999
/// Explicitly unroll \p Plan by \p UF.
100100
static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx);
101101

102+
/// Replace each VPReplicateRecipe outside of any replicate region in \p Plan
103+
/// with \p VF single-scalar recipes.
104+
/// TODO: Also replicate VPReplicateRecipes inside replicate regions, thereby
105+
/// dissolving the latter.
106+
static void replicateByVF(VPlan &Plan, ElementCount VF);
107+
102108
/// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
103109
/// resulting plan to \p BestVF and \p BestUF.
104110
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "VPlan.h"
1616
#include "VPlanAnalysis.h"
1717
#include "VPlanCFG.h"
18+
#include "VPlanHelpers.h"
1819
#include "VPlanPatternMatch.h"
1920
#include "VPlanTransforms.h"
2021
#include "VPlanUtils.h"
@@ -450,3 +451,127 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
450451

451452
VPlanTransforms::removeDeadRecipes(Plan);
452453
}
454+
455+
struct VPReplicateUnroller {
456+
VPlan &Plan;
457+
Type *IdxTy;
458+
DenseMap<VPValue *, SmallVector<VPValue *>> Rep2LaneDefs;
459+
460+
VPReplicateUnroller(VPlan &Plan, Type *IdxTy) : Plan(Plan), IdxTy(IdxTy) {}
461+
462+
void addLaneDef(VPValue *Def, VPValue *LaneDef) {
463+
const auto &[LaneDefs, _] = Rep2LaneDefs.insert({Def, {}});
464+
LaneDefs->second.push_back(LaneDef);
465+
}
466+
467+
VPValue *getLane(VPValue *Op, const VPLane &Lane, VPBuilder &Builder) {
468+
const auto &LaneDefs = Rep2LaneDefs.lookup(Op);
469+
if (LaneDefs.empty()) {
470+
if (Lane.getKind() == VPLane::Kind::ScalableLast) {
471+
return Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op});
472+
}
473+
VPValue *Idx =
474+
Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
475+
return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
476+
}
477+
assert(Lane.getKind() != VPLane::Kind::ScalableLast);
478+
return LaneDefs[Lane.getKnownLane()];
479+
}
480+
481+
VPValue *getLane0(VPReplicateRecipe *RepR) {
482+
VPBuilder B;
483+
return getLane(RepR, VPLane::getFirstLane(), B);
484+
}
485+
};
486+
487+
/// Create a single-scalar clone of \p RepR for lane \p Lane.
488+
static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
489+
Type *IdxTy, VPReplicateRecipe *RepR,
490+
VPLane Lane,
491+
VPReplicateUnroller &State) {
492+
// Collect the operands at Lane, creating extracts as needed.
493+
SmallVector<VPValue *> NewOps;
494+
for (VPValue *Op : RepR->operands()) {
495+
if (vputils::isSingleScalar(Op)) {
496+
NewOps.push_back(Op);
497+
continue;
498+
}
499+
NewOps.push_back(State.getLane(Op, Lane, Builder));
500+
}
501+
502+
auto *New =
503+
new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
504+
/*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
505+
New->insertBefore(RepR);
506+
return New;
507+
}
508+
509+
void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
510+
Type *IdxTy = IntegerType::get(
511+
Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);
512+
VPReplicateUnroller State(Plan, IdxTy);
513+
SmallVector<VPRecipeBase *> ToRemove;
514+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
515+
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
516+
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
517+
auto *Pack = dyn_cast<VPInstruction>(&R);
518+
if (Pack && Pack->getOpcode() == VPInstruction::Pack) {
519+
auto LaneDefs = State.Rep2LaneDefs.lookup(Pack->getOperand(0));
520+
if (!LaneDefs.empty()) {
521+
auto *RepR = cast<VPReplicateRecipe>(Pack->getOperand(0));
522+
// If needed, create a Build(Struct)Vector recipe to insert the scalar
523+
// lane values into a vector.
524+
Type *ResTy = RepR->getUnderlyingInstr()->getType();
525+
VPBuilder Builder(Pack);
526+
VPValue *VecRes = Builder.createNaryOp(
527+
ResTy->isStructTy() ? VPInstruction::BuildStructVector
528+
: VPInstruction::BuildVector,
529+
LaneDefs);
530+
Pack->replaceAllUsesWith(VecRes);
531+
Pack->eraseFromParent();
532+
} else {
533+
assert(!isa<VPReplicateRecipe>(Pack->getOperand(0)));
534+
}
535+
continue;
536+
}
537+
if (Pack && Pack->getOpcode() == VPInstruction::Unpack) {
538+
VPBuilder Builder(Pack);
539+
540+
auto *Def = Pack->getOperand(0);
541+
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) {
542+
VPValue *Idx = Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, I));
543+
State.addLaneDef(Pack, Builder.createNaryOp(
544+
Instruction::ExtractElement, {Def, Idx}));
545+
}
546+
continue;
547+
}
548+
549+
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
550+
if (!RepR || RepR->isSingleScalar())
551+
continue;
552+
553+
VPBuilder Builder(RepR);
554+
ToRemove.push_back(RepR);
555+
// Stores to invariant addresses need to store the last lane only.
556+
if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
557+
vputils::isSingleScalar(RepR->getOperand(1))) {
558+
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
559+
State);
560+
continue;
561+
}
562+
563+
/// Create single-scalar version of RepR for all lanes.
564+
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
565+
State.addLaneDef(
566+
RepR, cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), State));
567+
568+
/// Users that only demand the first lane can use the definition for lane
569+
/// 0.
570+
RepR->replaceUsesWithIf(
571+
State.getLane0(RepR),
572+
[RepR](VPUser &U, unsigned) { return U.onlyFirstLaneUsed(RepR); });
573+
}
574+
}
575+
for (auto *R : reverse(ToRemove))
576+
R->eraseFromParent();
577+
}

0 commit comments

Comments
 (0)