Skip to content

Commit 85cb01c

Browse files
committed
[VPlan] Unroll VPReplicateRecipes by VF.
1 parent dec8f13 commit 85cb01c

16 files changed

+195
-69
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7557,6 +7557,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
75577557
// cost model is complete for better cost estimates.
75587558
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
75597559
OrigLoop->getHeader()->getContext());
7560+
VPlanTransforms::runPass(VPlanTransforms::unrollByVF, BestVPlan, BestVF);
75607561
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
75617562
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
75627563
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,14 @@ Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) {
261261
return Data.VPV2Scalars[Def][0];
262262
}
263263

264+
// Look through BuildVector to avoid redundant extracts.
265+
// TODO: Remove once replicate regions are unrolled explicitly.
266+
auto *BV = dyn_cast<VPInstruction>(Def);
267+
if (Lane.getKind() == VPLane::Kind::First && BV &&
268+
BV->getOpcode() == VPInstruction::BuildVector) {
269+
return get(BV->getOperand(Lane.getKnownLane()), true);
270+
}
271+
264272
assert(hasVectorValue(Def));
265273
auto *VecPart = Data.VPV2Vector[Def];
266274
if (!VecPart->getType()->isVectorTy()) {

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,12 @@ class VPInstruction : public VPRecipeWithIRFlags,
907907
BranchOnCount,
908908
BranchOnCond,
909909
Broadcast,
910+
/// Creates a vector containing all operands. The vector element count
911+
/// matches the number of operands.
912+
BuildVector,
913+
/// Creates a struct of vectors containing all operands. The vector element
914+
/// count matches the number of operands.
915+
BuildStructVector,
910916
ComputeFindLastIVResult,
911917
ComputeReductionResult,
912918
// Extracts the last lane from its operand if it is a vector, or the last

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
104104
case VPInstruction::CalculateTripCountMinusVF:
105105
case VPInstruction::CanonicalIVIncrementForPart:
106106
case VPInstruction::AnyOf:
107+
case VPInstruction::BuildVector:
108+
case VPInstruction::BuildStructVector:
107109
return SetResultTyFromOp();
108110
case VPInstruction::FirstActiveLane:
109111
return Type::getIntNTy(Ctx, 64);

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
493493
}
494494
case Instruction::ExtractElement: {
495495
assert(State.VF.isVector() && "Only extract elements from vectors");
496+
return State.get(getOperand(0),
497+
VPLane(cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
498+
->getZExtValue()));
496499
Value *Vec = State.get(getOperand(0));
497500
Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
498501
return Builder.CreateExtractElement(Vec, Idx, Name);
@@ -604,6 +607,34 @@ Value *VPInstruction::generate(VPTransformState &State) {
604607
return Builder.CreateVectorSplat(
605608
State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
606609
}
610+
case VPInstruction::BuildVector: {
611+
auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
612+
Value *Res = PoisonValue::get(
613+
toVectorizedTy(ScalarTy, ElementCount::getFixed(getNumOperands())));
614+
for (const auto &[Idx, Op] : enumerate(operands()))
615+
Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
616+
State.Builder.getInt32(Idx));
617+
return Res;
618+
}
619+
case VPInstruction::BuildStructVector: {
620+
// For struct types, we need to build a new 'wide' struct type, where each
621+
// element is widened.
622+
auto *STy =
623+
cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0)));
624+
Value *Res = PoisonValue::get(
625+
toVectorizedTy(STy, ElementCount::getFixed(getNumOperands())));
626+
for (const auto &[Idx, Op] : enumerate(operands())) {
627+
for (unsigned I = 0, E = STy->getNumElements(); I != E; I++) {
628+
Value *ScalarValue = Builder.CreateExtractValue(State.get(Op, true), I);
629+
Value *VectorValue = Builder.CreateExtractValue(Res, I);
630+
VectorValue =
631+
Builder.CreateInsertElement(VectorValue, ScalarValue, Idx);
632+
Res = Builder.CreateInsertValue(Res, VectorValue, I);
633+
}
634+
}
635+
return Res;
636+
}
637+
607638
case VPInstruction::ComputeFindLastIVResult: {
608639
// FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
609640
// and will be removed by breaking up the recipe further.
@@ -864,10 +895,11 @@ void VPInstruction::execute(VPTransformState &State) {
864895
if (!hasResult())
865896
return;
866897
assert(GeneratedValue && "generate must produce a value");
867-
assert(
868-
(GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
869-
State.VF.isScalar()) &&
870-
"scalar value but not only first lane defined");
898+
assert((((GeneratedValue->getType()->isVectorTy() ||
899+
GeneratedValue->getType()->isStructTy()) ==
900+
!GeneratesPerFirstLaneOnly) ||
901+
State.VF.isScalar()) &&
902+
"scalar value but not only first lane defined");
871903
State.set(this, GeneratedValue,
872904
/*IsScalar*/ GeneratesPerFirstLaneOnly);
873905
}
@@ -881,6 +913,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
881913
case Instruction::ICmp:
882914
case Instruction::Select:
883915
case VPInstruction::AnyOf:
916+
case VPInstruction::BuildVector:
917+
case VPInstruction::BuildStructVector:
884918
case VPInstruction::CalculateTripCountMinusVF:
885919
case VPInstruction::CanonicalIVIncrementForPart:
886920
case VPInstruction::ExtractLastElement:
@@ -999,6 +1033,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
9991033
case VPInstruction::Broadcast:
10001034
O << "broadcast";
10011035
break;
1036+
case VPInstruction::BuildVector:
1037+
O << "buildvector";
1038+
break;
1039+
case VPInstruction::BuildStructVector:
1040+
O << "buildstructvector";
1041+
break;
10021042
case VPInstruction::ExtractLastElement:
10031043
O << "extract-last-element";
10041044
break;
@@ -2758,20 +2798,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
27582798
scalarizeInstruction(UI, this, VPLane(0), State);
27592799
return;
27602800
}
2761-
2762-
// A store of a loop varying value to a uniform address only needs the last
2763-
// copy of the store.
2764-
if (isa<StoreInst>(UI) && vputils::isSingleScalar(getOperand(1))) {
2765-
auto Lane = VPLane::getLastLaneForVF(State.VF);
2766-
scalarizeInstruction(UI, this, VPLane(Lane), State);
2767-
return;
2768-
}
2769-
2770-
// Generate scalar instances for all VF lanes.
2771-
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
2772-
const unsigned EndLane = State.VF.getKnownMinValue();
2773-
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
2774-
scalarizeInstruction(UI, this, VPLane(Lane), State);
27752801
}
27762802

27772803
bool VPReplicateRecipe::shouldPack() const {

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,6 +1142,22 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
11421142
return;
11431143
}
11441144

1145+
// Look through Extract(Last|Penultimate)Element (BuildVector ....).
1146+
if (match(&R,
1147+
m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) ||
1148+
match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1149+
m_VPValue(A)))) {
1150+
unsigned Offset = cast<VPInstruction>(&R)->getOpcode() ==
1151+
VPInstruction::ExtractLastElement
1152+
? 1
1153+
: 2;
1154+
auto *BV = dyn_cast<VPInstruction>(A);
1155+
if (BV && BV->getOpcode() == VPInstruction::BuildVector) {
1156+
Def->replaceAllUsesWith(BV->getOperand(BV->getNumOperands() - Offset));
1157+
return;
1158+
}
1159+
}
1160+
11451161
// Some simplifications can only be applied after unrolling. Perform them
11461162
// below.
11471163
if (!Plan->isUnrolled())

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,10 @@ struct VPlanTransforms {
9999
/// Explicitly unroll \p Plan by \p UF.
100100
static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx);
101101

102+
/// Explicitly unroll VPReplicateRecipes outside of replicate regions by \p
103+
/// VF.
104+
static void unrollByVF(VPlan &Plan, ElementCount VF);
105+
102106
/// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
103107
/// resulting plan to \p BestVF and \p BestUF.
104108
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "VPlan.h"
1616
#include "VPlanAnalysis.h"
1717
#include "VPlanCFG.h"
18+
#include "VPlanHelpers.h"
1819
#include "VPlanPatternMatch.h"
1920
#include "VPlanTransforms.h"
2021
#include "VPlanUtils.h"
@@ -428,3 +429,83 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
428429

429430
VPlanTransforms::removeDeadRecipes(Plan);
430431
}
432+
433+
/// Create a single-scalar clone of RepR for lane \p Lane.
434+
static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
435+
Type *IdxTy, VPReplicateRecipe *RepR,
436+
VPLane Lane) {
437+
// Collect the operands at Lane, creating extracts as needed.
438+
SmallVector<VPValue *> NewOps;
439+
for (VPValue *Op : RepR->operands()) {
440+
if (vputils::isSingleScalar(Op)) {
441+
NewOps.push_back(Op);
442+
continue;
443+
}
444+
VPValue *Ext;
445+
if (Lane.getKind() == VPLane::Kind::ScalableLast) {
446+
Ext = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op});
447+
} else {
448+
// Look through buildvector to avoid unnecessary extracts.
449+
auto *BV = dyn_cast<VPInstruction>(Op);
450+
if (BV && BV->getOpcode() == VPInstruction::BuildVector) {
451+
NewOps.push_back(BV->getOperand(Lane.getKnownLane()));
452+
continue;
453+
}
454+
VPValue *Idx =
455+
Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
456+
Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
457+
}
458+
NewOps.push_back(Ext);
459+
}
460+
461+
auto *New =
462+
new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
463+
/*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
464+
New->insertBefore(RepR);
465+
return New;
466+
}
467+
468+
/// Replace every VPReplicateRecipe that is not single-scalar, in the
/// VPBasicBlocks reached by a shallow depth-first walk from the entry of
/// \p Plan's vector loop region, with explicit per-lane single-scalar clones.
///
/// A replicated store whose address operand (operand 1) is single-scalar is
/// replaced by one clone for the last lane of \p VF. Every other recipe gets
/// one clone per lane; users that only demand the first lane are rewired to
/// the lane-0 clone, and, if the underlying instruction produces a value, the
/// remaining users are rewired to a Build(Struct)Vector of all lane clones.
void VPlanTransforms::unrollByVF(VPlan &Plan, ElementCount VF) {
  Type *IdxTy = IntegerType::get(
      Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
    // Early-inc iteration: the current recipe is erased inside the loop body.
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR || RepR->isSingleScalar())
        continue;

      VPBuilder Builder(RepR);
      // Stores to invariant addresses only need to store the last lane.
      if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
          vputils::isSingleScalar(RepR->getOperand(1))) {
        cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF));
        RepR->eraseFromParent();
        continue;
      }

      // Replicating per lane needs a compile-time known lane count. This
      // mirrors the check previously performed in VPReplicateRecipe::execute
      // before it generated scalar instances for all VF lanes.
      assert(!VF.isScalable() && "Can't scalarize a scalable vector");
      const unsigned NumLanes = VF.getKnownMinValue();

      // Create single-scalar version of RepR for all lanes.
      SmallVector<VPValue *> LaneDefs;
      LaneDefs.reserve(NumLanes);
      for (unsigned I = 0; I != NumLanes; ++I)
        LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));

      // Users that only demand the first lane can use the definition for lane
      // 0.
      RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
        return U.onlyFirstLaneUsed(RepR);
      });

      Type *ResTy = RepR->getUnderlyingInstr()->getType();
      // If needed, create a Build(Struct)Vector recipe to insert the scalar
      // lane values into a vector.
      if (!ResTy->isVoidTy()) {
        VPValue *VecRes = Builder.createNaryOp(
            ResTy->isStructTy() ? VPInstruction::BuildStructVector
                                : VPInstruction::BuildVector,
            LaneDefs);
        RepR->replaceAllUsesWith(VecRes);
      }
      RepR->eraseFromParent();
    }
  }
}

llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -398,12 +398,6 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
398398
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
399399
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[STRIDED_VEC]], i32 3
400400
; CHECK-NEXT: store float [[TMP30]], ptr [[C:%.*]], align 4
401-
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 0
402-
; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP31]], align 4
403-
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 1
404-
; CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[TMP33]], align 4
405-
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 2
406-
; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[TMP35]], align 4
407401
; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 3
408402
; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP37]], align 4
409403
; CHECK-NEXT: store float [[TMP36]], ptr [[B:%.*]], align 4

0 commit comments

Comments
 (0)