
Commit c354bdc

[VPlan] Unroll VPReplicateRecipes by VF.
1 parent 7119a0f commit c354bdc

16 files changed: +194 -69 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 0 deletions
@@ -7291,6 +7291,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // cost model is complete for better cost estimates.
   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
                            OrigLoop->getHeader()->getContext());
+  VPlanTransforms::runPass(VPlanTransforms::unrollByVF, BestVPlan, BestVF);
   VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
   VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 8 additions & 0 deletions
@@ -261,6 +261,14 @@ Value *VPTransformState::get(const VPValue *Def, const VPLane &Lane) {
     return Data.VPV2Scalars[Def][0];
   }
 
+  // Look through BuildVector to avoid redundant extracts.
+  // TODO: Remove once replicate regions are unrolled explicitly.
+  auto *BV = dyn_cast<VPInstruction>(Def);
+  if (Lane.getKind() == VPLane::Kind::First && BV &&
+      BV->getOpcode() == VPInstruction::BuildVector) {
+    return get(BV->getOperand(Lane.getKnownLane()), true);
+  }
+
   assert(hasVectorValue(Def));
   auto *VecPart = Data.VPV2Vector[Def];
   if (!VecPart->getType()->isVectorTy()) {

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 6 additions & 0 deletions
@@ -907,6 +907,12 @@ class VPInstruction : public VPRecipeWithIRFlags,
     BranchOnCount,
     BranchOnCond,
     Broadcast,
+    /// Creates a vector containing all operands. The vector element count
+    /// matches the number of operands.
+    BuildVector,
+    /// Creates a struct of vectors containing all operands. The vector element
+    /// count matches the number of operands.
+    BuildStructVector,
     ComputeAnyOfResult,
     ComputeFindLastIVResult,
     ComputeReductionResult,
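
Editorial note: as a rough illustration of how the two new opcodes are meant to be used, here is a minimal C++ sketch, not part of this patch. The helper name packLanes is hypothetical; VPBuilder::createNaryOp is the existing builder helper that this commit also uses in VPlanUnroll.cpp below.

// Hypothetical helper, for illustration only: pack one single-scalar VPValue
// per lane into a BuildVector recipe (BuildStructVector when the scalar
// result type is a struct). Mirrors the tail of unrollByVF further down.
static VPValue *packLanes(VPBuilder &Builder, ArrayRef<VPValue *> LaneDefs,
                          Type *ResTy) {
  return Builder.createNaryOp(ResTy->isStructTy()
                                  ? VPInstruction::BuildStructVector
                                  : VPInstruction::BuildVector,
                              LaneDefs);
}

The element count of the resulting recipe equals LaneDefs.size(), matching the doc comments above.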

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 2 additions & 0 deletions
@@ -107,6 +107,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::AnyOf:
+  case VPInstruction::BuildVector:
+  case VPInstruction::BuildStructVector:
     return SetResultTyFromOp();
   case VPInstruction::FirstActiveLane:
     return Type::getIntNTy(Ctx, 64);

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 43 additions & 18 deletions
@@ -493,6 +493,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
   case Instruction::ExtractElement: {
     assert(State.VF.isVector() && "Only extract elements from vectors");
+    return State.get(getOperand(0),
+                     VPLane(cast<ConstantInt>(getOperand(1)->getLiveInIRValue())
+                                ->getZExtValue()));
     Value *Vec = State.get(getOperand(0));
     Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
     return Builder.CreateExtractElement(Vec, Idx, Name);
@@ -604,6 +607,33 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return Builder.CreateVectorSplat(
         State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
   }
+  case VPInstruction::BuildVector: {
+    auto *ScalarTy = State.TypeAnalysis.inferScalarType(getOperand(0));
+    Value *Res = PoisonValue::get(
+        toVectorizedTy(ScalarTy, ElementCount::getFixed(getNumOperands())));
+    for (const auto &[Idx, Op] : enumerate(operands()))
+      Res = State.Builder.CreateInsertElement(Res, State.get(Op, true),
+                                              State.Builder.getInt32(Idx));
+    return Res;
+  }
+  case VPInstruction::BuildStructVector: {
+    // For struct types, we need to build a new 'wide' struct type, where each
+    // element is widened.
+    auto *STy =
+        cast<StructType>(State.TypeAnalysis.inferScalarType(getOperand(0)));
+    Value *Res = PoisonValue::get(
+        toVectorizedTy(STy, ElementCount::getFixed(getNumOperands())));
+    for (const auto &[Idx, Op] : enumerate(operands())) {
+      for (unsigned I = 0, E = STy->getNumElements(); I != E; I++) {
+        Value *ScalarValue = Builder.CreateExtractValue(State.get(Op, true), I);
+        Value *VectorValue = Builder.CreateExtractValue(Res, I);
+        VectorValue =
+            Builder.CreateInsertElement(VectorValue, ScalarValue, Idx);
+        Res = Builder.CreateInsertValue(Res, VectorValue, I);
+      }
+    }
+    return Res;
+  }
   case VPInstruction::ComputeAnyOfResult: {
     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
     // and will be removed by breaking up the recipe further.
@@ -872,10 +902,11 @@ void VPInstruction::execute(VPTransformState &State) {
   if (!hasResult())
     return;
   assert(GeneratedValue && "generate must produce a value");
-  assert(
-      (GeneratedValue->getType()->isVectorTy() == !GeneratesPerFirstLaneOnly ||
-       State.VF.isScalar()) &&
-      "scalar value but not only first lane defined");
+  assert((((GeneratedValue->getType()->isVectorTy() ||
+            GeneratedValue->getType()->isStructTy()) ==
+           !GeneratesPerFirstLaneOnly) ||
+          State.VF.isScalar()) &&
+         "scalar value but not only first lane defined");
   State.set(this, GeneratedValue,
             /*IsScalar*/ GeneratesPerFirstLaneOnly);
 }
@@ -889,6 +920,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case Instruction::ICmp:
   case Instruction::Select:
   case VPInstruction::AnyOf:
+  case VPInstruction::BuildVector:
+  case VPInstruction::BuildStructVector:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::ExtractLastElement:
@@ -1008,6 +1041,12 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::Broadcast:
     O << "broadcast";
     break;
+  case VPInstruction::BuildVector:
+    O << "buildvector";
+    break;
+  case VPInstruction::BuildStructVector:
+    O << "buildstructvector";
+    break;
   case VPInstruction::ExtractLastElement:
     O << "extract-last-element";
     break;
@@ -2763,20 +2802,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
     scalarizeInstruction(UI, this, VPLane(0), State);
     return;
   }
-
-  // A store of a loop varying value to a uniform address only needs the last
-  // copy of the store.
-  if (isa<StoreInst>(UI) && vputils::isSingleScalar(getOperand(1))) {
-    auto Lane = VPLane::getLastLaneForVF(State.VF);
-    scalarizeInstruction(UI, this, VPLane(Lane), State);
-    return;
-  }
-
-  // Generate scalar instances for all VF lanes.
-  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-  const unsigned EndLane = State.VF.getKnownMinValue();
-  for (unsigned Lane = 0; Lane < EndLane; ++Lane)
-    scalarizeInstruction(UI, this, VPLane(Lane), State);
 }
 
 bool VPReplicateRecipe::shouldPack() const {
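
Editorial note: for readers less familiar with the generate() hook, the BuildVector case above boils down to the usual poison-plus-insertelement idiom. A self-contained IRBuilder sketch of that shape, illustrative only and not part of the patch (emitBuildVector is a made-up name):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Illustrative equivalent of the BuildVector lowering above: start from a
// poison vector of Lanes.size() elements and insert each scalar lane value
// at its index.
static Value *emitBuildVector(IRBuilder<> &B, ArrayRef<Value *> Lanes,
                              Type *ScalarTy) {
  Value *Res = PoisonValue::get(FixedVectorType::get(ScalarTy, Lanes.size()));
  for (auto [Idx, Lane] : enumerate(Lanes))
    Res = B.CreateInsertElement(Res, Lane, B.getInt32(Idx));
  return Res;
}

The BuildStructVector case does the same per struct field: it extracts field I from each lane's struct value and inserts it into the corresponding vector member of the widened struct.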

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 16 additions & 0 deletions
@@ -1140,6 +1140,22 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
+  // Look through Extract(Last|Penultimate)Element (BuildVector ....).
+  if (match(&R,
+            m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) ||
+      match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
+                    m_VPValue(A)))) {
+    unsigned Offset = cast<VPInstruction>(&R)->getOpcode() ==
+                              VPInstruction::ExtractLastElement
+                          ? 1
+                          : 2;
+    auto *BV = dyn_cast<VPInstruction>(A);
+    if (BV && BV->getOpcode() == VPInstruction::BuildVector) {
+      Def->replaceAllUsesWith(BV->getOperand(BV->getNumOperands() - Offset));
+      return;
+    }
+  }
+
   // Some simplifications can only be applied after unrolling. Perform them
   // below.
   if (!Plan->isUnrolled())
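
Editorial note: a minimal sketch, not part of the patch, of the fold the new block implements; for buildvector(%a, %b, %c, %d), extract-last-element folds to %d and extract-penultimate-element folds to %c. The helper name foldExtractOfBuildVector is hypothetical; the member calls are the same ones used above.

// Hypothetical helper, for illustration: given an ExtractLastElement or
// ExtractPenultimateElement whose operand is a BuildVector, return the
// BuildVector operand the extract reduces to, or nullptr if it doesn't apply.
static VPValue *foldExtractOfBuildVector(VPInstruction *Extract) {
  unsigned Offset =
      Extract->getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
  auto *BV = dyn_cast<VPInstruction>(Extract->getOperand(0));
  if (!BV || BV->getOpcode() != VPInstruction::BuildVector)
    return nullptr;
  return BV->getOperand(BV->getNumOperands() - Offset);
}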

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 4 additions & 0 deletions
@@ -99,6 +99,10 @@ struct VPlanTransforms {
   /// Explicitly unroll \p Plan by \p UF.
   static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx);
 
+  /// Explicitly unroll VPReplicateRecipes outside of replicate regions by \p
+  /// VF.
+  static void unrollByVF(VPlan &Plan, ElementCount VF);
+
   /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
   /// resulting plan to \p BestVF and \p BestUF.
   static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 81 additions & 0 deletions
@@ -15,6 +15,7 @@
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "VPlanCFG.h"
+#include "VPlanHelpers.h"
 #include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanUtils.h"
@@ -430,3 +431,83 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
 
   VPlanTransforms::removeDeadRecipes(Plan);
 }
+
+/// Create a single-scalar clone of RepR for lane \p Lane.
+static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
+                                       Type *IdxTy, VPReplicateRecipe *RepR,
+                                       VPLane Lane) {
+  // Collect the operands at Lane, creating extracts as needed.
+  SmallVector<VPValue *> NewOps;
+  for (VPValue *Op : RepR->operands()) {
+    if (vputils::isSingleScalar(Op)) {
+      NewOps.push_back(Op);
+      continue;
+    }
+    VPValue *Ext;
+    if (Lane.getKind() == VPLane::Kind::ScalableLast) {
+      Ext = Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op});
+    } else {
+      // Look through BuildVector to avoid unnecessary extracts.
+      auto *BV = dyn_cast<VPInstruction>(Op);
+      if (BV && BV->getOpcode() == VPInstruction::BuildVector) {
+        NewOps.push_back(BV->getOperand(Lane.getKnownLane()));
+        continue;
+      }
+      VPValue *Idx =
+          Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+      Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
+    }
+    NewOps.push_back(Ext);
+  }
+
+  auto *New =
+      new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
+                            /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
+  New->insertBefore(RepR);
+  return New;
+}
+
+void VPlanTransforms::unrollByVF(VPlan &Plan, ElementCount VF) {
+  Type *IdxTy = IntegerType::get(
+      Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+      if (!RepR || RepR->isSingleScalar())
+        continue;
+
+      VPBuilder Builder(RepR);
+      SmallVector<VPValue *> LaneDefs;
+      // Stores to invariant addresses only need to store the last lane.
+      if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
+          vputils::isSingleScalar(RepR->getOperand(1))) {
+        cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF));
+        RepR->eraseFromParent();
+        continue;
+      }
+
+      // Create a single-scalar version of RepR for all lanes.
+      for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
+        LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));
+
+      // Users that only demand the first lane can use the definition for lane
+      // 0.
+      RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
+        return U.onlyFirstLaneUsed(RepR);
+      });
+
+      Type *ResTy = RepR->getUnderlyingInstr()->getType();
+      // If needed, create a Build(Struct)Vector recipe to insert the scalar
+      // lane values into a vector.
+      if (!ResTy->isVoidTy()) {
+        VPValue *VecRes = Builder.createNaryOp(
+            ResTy->isStructTy() ? VPInstruction::BuildStructVector
+                                : VPInstruction::BuildVector,
+            LaneDefs);
+        RepR->replaceAllUsesWith(VecRes);
+      }
+      RepR->eraseFromParent();
+    }
+  }
+}
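
Editorial note: at the IR level, each non-single-scalar VPReplicateRecipe is now replaced by VF single-scalar clones before execution, so the emitted code for e.g. a replicated load is one extract plus one scalar load per lane. A standalone IRBuilder sketch of that shape, illustrative only and not part of the patch (emitUnrolledLoads is a made-up name, fixed VF assumed):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Illustrative shape of the code produced for a replicated load once the
// recipe has been unrolled by VF: extract the lane's pointer from the
// vector-of-pointers operand and issue one scalar load per lane.
static SmallVector<Value *, 4> emitUnrolledLoads(IRBuilder<> &B, Value *PtrVec,
                                                 Type *EltTy, unsigned VF) {
  SmallVector<Value *, 4> Lanes;
  for (unsigned I = 0; I != VF; ++I) {
    Value *Ptr = B.CreateExtractElement(PtrVec, B.getInt32(I));
    Lanes.push_back(B.CreateLoad(EltTy, Ptr));
  }
  return Lanes;
}

When a vector user remains, the lane values are packed back together with the new BuildVector recipe; users that only demand the first lane are rewired to the lane-0 clone, and a store to a uniform address keeps only the last-lane clone. The test update below shows a follow-on effect: once the load is unrolled per lane in VPlan, the lane 0-2 clones have no users (only the last lane feeds the store to the uniform address) and are removed as dead, so their extractelement/load pairs no longer appear.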

llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll

Lines changed: 0 additions & 6 deletions
@@ -398,12 +398,6 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x float> [[STRIDED_VEC]], i32 3
 ; CHECK-NEXT:    store float [[TMP30]], ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 0
-; CHECK-NEXT:    [[TMP38:%.*]] = load float, ptr [[TMP31]], align 4
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 1
-; CHECK-NEXT:    [[TMP32:%.*]] = load float, ptr [[TMP33]], align 4
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 2
-; CHECK-NEXT:    [[TMP34:%.*]] = load float, ptr [[TMP35]], align 4
 ; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <4 x ptr> [[TMP29]], i32 3
 ; CHECK-NEXT:    [[TMP36:%.*]] = load float, ptr [[TMP37]], align 4
 ; CHECK-NEXT:    store float [[TMP36]], ptr [[B:%.*]], align 4
