Skip to content

Commit 911055e

Browse files
authored
[VPlan] Consistently use (Part, 0) for first lane scalar values (#80271)
At the moment, some VPInstructions create only a single scalar value, but use VPTransformatState's 'vector' storage for this value. Those values are effectively uniform-per-VF (or in some cases uniform-across-VF-and-UF). Using the vector/per-part storage doesn't interact well with other recipes, that more accurately using (Part, Lane) to look up scalar values and prevents VPInstructions creating scalars from interacting with other recipes working with scalars. This PR tries to unify handling of scalars by using (Part, 0) for scalar values where only the first lane is demanded. This allows using VPInstructions with other recipes like VPScalarCastRecipe and is also needed when using VPInstructions in more cases otuside the vector loop region to generate scalars. Depends on #80269
1 parent 1865c7e commit 911055e

File tree

7 files changed

+79
-51
lines changed

7 files changed

+79
-51
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9127,7 +9127,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
91279127
"Unexpected type.");
91289128

91299129
auto *IVR = getParent()->getPlan()->getCanonicalIV();
9130-
PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9130+
PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
91319131

91329132
if (onlyScalarsGenerated(State.VF.isScalable())) {
91339133
// This is the normalized GEP that starts counting at zero.
@@ -9243,7 +9243,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
92439243

92449244
void VPReductionRecipe::execute(VPTransformState &State) {
92459245
assert(!State.Instance && "Reduction being replicated.");
9246-
Value *PrevInChain = State.get(getChainOp(), 0);
9246+
Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
92479247
RecurKind Kind = RdxDesc.getRecurrenceKind();
92489248
bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
92499249
// Propagate the fast-math flags carried by the underlying instruction.
@@ -9252,8 +9252,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
92529252
for (unsigned Part = 0; Part < State.UF; ++Part) {
92539253
Value *NewVecOp = State.get(getVecOp(), Part);
92549254
if (VPValue *Cond = getCondOp()) {
9255-
Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
9256-
: State.get(Cond, {Part, 0});
9255+
Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
92579256
VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
92589257
Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
92599258
Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
@@ -9278,7 +9277,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
92789277
NewVecOp);
92799278
PrevInChain = NewRed;
92809279
} else {
9281-
PrevInChain = State.get(getChainOp(), Part);
9280+
PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
92829281
NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
92839282
}
92849283
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
@@ -9289,7 +9288,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
92899288
else
92909289
NextInChain = State.Builder.CreateBinOp(
92919290
(Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
9292-
State.set(this, NextInChain, Part);
9291+
State.set(this, NextInChain, Part, /*IsScalar*/ true);
92939292
}
92949293
}
92959294

@@ -9404,7 +9403,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
94049403
// We don't want to update the value in the map as it might be used in
94059404
// another expression. So don't call resetVectorValue(StoredVal).
94069405
}
9407-
auto *VecPtr = State.get(getAddr(), Part);
9406+
auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
94089407
if (isMaskRequired)
94099408
NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
94109409
BlockInMaskParts[Part]);
@@ -9428,7 +9427,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
94289427
nullptr, "wide.masked.gather");
94299428
State.addMetadata(NewLI, LI);
94309429
} else {
9431-
auto *VecPtr = State.get(getAddr(), Part);
9430+
auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
94329431
if (isMaskRequired)
94339432
NewLI = Builder.CreateMaskedLoad(
94349433
DataTy, VecPtr, Alignment, BlockInMaskParts[Part],

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,16 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
242242
return Extract;
243243
}
244244

245-
Value *VPTransformState::get(VPValue *Def, unsigned Part) {
245+
Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
246+
if (NeedsScalar) {
247+
assert((VF.isScalar() || Def->isLiveIn() ||
248+
(hasScalarValue(Def, VPIteration(Part, 0)) &&
249+
Data.PerPartScalars[Def][Part].size() == 1)) &&
250+
"Trying to access a single scalar per part but has multiple scalars "
251+
"per part.");
252+
return get(Def, VPIteration(Part, 0));
253+
}
254+
246255
// If Values have been set for this Def return the one relevant for \p Part.
247256
if (hasVectorValue(Def, Part))
248257
return Data.PerPartOutput[Def][Part];
@@ -789,21 +798,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
789798
auto *TCMO = Builder.CreateSub(TripCountV,
790799
ConstantInt::get(TripCountV->getType(), 1),
791800
"trip.count.minus.1");
792-
auto VF = State.VF;
793-
Value *VTCMO =
794-
VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
795-
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
796-
State.set(BackedgeTakenCount, VTCMO, Part);
801+
BackedgeTakenCount->setUnderlyingValue(TCMO);
797802
}
798803

799-
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
800-
State.set(&VectorTripCount, VectorTripCountV, Part);
804+
VectorTripCount.setUnderlyingValue(VectorTripCountV);
801805

802806
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
803807
// FIXME: Model VF * UF computation completely in VPlan.
804-
State.set(&VFxUF,
805-
createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF),
806-
0);
808+
VFxUF.setUnderlyingValue(
809+
createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
807810

808811
// When vectorizing the epilogue loop, the canonical induction start value
809812
// needs to be changed from zero to the value after the main vector loop.
@@ -884,12 +887,16 @@ void VPlan::execute(VPTransformState *State) {
884887
isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
885888
(isa<VPReductionPHIRecipe>(PhiR) &&
886889
cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
890+
bool NeedsScalar = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
891+
(isa<VPReductionPHIRecipe>(PhiR) &&
892+
cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
887893
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
888894

889895
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
890-
Value *Phi = State->get(PhiR, Part);
891-
Value *Val = State->get(PhiR->getBackedgeValue(),
892-
SinglePartNeeded ? State->UF - 1 : Part);
896+
Value *Phi = State->get(PhiR, Part, NeedsScalar);
897+
Value *Val =
898+
State->get(PhiR->getBackedgeValue(),
899+
SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar);
893900
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
894901
}
895902
}

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -259,9 +259,10 @@ struct VPTransformState {
259259
DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars;
260260
} Data;
261261

262-
/// Get the generated Value for the given VPValue \p Def and the given \p Part.
263-
/// \see set.
264-
Value *get(VPValue *Def, unsigned Part);
262+
/// Get the generated vector Value for a given VPValue \p Def and a given \p
263+
/// Part if \p IsScalar is false, otherwise return the generated scalar
264+
/// for \p Part. \See set.
265+
Value *get(VPValue *Def, unsigned Part, bool IsScalar = false);
265266

266267
/// Get the generated Value for a given VPValue and given Part and Lane.
267268
Value *get(VPValue *Def, const VPIteration &Instance);
@@ -282,14 +283,22 @@ struct VPTransformState {
282283
I->second[Instance.Part][CacheIdx];
283284
}
284285

285-
/// Set the generated Value for a given VPValue and a given Part.
286-
void set(VPValue *Def, Value *V, unsigned Part) {
286+
/// Set the generated vector Value for a given VPValue and a given Part, if \p
287+
/// IsScalar is false. If \p IsScalar is true, set the scalar in (Part, 0).
288+
void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar = false) {
289+
if (IsScalar) {
290+
set(Def, V, VPIteration(Part, 0));
291+
return;
292+
}
293+
assert((VF.isScalar() || V->getType()->isVectorTy()) &&
294+
"scalar values must be stored as (Part, 0)");
287295
if (!Data.PerPartOutput.count(Def)) {
288296
DataState::PerPartValuesTy Entry(UF);
289297
Data.PerPartOutput[Def] = Entry;
290298
}
291299
Data.PerPartOutput[Def][Part] = V;
292300
}
301+
293302
/// Reset an existing vector value for \p Def and a given \p Part.
294303
void reset(VPValue *Def, Value *V, unsigned Part) {
295304
auto Iter = Data.PerPartOutput.find(Def);
@@ -1376,6 +1385,13 @@ class VPScalarCastRecipe : public VPSingleDefRecipe {
13761385

13771386
/// Returns the result type of the cast.
13781387
Type *getResultType() const { return ResultTy; }
1388+
1389+
bool onlyFirstLaneUsed(const VPValue *Op) const override {
1390+
// At the moment, only uniform codegen is implemented.
1391+
assert(is_contained(operands(), Op) &&
1392+
"Op must be an operand of the recipe");
1393+
return true;
1394+
}
13791395
};
13801396

13811397
/// A recipe for widening Call instructions.

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -279,11 +279,12 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
279279
Builder.SetCurrentDebugLocation(getDebugLoc());
280280

281281
if (Instruction::isBinaryOp(getOpcode())) {
282+
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
282283
if (Part != 0 && vputils::onlyFirstPartUsed(this))
283-
return State.get(this, 0);
284+
return State.get(this, 0, OnlyFirstLaneUsed);
284285

285-
Value *A = State.get(getOperand(0), Part);
286-
Value *B = State.get(getOperand(1), Part);
286+
Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
287+
Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
287288
auto *Res =
288289
Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
289290
if (auto *I = dyn_cast<Instruction>(Res))
@@ -385,8 +386,8 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
385386
if (Part != 0)
386387
return nullptr;
387388
// First create the compare.
388-
Value *IV = State.get(getOperand(0), Part);
389-
Value *TC = State.get(getOperand(1), Part);
389+
Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
390+
Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
390391
Value *Cond = Builder.CreateICmpEQ(IV, TC);
391392

392393
// Now create the branch.
@@ -407,7 +408,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
407408
}
408409
case VPInstruction::ComputeReductionResult: {
409410
if (Part != 0)
410-
return State.get(this, 0);
411+
return State.get(this, 0, /*IsScalar*/ true);
411412

412413
// FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
413414
// and will be removed by breaking up the recipe further.
@@ -424,7 +425,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
424425
Type *PhiTy = OrigPhi->getType();
425426
VectorParts RdxParts(State.UF);
426427
for (unsigned Part = 0; Part < State.UF; ++Part)
427-
RdxParts[Part] = State.get(LoopExitingDef, Part);
428+
RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());
428429

429430
// If the vector reduction can be performed in a smaller type, we truncate
430431
// then extend the loop exit value to enable InstCombine to evaluate the
@@ -512,9 +513,15 @@ void VPInstruction::execute(VPTransformState &State) {
512513
if (!hasResult())
513514
continue;
514515
assert(GeneratedValue && "generateInstruction must produce a value");
515-
State.set(this, GeneratedValue, Part);
516+
517+
bool IsVector = GeneratedValue->getType()->isVectorTy();
518+
State.set(this, GeneratedValue, Part, !IsVector);
519+
assert((IsVector || getOpcode() == VPInstruction::ComputeReductionResult ||
520+
State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
521+
"scalar value but not only first lane used");
516522
}
517523
}
524+
518525
bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
519526
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
520527
if (Instruction::isBinaryOp(getOpcode()))
@@ -530,8 +537,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
530537
case VPInstruction::CalculateTripCountMinusVF:
531538
case VPInstruction::CanonicalIVIncrementForPart:
532539
case VPInstruction::BranchOnCount:
533-
// TODO: Cover additional operands.
534-
return getOperand(0) == Op;
540+
return true;
535541
};
536542
llvm_unreachable("switch should return");
537543
}
@@ -1344,7 +1350,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) {
13441350
PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
13451351
}
13461352

1347-
State.set(this, PartPtr, Part);
1353+
State.set(this, PartPtr, Part, /*IsScalar*/ true);
13481354
}
13491355
}
13501356

@@ -1640,7 +1646,7 @@ void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
16401646
EntryPart->addIncoming(Start, VectorPH);
16411647
EntryPart->setDebugLoc(getDebugLoc());
16421648
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1643-
State.set(this, EntryPart, Part);
1649+
State.set(this, EntryPart, Part, /*IsScalar*/ true);
16441650
}
16451651

16461652
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1711,7 +1717,7 @@ void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
17111717
#endif
17121718

17131719
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
1714-
Value *CanonicalIV = State.get(getOperand(0), 0);
1720+
Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
17151721
Type *STy = CanonicalIV->getType();
17161722
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
17171723
ElementCount VF = State.VF;
@@ -1801,7 +1807,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
18011807
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
18021808
Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
18031809
EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
1804-
State.set(this, EntryPart, Part);
1810+
State.set(this, EntryPart, Part, IsInLoop);
18051811
}
18061812

18071813
BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
@@ -1833,7 +1839,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
18331839
}
18341840

18351841
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1836-
Value *EntryPart = State.get(this, Part);
1842+
Value *EntryPart = State.get(this, Part, IsInLoop);
18371843
// Make sure to add the reduction start value only to the
18381844
// first unroll part.
18391845
Value *StartVal = (Part == 0) ? StartV : Iden;

llvm/lib/Transforms/Vectorize/VPlanValue.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,6 @@ class VPValue {
7373
// for multiple underlying IRs (Polly?) by providing a new VPlan front-end,
7474
// back-end and analysis information for the new IR.
7575

76-
// Set \p Val as the underlying Value of this VPValue.
77-
void setUnderlyingValue(Value *Val) {
78-
assert(!UnderlyingVal && "Underlying Value is already set.");
79-
UnderlyingVal = Val;
80-
}
81-
8276
public:
8377
/// Return the underlying Value attached to this VPValue.
8478
Value *getUnderlyingValue() { return UnderlyingVal; }
@@ -192,6 +186,12 @@ class VPValue {
192186
/// is a live-in value.
193187
/// TODO: Also handle recipes defined in pre-header blocks.
194188
bool isDefinedOutsideVectorRegions() const { return !hasDefiningRecipe(); }
189+
190+
// Set \p Val as the underlying Value of this VPValue.
191+
void setUnderlyingValue(Value *Val) {
192+
assert(!UnderlyingVal && "Underlying Value is already set.");
193+
UnderlyingVal = Val;
194+
}
195195
};
196196

197197
typedef DenseMap<Value *, VPValue *> Value2VPValueTy;

llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,10 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
117117
; DATA_NO_LANEMASK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
118118
; DATA_NO_LANEMASK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
119119
; DATA_NO_LANEMASK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1
120-
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
121-
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
122120
; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
123121
; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
122+
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
123+
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
124124
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
125125
; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
126126
; DATA_NO_LANEMASK-NEXT: br label [[VECTOR_BODY:%.*]]

llvm/test/Transforms/LoopVectorize/X86/small-size.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ define void @example2(i32 %n, i32 %x) optsize {
142142
; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_116]], i64 0
143143
; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer
144144
; CHECK-NEXT: br label [[VECTOR_BODY19:%.*]]
145-
; CHECK: vector.body19:
145+
; CHECK: vector.body17:
146146
; CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT31:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
147147
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX20]]
148148
; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX20]], i64 0

0 commit comments

Comments
 (0)