Skip to content

Commit ee468ec

Browse files
[LoopVectorize][AArch64][SVE] Generate wide active lane masks
This patch makes LoopVectorize generate lane masks longer than the VF to allow the target to better utilise the instruction set. The vectorizer emits one or more wide `llvm.get.active.lane.mask.*` calls plus several `llvm.vector.extract.*` calls to yield the required number of VF-wide masks. The motivating example is a vectorised loop with unroll factor 2 that can use the SVE2.1 `whilelo` instruction with predicate pair result, or an SVE `whilelo` instruction with smaller element size plus `punpklo`/`punpkhi`. The width of the lane mask that the vectoriser emits is controlled by the TargetTransformInfo hook `getMaxPredicateLength`. The default implementation (return the same length as the VF) keeps the change non-functional for targets that can't or are not prepared to handle wider lane masks.
1 parent ae86278 commit ee468ec

23 files changed

+3235
-1336
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1244,6 +1244,8 @@ class TargetTransformInfo {
12441244
/// and the number of execution units in the CPU.
12451245
unsigned getMaxInterleaveFactor(ElementCount VF) const;
12461246

1247+
/// \return The maximum length of an active lane mask (predicate) that the
/// loop vectorizer may emit for a loop vectorized with \p VF. The
/// target-independent default returns \p VF itself, i.e. masks are never
/// wider than the vectorization factor.
ElementCount getMaxPredicateLength(ElementCount VF) const;
1248+
12471249
/// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
12481250
static OperandValueInfo getOperandInfo(const Value *V);
12491251

@@ -2002,6 +2004,9 @@ class TargetTransformInfo::Concept {
20022004
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;
20032005

20042006
virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;
2007+
2008+
virtual ElementCount getMaxPredicateLength(ElementCount VF) const = 0;
2009+
20052010
virtual InstructionCost getArithmeticInstrCost(
20062011
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
20072012
OperandValueInfo Opd1Info, OperandValueInfo Opd2Info,
@@ -2627,6 +2632,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
26272632
unsigned getMaxInterleaveFactor(ElementCount VF) override {
26282633
return Impl.getMaxInterleaveFactor(VF);
26292634
}
2635+
2636+
// Forward the query to the wrapped target implementation.
ElementCount getMaxPredicateLength(ElementCount VF) const override {
2637+
return Impl.getMaxPredicateLength(VF);
2638+
}
2639+
26302640
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
26312641
unsigned &JTSize,
26322642
ProfileSummaryInfo *PSI,

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,8 @@ class TargetTransformInfoImplBase {
537537

538538
unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
539539

540+
// Default: the predicate is exactly as long as the VF, so no wide lane masks
// are requested.
ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }
541+
540542
InstructionCost getArithmeticInstrCost(
541543
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
542544
TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info,

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -890,6 +890,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
890890

891891
unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
892892

893+
// Default: the predicate is exactly as long as the VF, so no wide lane masks
// are requested.
ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }
894+
893895
InstructionCost getArithmeticInstrCost(
894896
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
895897
TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -821,6 +821,10 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
821821
return TTIImpl->getMaxInterleaveFactor(VF);
822822
}
823823

824+
// Dispatch the maximum-predicate-length query to the target implementation.
ElementCount TargetTransformInfo::getMaxPredicateLength(ElementCount VF) const {
825+
return TTIImpl->getMaxPredicateLength(VF);
826+
}
827+
824828
TargetTransformInfo::OperandValueInfo
825829
TargetTransformInfo::getOperandInfo(const Value *V) {
826830
OperandValueKind OpInfo = OK_AnyValue;

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3383,6 +3383,15 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
33833383
return ST->getMaxInterleaveFactor();
33843384
}
33853385

3386+
/// Return the longest active lane mask the vectorizer may emit for \p VF.
///
/// With SVE the result is at most 16 (known-minimum) elements and at most
/// twice the VF, with the same scalability as \p VF. The result is never
/// shorter than \p VF: when no wider mask is available, \p VF itself is
/// returned, matching the target-independent default.
ElementCount AArch64TTIImpl::getMaxPredicateLength(ElementCount VF) const {
  // Without SVE, fall back to the target-independent default (a mask exactly
  // as long as the VF) instead of reporting a zero-length predicate.
  if (!ST->hasSVE())
    return VF;

  const unsigned MinVF = VF.getKnownMinValue();
  // Do not create masks bigger than `<vscale x 16 x i1>`, nor masks that are
  // more than twice the VF.
  const unsigned N = std::min(16u, 2 * MinVF);

  // A mask that is not strictly longer than the VF brings nothing; report the
  // VF itself so callers never see a predicate shorter than the VF.
  if (N <= MinVF)
    return VF;

  return ElementCount::get(N, VF.isScalable());
}
3394+
33863395
// For Falkor, we want to avoid having too many strided loads in a loop since
33873396
// that can exhaust the HW prefetcher resources. We adjust the unroller
33883397
// MaxCount preference below to attempt to ensure unrolling doesn't create too

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
157157

158158
unsigned getMaxInterleaveFactor(ElementCount VF);
159159

160+
ElementCount getMaxPredicateLength(ElementCount VF) const;
161+
160162
bool prefersVectorizedAddressing() const;
161163

162164
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,14 @@ class VPBuilder {
202202
VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
203203
DebugLoc DL = {}, const Twine &Name = "");
204204

205+
/// Create a VPActiveLaneMaskRecipe over \p IV and \p TC. The new recipe is
/// inserted at the current insertion point when the builder has an insertion
/// block; otherwise it is returned without being inserted.
VPValue *createGetActiveLaneMask(VPValue *IV, VPValue *TC, DebugLoc DL,
                                 const Twine &Name = "") {
  VPActiveLaneMaskRecipe *Mask = new VPActiveLaneMaskRecipe(IV, TC, DL, Name);
  if (!BB)
    return Mask;
  BB->insert(Mask, InsertPt);
  return Mask;
}
212+
205213
//===--------------------------------------------------------------------===//
206214
// RAII helpers.
207215
//===--------------------------------------------------------------------===//

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,10 @@ class InnerLoopVectorizer {
588588
/// count of the original loop for both main loop and epilogue vectorization.
589589
void setTripCount(Value *TC) { TripCount = TC; }
590590

591+
// Ask the target, through TTI, for the maximum predicate length for VF.
ElementCount getMaxPredicateLength(ElementCount VF) const {
592+
return TTI->getMaxPredicateLength(VF);
593+
}
594+
591595
protected:
592596
friend class LoopVectorizationPlanner;
593597

@@ -7509,7 +7513,8 @@ LoopVectorizationPlanner::executePlan(
75097513
LLVM_DEBUG(BestVPlan.dump());
75107514

75117515
// Perform the actual loop transformation.
7512-
VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7516+
VPTransformState State(BestVF, BestUF, TTI.getMaxPredicateLength(BestVF), LI,
7517+
DT, ILV.Builder, &ILV, &BestVPlan,
75137518
OrigLoop->getHeader()->getContext());
75147519

75157520
// 0. Generate SCEV-dependent code into the preheader, including TripCount,

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -215,12 +215,13 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
215215
return It;
216216
}
217217

218-
VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
218+
VPTransformState::VPTransformState(ElementCount VF, unsigned UF,
219+
ElementCount MaxPred, LoopInfo *LI,
219220
DominatorTree *DT, IRBuilderBase &Builder,
220221
InnerLoopVectorizer *ILV, VPlan *Plan,
221222
LLVMContext &Ctx)
222-
: VF(VF), UF(UF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
223-
LVer(nullptr),
223+
: VF(VF), UF(UF), MaxPred(MaxPred), CFG(DT), LI(LI), Builder(Builder),
224+
ILV(ILV), Plan(Plan), LVer(nullptr),
224225
TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {}
225226

226227
Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -235,13 +235,14 @@ struct VPIteration {
235235
/// VPTransformState holds information passed down when "executing" a VPlan,
236236
/// needed for generating the output IR.
237237
struct VPTransformState {
238-
VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
239-
DominatorTree *DT, IRBuilderBase &Builder,
238+
VPTransformState(ElementCount VF, unsigned UF, ElementCount MaxPred,
239+
LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
240240
InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx);
241241

242242
/// The chosen Vectorization and Unroll Factors of the loop being vectorized.
243243
ElementCount VF;
244244
unsigned UF;
245+
ElementCount MaxPred;
245246

246247
/// Hold the indices to generate specific scalar instructions. Null indicates
247248
/// that all instances are to be generated, using either scalar or vector
@@ -1174,7 +1175,6 @@ class VPInstruction : public VPRecipeWithIRFlags {
11741175
Not,
11751176
SLPLoad,
11761177
SLPStore,
1177-
ActiveLaneMask,
11781178
ExplicitVectorLength,
11791179
CalculateTripCountMinusVF,
11801180
// Increment the canonical IV separately for each unrolled part.
@@ -1329,6 +1329,50 @@ class VPInstruction : public VPRecipeWithIRFlags {
13291329
}
13301330
};
13311331

1332+
class VPActiveLaneMaskRecipe : public VPRecipeWithIRFlags {
1333+
const std::string Name;
1334+
1335+
public:
1336+
VPActiveLaneMaskRecipe(VPValue *IV, VPValue *TC, DebugLoc DL = {},
1337+
const Twine &Name = "")
1338+
: VPRecipeWithIRFlags(VPDef::VPActiveLaneMaskSC,
1339+
std::initializer_list<VPValue *>{IV, TC}, DL),
1340+
Name(Name.str()) {}
1341+
1342+
VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskSC)
1343+
1344+
VPActiveLaneMaskRecipe *clone() override {
1345+
SmallVector<VPValue *, 2> Operands(operands());
1346+
assert(Operands.size() == 2 && "by construction");
1347+
auto *New = new VPActiveLaneMaskRecipe(Operands[0], Operands[1],
1348+
getDebugLoc(), Name);
1349+
New->transferFlags(*this);
1350+
return New;
1351+
}
1352+
1353+
void execute(VPTransformState &State) override;
1354+
1355+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1356+
/// Print the recipe.
1357+
void print(raw_ostream &O, const Twine &Indent,
1358+
VPSlotTracker &SlotTracker) const override;
1359+
#endif
1360+
1361+
bool onlyFirstLaneUsed(const VPValue *Op) const override {
1362+
assert(is_contained(operands(), Op) &&
1363+
"Op must be an operand of the recipe");
1364+
1365+
return true;
1366+
}
1367+
1368+
bool onlyFirstPartUsed(const VPValue *Op) const override {
1369+
assert(is_contained(operands(), Op) &&
1370+
"Op must be an operand of the recipe");
1371+
1372+
return false;
1373+
}
1374+
};
1375+
13321376
/// VPWidenRecipe is a recipe for producing a copy of vector type its
13331377
/// ingredient. This recipe covers most of the traditional vectorization cases
13341378
/// where each ingredient transforms into a vectorized version of itself.

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -221,11 +221,6 @@ m_BranchOnCond(const Op0_t &Op0) {
221221
return m_VPInstruction<VPInstruction::BranchOnCond>(Op0);
222222
}
223223

224-
template <typename Op0_t, typename Op1_t>
225-
inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask>
226-
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
227-
return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
228-
}
229224

230225
template <typename Op0_t, typename Op1_t>
231226
inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::BranchOnCount>
@@ -296,6 +291,35 @@ inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::LogicalAnd>
296291
m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) {
297292
return m_VPInstruction<VPInstruction::LogicalAnd, Op0_t, Op1_t>(Op0, Op1);
298293
}
294+
295+
template <typename Op0_t, typename Op1_t>
296+
struct VPActiveLaneMask_match {
297+
Op0_t Op0;
298+
Op1_t Op1;
299+
300+
VPActiveLaneMask_match(Op0_t Op0, Op1_t Op1) : Op0(Op0), Op1(Op1) {}
301+
302+
bool match(const VPValue *V) {
303+
auto *DefR = V->getDefiningRecipe();
304+
return DefR && match(DefR);
305+
}
306+
307+
bool match(const VPRecipeBase *R) {
308+
auto *DefR = dyn_cast<VPActiveLaneMaskRecipe>(R);
309+
if (!DefR)
310+
return false;
311+
assert(DefR->getNumOperands() == 2 &&
312+
"recipe with matched opcode does not have 2 operands");
313+
return Op0.match(DefR->getOperand(0)) && Op1.match(DefR->getOperand(1));
314+
}
315+
};
316+
317+
template <typename Op0_t, typename Op1_t>
318+
inline VPActiveLaneMask_match<Op0_t, Op1_t>
319+
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
320+
return {Op0, Op1};
321+
}
322+
299323
} // namespace VPlanPatternMatch
300324
} // namespace llvm
301325

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 86 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -351,24 +351,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
351351
Value *Op2 = State.get(getOperand(2), Part);
352352
return Builder.CreateSelect(Cond, Op1, Op2, Name);
353353
}
354-
case VPInstruction::ActiveLaneMask: {
355-
// Get first lane of vector induction variable.
356-
Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
357-
// Get the original loop tripcount.
358-
Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));
359354

360-
// If this part of the active lane mask is scalar, generate the CMP directly
361-
// to avoid unnecessary extracts.
362-
if (State.VF.isScalar())
363-
return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
364-
Name);
365-
366-
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
367-
auto *PredTy = VectorType::get(Int1Ty, State.VF);
368-
return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
369-
{PredTy, ScalarTC->getType()},
370-
{VIVElem0, ScalarTC}, nullptr, Name);
371-
}
372355
case VPInstruction::FirstOrderRecurrenceSplice: {
373356
// Generate code to combine the previous and current values in vector v3.
374357
//
@@ -636,7 +619,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
636619
case VPInstruction::PtrAdd:
637620
// TODO: Cover additional opcodes.
638621
return vputils::onlyFirstLaneUsed(this);
639-
case VPInstruction::ActiveLaneMask:
640622
case VPInstruction::ExplicitVectorLength:
641623
case VPInstruction::CalculateTripCountMinusVF:
642624
case VPInstruction::CanonicalIVIncrementForPart:
@@ -671,9 +653,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
671653
case VPInstruction::SLPStore:
672654
O << "combined store";
673655
break;
674-
case VPInstruction::ActiveLaneMask:
675-
O << "active lane mask";
676-
break;
677656
case VPInstruction::ExplicitVectorLength:
678657
O << "EXPLICIT-VECTOR-LENGTH";
679658
break;
@@ -713,8 +692,94 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
713692
DL.print(O);
714693
}
715694
}
695+
696+
/// Print the recipe as "EMIT <result> = active lane mask<flags><operands>",
/// followed by the debug location when one is attached.
void VPActiveLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);

  O << " = active lane mask";
  printFlags(O);
  printOperands(O, SlotTracker);

  auto Loc = getDebugLoc();
  if (!Loc)
    return;
  O << ", !dbg ";
  Loc.print(O);
}
710+
716711
#endif
717712

713+
/// Generate the active lane mask of every unrolled part and record it in
/// \p State.
///
/// Three strategies are used:
///  * Scalar VF: one `icmp ult IV, TC` per part.
///  * No usable wide predicate: one VF-wide `llvm.get.active.lane.mask` per
///    part. This is taken when UF <= 1, when the maximum predicate length is
///    not strictly longer than the VF, when it is not a multiple of the VF, or
///    when the parts cannot be evenly grouped into wide masks.
///  * Otherwise: one wide `llvm.get.active.lane.mask` per group of
///    `PartsPerMask` consecutive parts, followed by one `llvm.vector.extract`
///    per part to carve the VF-wide mask out of the wide one.
///
/// Parts are visited from last to first in every loop; the order is kept
/// as-is because it determines the order of the emitted instructions.
void VPActiveLaneMaskRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "VPActiveLaneMaskRecipe executing an Instance");

  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(getDebugLoc());

  const int NumParts = static_cast<int>(State.UF);

  // If the active lane mask is scalar, generate the CMP directly to avoid
  // unnecessary extracts.
  if (State.VF.isScalar()) {
    for (int Part = NumParts - 1; Part >= 0; --Part) {
      // Get first lane of vector induction variable.
      Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
      // Get the original loop tripcount.
      Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));

      Value *V = Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0,
                                   ScalarTC, Name);
      State.set(this, V, Part);
    }
    return;
  }

  auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
  auto *PredTy = VectorType::get(Int1Ty, State.VF);

  const unsigned MinVF = State.VF.getKnownMinValue();
  // A wide mask never needs to cover more lanes than all parts together.
  const unsigned MaxPred =
      std::min(State.MaxPred.getKnownMinValue(), State.UF * MinVF);
  const unsigned PartsPerMask = MaxPred / MinVF;

  // Wide masks are only usable when they are strictly longer than the VF, a
  // whole multiple of it, and the parts split evenly into groups of
  // PartsPerMask. The last check keeps LongMask below from being indexed out
  // of bounds; it is evaluated only after `MaxPred > MinVF` is established,
  // so PartsPerMask is non-zero there.
  const bool UseWideMasks = State.UF > 1 && MaxPred > MinVF &&
                            MaxPred % MinVF == 0 &&
                            State.UF % PartsPerMask == 0;
  if (!UseWideMasks) {
    for (int Part = NumParts - 1; Part >= 0; --Part) {
      // Get first lane of vector induction variable.
      Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
      // Get the original loop tripcount.
      Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
      Value *V = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                         {PredTy, ScalarTC->getType()},
                                         {VIVElem0, ScalarTC}, nullptr, Name);
      State.set(this, V, Part);
    }
    return;
  }

  // Generate long active lane masks covering all the unrolled iterations.
  const int GroupSize = static_cast<int>(PartsPerMask);
  auto *LongPredTy = VectorType::get(Int1Ty, MaxPred, State.VF.isScalable());
  SmallVector<Value *> LongMask(State.UF / PartsPerMask, nullptr);
  for (int Part = NumParts - GroupSize; Part >= 0; Part -= GroupSize) {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
    Value *V = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                       {LongPredTy, ScalarTC->getType()},
                                       {VIVElem0, ScalarTC}, nullptr, Name);
    LongMask[Part / GroupSize] = V;
  }

  // Carve the VF-wide mask of each part out of the wide mask of its group.
  for (int Part = NumParts - 1; Part >= 0; --Part) {
    Value *ALM = LongMask[Part / GroupSize];
    const unsigned I = Part % GroupSize;
    Value *V = Builder.CreateIntrinsic(
        Intrinsic::vector_extract, {PredTy, ALM->getType()},
        {ALM, ConstantInt::get(Type::getInt64Ty(Builder.getContext()),
                               I * MinVF)},
        nullptr, Name);

    State.set(this, V, Part);
  }
}
782+
718783
void VPWidenCallRecipe::execute(VPTransformState &State) {
719784
assert(State.VF.isVector() && "not widening");
720785
Function *CalledScalarFn = getCalledScalarFunction();

0 commit comments

Comments
 (0)