Skip to content

Commit b65dbdf

Browse files
[LoopVectorize][AArch64][SVE] Generate wide active lane masks
This patch makes LoopVectorize generate lane masks longer than the VF to allow the target to better utilise the instruction set. The vectoriser emits one or more wide `llvm.get.active.lane.mask.*` calls plus several `llvm.vector.extract.*` calls to yield the required number of VF-wide masks. The motivating example is a vectorised loop with unroll factor 2 that can use the SVE2.1 `whilelo` instruction with a predicate pair result, or an SVE `whilelo` instruction with a smaller element size plus `punpklo`/`punpkhi`. The width of the lane mask that the vectoriser emits is controlled by a TargetTransformInfo hook, `getMaxPredicateLength`. The default implementation (return the same length as the VF) keeps the change non-functional for targets that can't or are not prepared to handle wider lane masks.
1 parent 28cdbbe commit b65dbdf

21 files changed

+3253
-1388
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,6 +1228,8 @@ class TargetTransformInfo {
12281228
/// and the number of execution units in the CPU.
12291229
unsigned getMaxInterleaveFactor(ElementCount VF) const;
12301230

1231+
/// \return The length of the widest active lane mask the loop vectorizer
/// may emit for a loop vectorized with \p VF. The target-independent
/// default implementations return \p VF itself, i.e. masks are never wider
/// than the vectorization factor.
ElementCount getMaxPredicateLength(ElementCount VF) const;
1232+
12311233
/// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
12321234
static OperandValueInfo getOperandInfo(const Value *V);
12331235

@@ -1981,6 +1983,9 @@ class TargetTransformInfo::Concept {
19811983
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;
19821984

19831985
virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;
1986+
1987+
virtual ElementCount getMaxPredicateLength(ElementCount VF) const = 0;
1988+
19841989
virtual InstructionCost getArithmeticInstrCost(
19851990
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
19861991
OperandValueInfo Opd1Info, OperandValueInfo Opd2Info,
@@ -2601,6 +2606,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
26012606
unsigned getMaxInterleaveFactor(ElementCount VF) override {
26022607
return Impl.getMaxInterleaveFactor(VF);
26032608
}
2609+
2610+
/// Forward the query to the wrapped target implementation.
ElementCount getMaxPredicateLength(ElementCount VF) const override {
  ElementCount MaxLength = Impl.getMaxPredicateLength(VF);
  return MaxLength;
}
2613+
26042614
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
26052615
unsigned &JTSize,
26062616
ProfileSummaryInfo *PSI,

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,8 @@ class TargetTransformInfoImplBase {
524524

525525
unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
526526

527+
/// Default answer: an active lane mask is exactly as long as \p VF.
ElementCount getMaxPredicateLength(ElementCount VF) const {
  return VF;
}
528+
527529
InstructionCost getArithmeticInstrCost(
528530
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
529531
TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info,

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -882,6 +882,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
882882

883883
unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
884884

885+
/// Default answer: an active lane mask is exactly as long as \p VF.
ElementCount getMaxPredicateLength(ElementCount VF) const {
  return VF;
}
886+
885887
InstructionCost getArithmeticInstrCost(
886888
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
887889
TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -808,6 +808,10 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
808808
return TTIImpl->getMaxInterleaveFactor(VF);
809809
}
810810

811+
/// Ask the target-specific implementation how long an active lane mask may be
/// for a loop vectorized with \p VF.
ElementCount TargetTransformInfo::getMaxPredicateLength(ElementCount VF) const {
  ElementCount MaxLength = TTIImpl->getMaxPredicateLength(VF);
  return MaxLength;
}
814+
811815
TargetTransformInfo::OperandValueInfo
812816
TargetTransformInfo::getOperandInfo(const Value *V) {
813817
OperandValueKind OpInfo = OK_AnyValue;

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3324,6 +3324,15 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
33243324
return ST->getMaxInterleaveFactor();
33253325
}
33263326

3327+
/// Return the length of the widest active lane mask to generate for \p VF.
///
/// Without SVE, \p VF is returned unchanged, so masks are never widened. With
/// SVE the mask may be up to twice as long as the VF, capped at 16 lanes
/// (`<vscale x 16 x i1>` for scalable VFs). The result is never shorter than
/// \p VF and keeps the scalable/fixed property of \p VF.
ElementCount AArch64TTIImpl::getMaxPredicateLength(ElementCount VF) const {
  // A zero-lane ElementCount is not a meaningful "maximum length"; report
  // "no wider than the VF" instead.
  if (!ST->hasSVE())
    return VF;

  const unsigned MinVF = VF.getKnownMinValue();
  // Do not create masks bigger than `<vscale x 16 x i1>`, nor masks that are
  // more than twice the VF.
  unsigned N = std::min(16u, 2 * MinVF);
  // A VF that already exceeds the cap cannot be widened; keep its own length
  // rather than reporting a maximum below the VF.
  N = std::max(N, MinVF);
  return ElementCount::get(N, VF.isScalable());
}
3335+
33273336
// For Falkor, we want to avoid having too many strided loads in a loop since
33283337
// that can exhaust the HW prefetcher resources. We adjust the unroller
33293338
// MaxCount preference below to attempt to ensure unrolling doesn't create too

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
157157

158158
unsigned getMaxInterleaveFactor(ElementCount VF);
159159

160+
ElementCount getMaxPredicateLength(ElementCount VF) const;
161+
160162
bool prefersVectorizedAddressing() const;
161163

162164
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,14 @@ class VPBuilder {
184184
VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
185185
DebugLoc DL = {}, const Twine &Name = "");
186186

187+
/// Create a VPActiveLaneMaskRecipe with operands \p IV and \p TC, debug
/// location \p DL and name \p Name. The recipe is inserted at the current
/// insertion point when the builder has an insertion block; otherwise it is
/// returned without being inserted anywhere.
VPValue *createGetActiveLaneMask(VPValue *IV, VPValue *TC, DebugLoc DL,
                                 const Twine &Name = "") {
  VPActiveLaneMaskRecipe *Recipe =
      new VPActiveLaneMaskRecipe(IV, TC, DL, Name);
  if (!BB)
    return Recipe;
  BB->insert(Recipe, InsertPt);
  return Recipe;
}
194+
187195
//===--------------------------------------------------------------------===//
188196
// RAII helpers.
189197
//===--------------------------------------------------------------------===//

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,10 @@ class InnerLoopVectorizer {
594594
/// count of the original loop for both main loop and epilogue vectorization.
595595
void setTripCount(Value *TC) { TripCount = TC; }
596596

597+
/// Return the target's answer for how long an active lane mask may be when
/// vectorizing with \p VF.
ElementCount getMaxPredicateLength(ElementCount VF) const {
  ElementCount MaxLength = TTI->getMaxPredicateLength(VF);
  return MaxLength;
}
600+
597601
protected:
598602
friend class LoopVectorizationPlanner;
599603

@@ -7470,7 +7474,8 @@ LoopVectorizationPlanner::executePlan(
74707474
LLVM_DEBUG(BestVPlan.dump());
74717475

74727476
// Perform the actual loop transformation.
7473-
VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7477+
VPTransformState State(BestVF, BestUF, TTI.getMaxPredicateLength(BestVF), LI,
7478+
DT, ILV.Builder, &ILV, &BestVPlan,
74747479
OrigLoop->getHeader()->getContext());
74757480

74767481
// 0. Generate SCEV-dependent code into the preheader, including TripCount,

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,12 +212,13 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
212212
return It;
213213
}
214214

215-
VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
215+
VPTransformState::VPTransformState(ElementCount VF, unsigned UF,
216+
ElementCount MaxPred, LoopInfo *LI,
216217
DominatorTree *DT, IRBuilderBase &Builder,
217218
InnerLoopVectorizer *ILV, VPlan *Plan,
218219
LLVMContext &Ctx)
219-
: VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
220-
LVer(nullptr),
220+
: VF(VF), UF(UF), MaxPred(MaxPred), LI(LI), DT(DT), Builder(Builder),
221+
ILV(ILV), Plan(Plan), LVer(nullptr),
221222
TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {}
222223

223224
Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -234,13 +234,14 @@ struct VPIteration {
234234
/// VPTransformState holds information passed down when "executing" a VPlan,
235235
/// needed for generating the output IR.
236236
struct VPTransformState {
237-
VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
238-
DominatorTree *DT, IRBuilderBase &Builder,
237+
VPTransformState(ElementCount VF, unsigned UF, ElementCount MaxPred,
238+
LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
239239
InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx);
240240

241241
/// The chosen Vectorization and Unroll Factors of the loop being vectorized.
242242
ElementCount VF;
243243
unsigned UF;
244+
ElementCount MaxPred;
244245

245246
/// Hold the indices to generate specific scalar instructions. Null indicates
246247
/// that all instances are to be generated, using either scalar or vector
@@ -1148,7 +1149,6 @@ class VPInstruction : public VPRecipeWithIRFlags {
11481149
Not,
11491150
SLPLoad,
11501151
SLPStore,
1151-
ActiveLaneMask,
11521152
CalculateTripCountMinusVF,
11531153
// Increment the canonical IV separately for each unrolled part.
11541154
CanonicalIVIncrementForPart,
@@ -1271,6 +1271,50 @@ class VPInstruction : public VPRecipeWithIRFlags {
12711271
}
12721272
};
12731273

1274+
class VPActiveLaneMaskRecipe : public VPRecipeWithIRFlags {
1275+
const std::string Name;
1276+
1277+
public:
1278+
VPActiveLaneMaskRecipe(VPValue *IV, VPValue *TC, DebugLoc DL = {},
1279+
const Twine &Name = "")
1280+
: VPRecipeWithIRFlags(VPDef::VPActiveLaneMaskSC,
1281+
std::initializer_list<VPValue *>{IV, TC}, DL),
1282+
Name(Name.str()) {}
1283+
1284+
VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskSC)
1285+
1286+
VPRecipeBase *clone() override {
1287+
SmallVector<VPValue *, 2> Operands(operands());
1288+
assert(Operands.size() == 2 && "by construction");
1289+
auto *New = new VPActiveLaneMaskRecipe(Operands[0], Operands[1],
1290+
getDebugLoc(), Name);
1291+
New->transferFlags(*this);
1292+
return New;
1293+
}
1294+
1295+
void execute(VPTransformState &State) override;
1296+
1297+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1298+
/// Print the recipe.
1299+
void print(raw_ostream &O, const Twine &Indent,
1300+
VPSlotTracker &SlotTracker) const override;
1301+
#endif
1302+
1303+
bool onlyFirstLaneUsed(const VPValue *Op) const override {
1304+
assert(is_contained(operands(), Op) &&
1305+
"Op must be an operand of the recipe");
1306+
1307+
return true;
1308+
}
1309+
1310+
bool onlyFirstPartUsed(const VPValue *Op) const override {
1311+
assert(is_contained(operands(), Op) &&
1312+
"Op must be an operand of the recipe");
1313+
1314+
return false;
1315+
}
1316+
};
1317+
12741318
/// VPWidenRecipe is a recipe for producing a copy of vector type its
12751319
/// ingredient. This recipe covers most of the traditional vectorization cases
12761320
/// where each ingredient transforms into a vectorized version of itself.

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 70 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -308,18 +308,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
308308
Value *Op2 = State.get(getOperand(2), Part);
309309
return Builder.CreateSelect(Cond, Op1, Op2, Name);
310310
}
311-
case VPInstruction::ActiveLaneMask: {
312-
// Get first lane of vector induction variable.
313-
Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
314-
// Get the original loop tripcount.
315-
Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));
316311

317-
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
318-
auto *PredTy = VectorType::get(Int1Ty, State.VF);
319-
return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
320-
{PredTy, ScalarTC->getType()},
321-
{VIVElem0, ScalarTC}, nullptr, Name);
322-
}
323312
case VPInstruction::FirstOrderRecurrenceSplice: {
324313
// Generate code to combine the previous and current values in vector v3.
325314
//
@@ -533,7 +522,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
533522
case Instruction::ICmp:
534523
// TODO: Cover additional opcodes.
535524
return vputils::onlyFirstLaneUsed(this);
536-
case VPInstruction::ActiveLaneMask:
537525
case VPInstruction::CalculateTripCountMinusVF:
538526
case VPInstruction::CanonicalIVIncrementForPart:
539527
case VPInstruction::BranchOnCount:
@@ -567,9 +555,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
567555
case VPInstruction::SLPStore:
568556
O << "combined store";
569557
break;
570-
case VPInstruction::ActiveLaneMask:
571-
O << "active lane mask";
572-
break;
573558
case VPInstruction::FirstOrderRecurrenceSplice:
574559
O << "first-order splice";
575560
break;
@@ -600,8 +585,78 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
600585
DL.print(O);
601586
}
602587
}
588+
589+
/// Print the recipe: "EMIT", the defined value, " = active lane mask", the
/// flags and the operands, followed by ", !dbg <location>" when the recipe
/// carries a debug location.
void VPActiveLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);

  O << " = active lane mask";
  printFlags(O);
  printOperands(O, SlotTracker);

  // Nothing more to print without a debug location.
  DebugLoc Loc = getDebugLoc();
  if (!Loc)
    return;

  O << ", !dbg ";
  Loc.print(O);
}
603+
603604
#endif
604605

606+
/// Generate the active lane mask for every unrolled part and record it in
/// \p State.
///
/// Two shapes of IR are produced:
///  * Per-part masks: one `llvm.get.active.lane.mask` call of VF lanes for
///    each part.
///  * Wide masks: one `llvm.get.active.lane.mask` call per group of
///    consecutive parts, producing a mask of `PartsPerMask * VF` lanes, from
///    which each part's VF-lane mask is taken with `llvm.vector.extract`.
///
/// Wide masks are only used when the loop is unrolled, the permitted mask
/// length (State.MaxPred, capped at UF * VF lanes) is a whole multiple of the
/// VF greater than one, and the unroll factor splits evenly into groups of
/// that many parts. Otherwise the per-part form is emitted.
void VPActiveLaneMaskRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "VPActiveLaneMaskRecipe executing an Instance");

  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(getDebugLoc());

  auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
  auto *PredTy = VectorType::get(Int1Ty, State.VF);
  const unsigned MinVF = State.VF.getKnownMinValue();

  // Emit one get.active.lane.mask call of type MaskTy, starting at the first
  // lane of unrolled part Part.
  auto EmitLaneMask = [&](Type *MaskTy, unsigned Part) -> Value * {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {MaskTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  };

  // A mask never needs to cover more lanes than all unrolled parts together.
  const unsigned MaxPred =
      std::min(State.MaxPred.getKnownMinValue(), State.UF * MinVF);

  // The last condition guards the indexing of LongMask below: if UF is not a
  // multiple of the number of parts per wide mask, the wide masks could not
  // cover every part. The earlier conditions guarantee MaxPred / MinVF >= 2
  // by the time it is evaluated.
  const bool UseWideMasks = State.UF > 1 && MaxPred > MinVF &&
                            MaxPred % MinVF == 0 &&
                            State.UF % (MaxPred / MinVF) == 0;
  if (!UseWideMasks) {
    // Parts are visited from last to first, which fixes the order in which
    // the intrinsic calls are emitted.
    for (int Part = State.UF - 1; Part >= 0; --Part)
      State.set(this, EmitLaneMask(PredTy, Part), Part);
    return;
  }

  // Generate long active lane masks covering all the unrolled iterations.
  const unsigned PartsPerMask = MaxPred / MinVF;
  auto *LongPredTy = VectorType::get(Int1Ty, MaxPred, State.VF.isScalable());
  SmallVector<Value *> LongMask(State.UF / PartsPerMask, nullptr);
  for (int Part = State.UF - PartsPerMask; Part >= 0; Part -= PartsPerMask)
    LongMask[Part / PartsPerMask] = EmitLaneMask(LongPredTy, Part);

  // Carve each part's VF-wide mask out of the wide mask that covers it.
  for (int Part = State.UF - 1; Part >= 0; --Part) {
    Value *ALM = LongMask[Part / PartsPerMask];
    const unsigned I = Part % PartsPerMask;
    Value *V = Builder.CreateIntrinsic(
        Intrinsic::vector_extract, {PredTy, ALM->getType()},
        {ALM, ConstantInt::get(Type::getInt64Ty(Builder.getContext()),
                               I * MinVF)},
        nullptr, Name);

    State.set(this, V, Part);
  }
}
659+
605660
void VPWidenCallRecipe::execute(VPTransformState &State) {
606661
assert(State.VF.isVector() && "not widening");
607662
auto &CI = *cast<CallInst>(getUnderlyingInstr());

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -600,8 +600,7 @@ static bool canSimplifyBranchOnCond(VPInstruction *Term) {
600600
if (!Not || Not->getOpcode() != VPInstruction::Not)
601601
return false;
602602

603-
VPInstruction *ALM = dyn_cast<VPInstruction>(Not->getOperand(0));
604-
return ALM && ALM->getOpcode() == VPInstruction::ActiveLaneMask;
603+
return dyn_cast<VPActiveLaneMaskRecipe>(Not->getOperand(0));
605604
}
606605

607606
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
@@ -1151,9 +1150,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
11511150
"index.part.next");
11521151

11531152
// Create the active lane mask instruction in the VPlan preheader.
1154-
auto *EntryALM =
1155-
Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
1156-
DL, "active.lane.mask.entry");
1153+
auto *EntryALM = Builder.createGetActiveLaneMask(EntryIncrement, TC, DL,
1154+
"active.lane.mask.entry");
11571155

11581156
// Now create the ActiveLaneMaskPhi recipe in the main loop using the
11591157
// preheader ActiveLaneMask instruction.
@@ -1167,9 +1165,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
11671165
auto *InLoopIncrement =
11681166
Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
11691167
{IncrementValue}, {false, false}, DL);
1170-
auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
1171-
{InLoopIncrement, TripCount}, DL,
1172-
"active.lane.mask.next");
1168+
auto *ALM = Builder.createGetActiveLaneMask(InLoopIncrement, TripCount, DL,
1169+
"active.lane.mask.next");
11731170
LaneMaskPhi->addOperand(ALM);
11741171

11751172
// Replace the original terminator with BranchOnCond. We have to invert the
@@ -1200,9 +1197,8 @@ void VPlanTransforms::addActiveLaneMask(
12001197
LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
12011198
Plan, DataAndControlFlowWithoutRuntimeCheck);
12021199
} else {
1203-
LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
1204-
{WideCanonicalIV, Plan.getTripCount()},
1205-
nullptr, "active.lane.mask");
1200+
LaneMask = new VPActiveLaneMaskRecipe(WideCanonicalIV, Plan.getTripCount(),
1201+
nullptr, "active.lane.mask");
12061202
LaneMask->insertAfter(WideCanonicalIV);
12071203
}
12081204

llvm/lib/Transforms/Vectorize/VPlanValue.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@ class VPDef {
360360
VPWidenMemoryInstructionSC,
361361
VPWidenSC,
362362
VPWidenSelectSC,
363+
VPActiveLaneMaskSC,
363364
// START: Phi-like recipes. Need to be kept together.
364365
VPBlendSC,
365366
VPWidenPHISC,

0 commit comments

Comments
 (0)