Skip to content

[LoopVectorize][AArch64][SVE] Generate wide active lane masks #81140

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1244,6 +1244,8 @@ class TargetTransformInfo {
/// and the number of execution units in the CPU.
unsigned getMaxInterleaveFactor(ElementCount VF) const;

/// \return The largest predicate (mask) element count the target wants the
/// loop vectorizer to create when vectorizing with \p VF. Implementations
/// that do not override the hook return \p VF unchanged.
ElementCount getMaxPredicateLength(ElementCount VF) const;

/// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
static OperandValueInfo getOperandInfo(const Value *V);

Expand Down Expand Up @@ -2002,6 +2004,9 @@ class TargetTransformInfo::Concept {
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;

virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;

virtual ElementCount getMaxPredicateLength(ElementCount VF) const = 0;

virtual InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
OperandValueInfo Opd1Info, OperandValueInfo Opd2Info,
Expand Down Expand Up @@ -2627,6 +2632,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getMaxInterleaveFactor(ElementCount VF) override {
return Impl.getMaxInterleaveFactor(VF);
}

/// Forward the query to the wrapped target implementation.
ElementCount getMaxPredicateLength(ElementCount VF) const override {
return Impl.getMaxPredicateLength(VF);
}

unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
unsigned &JTSize,
ProfileSummaryInfo *PSI,
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,8 @@ class TargetTransformInfoImplBase {

unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }

/// Default: report \p VF itself, i.e. a predicate is never wider than one
/// vector of VF lanes.
ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }

InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info,
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -890,6 +890,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {

unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }

/// Default: report \p VF itself, i.e. a predicate is never wider than one
/// vector of VF lanes. Targets may provide their own definition.
ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }

InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -821,6 +821,10 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}

/// Dispatch the maximum-predicate-length query for \p VF to the
/// implementation held in TTIImpl.
ElementCount TargetTransformInfo::getMaxPredicateLength(ElementCount VF) const {
return TTIImpl->getMaxPredicateLength(VF);
}

TargetTransformInfo::OperandValueInfo
TargetTransformInfo::getOperandInfo(const Value *V) {
OperandValueKind OpInfo = OK_AnyValue;
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3383,6 +3383,15 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
return ST->getMaxInterleaveFactor();
}

/// Return the widest predicate the vectorizer may create for \p VF on this
/// subtarget. The result has the same scalability as \p VF.
// NOTE(review): a reviewer asked whether the profitability could instead be
// determined in terms of the cost of active.lane.mask plus the different
// predicate vectors.
ElementCount AArch64TTIImpl::getMaxPredicateLength(ElementCount VF) const {
  // Without SVE, report VF unchanged, as the generic hook does by default,
  // rather than a zero-lane element count.
  if (!ST->hasSVE())
    return VF;

  // Do not create masks bigger than `<vscale x 16 x i1>`, nor masks that are
  // more than twice the VF.
  const unsigned MaxLanes = 16;
  const unsigned N = std::min(MaxLanes, 2 * VF.getKnownMinValue());
  return VF.isScalable() ? ElementCount::getScalable(N)
                         : ElementCount::getFixed(N);
}

// For Falkor, we want to avoid having too many strided loads in a loop since
// that can exhaust the HW prefetcher resources. We adjust the unroller
// MaxCount preference below to attempt to ensure unrolling doesn't create too
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {

unsigned getMaxInterleaveFactor(ElementCount VF);

ElementCount getMaxPredicateLength(ElementCount VF) const;

bool prefersVectorizedAddressing() const;

InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,14 @@ class VPBuilder {
VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
DebugLoc DL = {}, const Twine &Name = "");

/// Create a VPActiveLaneMaskRecipe over \p IV and \p TC. The recipe is
/// inserted at the builder's insertion point when an insertion block is set;
/// otherwise it is returned without being inserted anywhere.
VPValue *createGetActiveLaneMask(VPValue *IV, VPValue *TC, DebugLoc DL,
                                 const Twine &Name = "") {
  auto *MaskRecipe = new VPActiveLaneMaskRecipe(IV, TC, DL, Name);
  if (!BB)
    return MaskRecipe;
  BB->insert(MaskRecipe, InsertPt);
  return MaskRecipe;
}

//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
Expand Down
7 changes: 6 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,10 @@ class InnerLoopVectorizer {
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }

/// Query TTI for the maximum predicate length to use with \p VF.
// NOTE(review): the executePlan hunk in this change calls
// TTI.getMaxPredicateLength directly; confirm this wrapper has a caller.
ElementCount getMaxPredicateLength(ElementCount VF) const {
return TTI->getMaxPredicateLength(VF);
}

protected:
friend class LoopVectorizationPlanner;

Expand Down Expand Up @@ -7509,7 +7513,8 @@ LoopVectorizationPlanner::executePlan(
LLVM_DEBUG(BestVPlan.dump());

// Perform the actual loop transformation.
VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
VPTransformState State(BestVF, BestUF, TTI.getMaxPredicateLength(BestVF), LI,
DT, ILV.Builder, &ILV, &BestVPlan,
OrigLoop->getHeader()->getContext());

// 0. Generate SCEV-dependent code into the preheader, including TripCount,
Expand Down
7 changes: 4 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,12 +215,13 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
return It;
}

VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
VPTransformState::VPTransformState(ElementCount VF, unsigned UF,
ElementCount MaxPred, LoopInfo *LI,
DominatorTree *DT, IRBuilderBase &Builder,
InnerLoopVectorizer *ILV, VPlan *Plan,
LLVMContext &Ctx)
: VF(VF), UF(UF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
LVer(nullptr),
: VF(VF), UF(UF), MaxPred(MaxPred), CFG(DT), LI(LI), Builder(Builder),
ILV(ILV), Plan(Plan), LVer(nullptr),
TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {}

Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
Expand Down
50 changes: 47 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,13 +235,14 @@ struct VPIteration {
/// VPTransformState holds information passed down when "executing" a VPlan,
/// needed for generating the output IR.
struct VPTransformState {
VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
DominatorTree *DT, IRBuilderBase &Builder,
VPTransformState(ElementCount VF, unsigned UF, ElementCount MaxPred,
LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx);

/// The chosen Vectorization and Unroll Factors of the loop being vectorized.
ElementCount VF;
unsigned UF;
ElementCount MaxPred;

/// Hold the indices to generate specific scalar instructions. Null indicates
/// that all instances are to be generated, using either scalar or vector
Expand Down Expand Up @@ -1174,7 +1175,6 @@ class VPInstruction : public VPRecipeWithIRFlags {
Not,
SLPLoad,
SLPStore,
ActiveLaneMask,
ExplicitVectorLength,
CalculateTripCountMinusVF,
// Increment the canonical IV separately for each unrolled part.
Expand Down Expand Up @@ -1329,6 +1329,50 @@ class VPInstruction : public VPRecipeWithIRFlags {
}
};

class VPActiveLaneMaskRecipe : public VPRecipeWithIRFlags {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the moment, it's not clear why it is needed to move this out of VPInstruction, as it still only uses an opcode + operands and no extra data. Depending on what information exactly is used for widen codegen, there may be the need to have a separate class, but in general using VPInstruction when the information can be encoded easily via opcode + VPValue operands only is preferred.

const std::string Name;

public:
VPActiveLaneMaskRecipe(VPValue *IV, VPValue *TC, DebugLoc DL = {},
const Twine &Name = "")
: VPRecipeWithIRFlags(VPDef::VPActiveLaneMaskSC,
std::initializer_list<VPValue *>{IV, TC}, DL),
Name(Name.str()) {}

VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskSC)

VPActiveLaneMaskRecipe *clone() override {
SmallVector<VPValue *, 2> Operands(operands());
assert(Operands.size() == 2 && "by construction");
auto *New = new VPActiveLaneMaskRecipe(Operands[0], Operands[1],
getDebugLoc(), Name);
New->transferFlags(*this);
return New;
}

void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif

bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");

return true;
}

bool onlyFirstPartUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");

return false;
}
};

/// VPWidenRecipe is a recipe for producing a copy of vector type its
/// ingredient. This recipe covers most of the traditional vectorization cases
/// where each ingredient transforms into a vectorized version of itself.
Expand Down
34 changes: 29 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,11 +221,6 @@ m_BranchOnCond(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::BranchOnCond>(Op0);
}

template <typename Op0_t, typename Op1_t>
inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask>
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
}

template <typename Op0_t, typename Op1_t>
inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::BranchOnCount>
Expand Down Expand Up @@ -296,6 +291,35 @@ inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::LogicalAnd>
m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) {
return m_VPInstruction<VPInstruction::LogicalAnd, Op0_t, Op1_t>(Op0, Op1);
}

/// Matcher for a VPActiveLaneMaskRecipe whose two operands satisfy the
/// sub-matchers Op0 and Op1, in that order.
template <typename Op0_t, typename Op1_t>
struct VPActiveLaneMask_match {
  Op0_t Op0;
  Op1_t Op1;

  VPActiveLaneMask_match(Op0_t Op0, Op1_t Op1) : Op0(Op0), Op1(Op1) {}

  /// Match \p V through its defining recipe; a value without a defining
  /// recipe never matches.
  bool match(const VPValue *V) {
    if (auto *Def = V->getDefiningRecipe())
      return match(Def);
    return false;
  }

  /// Match \p R if it is an active-lane-mask recipe with matching operands.
  bool match(const VPRecipeBase *R) {
    auto *LaneMask = dyn_cast<VPActiveLaneMaskRecipe>(R);
    if (!LaneMask)
      return false;
    assert(LaneMask->getNumOperands() == 2 &&
           "recipe with matched opcode does not have 2 operands");
    if (!Op0.match(LaneMask->getOperand(0)))
      return false;
    return Op1.match(LaneMask->getOperand(1));
  }
};

/// Build a matcher for an active-lane-mask recipe whose operands match
/// \p Op0 and \p Op1.
template <typename Op0_t, typename Op1_t>
inline VPActiveLaneMask_match<Op0_t, Op1_t>
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
  return VPActiveLaneMask_match<Op0_t, Op1_t>(Op0, Op1);
}

} // namespace VPlanPatternMatch
} // namespace llvm

Expand Down
107 changes: 86 additions & 21 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,24 +351,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
Value *Op2 = State.get(getOperand(2), Part);
return Builder.CreateSelect(Cond, Op1, Op2, Name);
}
case VPInstruction::ActiveLaneMask: {
// Get first lane of vector induction variable.
Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
// Get the original loop tripcount.
Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

// If this part of the active lane mask is scalar, generate the CMP directly
// to avoid unnecessary extracts.
if (State.VF.isScalar())
return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC,
Name);

auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
auto *PredTy = VectorType::get(Int1Ty, State.VF);
return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
}
case VPInstruction::FirstOrderRecurrenceSplice: {
// Generate code to combine the previous and current values in vector v3.
//
Expand Down Expand Up @@ -636,7 +619,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::PtrAdd:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
case VPInstruction::ActiveLaneMask:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
Expand Down Expand Up @@ -671,9 +653,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::SLPStore:
O << "combined store";
break;
case VPInstruction::ActiveLaneMask:
O << "active lane mask";
break;
case VPInstruction::ExplicitVectorLength:
O << "EXPLICIT-VECTOR-LENGTH";
break;
Expand Down Expand Up @@ -713,8 +692,94 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
DL.print(O);
}
}

/// Print the recipe: the result, the "active lane mask" mnemonic, its flags
/// and operands, followed by the debug location when one is attached.
void VPActiveLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);

  O << " = active lane mask";
  printFlags(O);
  printOperands(O, SlotTracker);

  auto Loc = getDebugLoc();
  if (!Loc)
    return;
  O << ", !dbg ";
  Loc.print(O);
}

#endif

/// Emit the active lane mask for every unrolled part.
///
///  * Scalar VF: one `icmp ult IV, TC` per part.
///  * Vector VF: one llvm.get.active.lane.mask per part, unless State.MaxPred
///    allows a mask that spans several parts. In that case one wide mask is
///    created per group of parts and each part's mask is taken from it with
///    llvm.vector.extract.
///
/// Parts are visited from the last to the first. The trip count is always
/// read from part 0.
// NOTE(review): a reviewer asked that the decision whether to widen the active
// lane mask not be taken at codegen (::execute) but be performed as a VPlan
// transform (or on construction, if simple to determine), to make codegen and
// VPlan-based cost modelling easier and the representation more explicit. If
// it depends on the chosen VF/UF it can be done late in the pipeline (like
// optimizeForVFAndUF).
void VPActiveLaneMaskRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "VPActiveLaneMaskRecipe executing an Instance");

  IRBuilderBase &Builder = State.Builder;
  Builder.SetCurrentDebugLocation(getDebugLoc());

  const int NumParts = State.UF;

  // If the active lane mask is scalar, generate the CMP directly to avoid
  // unnecessary extracts.
  if (State.VF.isScalar()) {
    for (int Part = NumParts - 1; Part >= 0; --Part) {
      // Get first lane of vector induction variable.
      Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
      // Get the original loop tripcount.
      Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
      Value *V = Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0,
                                   ScalarTC, Name);
      State.set(this, V, Part);
    }
    return;
  }

  auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
  auto *PredTy = VectorType::get(Int1Ty, State.VF);
  const unsigned VFLanes = State.VF.getKnownMinValue();

  // Emit llvm.get.active.lane.mask of type MaskTy starting at the first lane
  // of the induction variable of the given part.
  auto CreateLaneMask = [&](Type *MaskTy, int Part) -> Value * {
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {MaskTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  };

  // A wide mask never covers more lanes than all unrolled parts together.
  const unsigned MaxPred =
      std::min(State.MaxPred.getKnownMinValue(), State.UF * VFLanes);
  const bool SpansWholeParts = MaxPred > VFLanes && MaxPred % VFLanes == 0;
  const int PartsPerMask = SpansWholeParts ? MaxPred / VFLanes : 1;

  // Widening requires the parts to split evenly into groups of PartsPerMask.
  // Otherwise some part would have no wide mask to extract from, so fall back
  // to one mask per part.
  if (NumParts <= 1 || PartsPerMask <= 1 || NumParts % PartsPerMask != 0) {
    for (int Part = NumParts - 1; Part >= 0; --Part)
      State.set(this, CreateLaneMask(PredTy, Part), Part);
    return;
  }

  // Generate long active lane masks covering all the unrolled iterations.
  auto *LongPredTy = VectorType::get(Int1Ty, MaxPred, State.VF.isScalable());
  SmallVector<Value *> LongMask(NumParts / PartsPerMask, nullptr);
  for (int Part = NumParts - PartsPerMask; Part >= 0; Part -= PartsPerMask)
    LongMask[Part / PartsPerMask] = CreateLaneMask(LongPredTy, Part);

  // Slice each part's predicate out of the wide mask that covers it.
  auto *Int64Ty = Type::getInt64Ty(Builder.getContext());
  for (int Part = NumParts - 1; Part >= 0; --Part) {
    Value *ALM = LongMask[Part / PartsPerMask];
    const unsigned FirstLane = (Part % PartsPerMask) * VFLanes;
    Value *V = Builder.CreateIntrinsic(
        Intrinsic::vector_extract, {PredTy, ALM->getType()},
        {ALM, ConstantInt::get(Int64Ty, FirstLane)}, nullptr, Name);
    State.set(this, V, Part);
  }
}

void VPWidenCallRecipe::execute(VPTransformState &State) {
assert(State.VF.isVector() && "not widening");
Function *CalledScalarFn = getCalledScalarFunction();
Expand Down
Loading
Loading