-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[LoopVectorize][AArch64][SVE] Generate wide active lane masks #81140
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -235,13 +235,14 @@ struct VPIteration { | |
/// VPTransformState holds information passed down when "executing" a VPlan, | ||
/// needed for generating the output IR. | ||
struct VPTransformState { | ||
VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, | ||
DominatorTree *DT, IRBuilderBase &Builder, | ||
VPTransformState(ElementCount VF, unsigned UF, ElementCount MaxPred, | ||
LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, | ||
InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx); | ||
|
||
/// The chosen Vectorization and Unroll Factors of the loop being vectorized. | ||
ElementCount VF; | ||
unsigned UF; | ||
ElementCount MaxPred; | ||
|
||
/// Hold the indices to generate specific scalar instructions. Null indicates | ||
/// that all instances are to be generated, using either scalar or vector | ||
|
@@ -1174,7 +1175,6 @@ class VPInstruction : public VPRecipeWithIRFlags { | |
Not, | ||
SLPLoad, | ||
SLPStore, | ||
ActiveLaneMask, | ||
ExplicitVectorLength, | ||
CalculateTripCountMinusVF, | ||
// Increment the canonical IV separately for each unrolled part. | ||
|
@@ -1329,6 +1329,50 @@ class VPInstruction : public VPRecipeWithIRFlags { | |
} | ||
}; | ||
|
||
class VPActiveLaneMaskRecipe : public VPRecipeWithIRFlags { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. At the moment, it's not clear why it is needed to move this out of VPInstruction, as it still only uses an opcode + operands and no extra data. Depending on what information exactly is used for widen codegen, there may be the need to have a separate class, but in general using VPInstruction when the information can be encoded easily via opcode + VPValue operands only is preferred. |
||
const std::string Name; | ||
|
||
public: | ||
VPActiveLaneMaskRecipe(VPValue *IV, VPValue *TC, DebugLoc DL = {}, | ||
const Twine &Name = "") | ||
: VPRecipeWithIRFlags(VPDef::VPActiveLaneMaskSC, | ||
std::initializer_list<VPValue *>{IV, TC}, DL), | ||
Name(Name.str()) {} | ||
|
||
VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskSC) | ||
|
||
VPActiveLaneMaskRecipe *clone() override { | ||
SmallVector<VPValue *, 2> Operands(operands()); | ||
assert(Operands.size() == 2 && "by construction"); | ||
auto *New = new VPActiveLaneMaskRecipe(Operands[0], Operands[1], | ||
getDebugLoc(), Name); | ||
New->transferFlags(*this); | ||
return New; | ||
} | ||
|
||
void execute(VPTransformState &State) override; | ||
|
||
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | ||
/// Print the recipe. | ||
void print(raw_ostream &O, const Twine &Indent, | ||
VPSlotTracker &SlotTracker) const override; | ||
#endif | ||
|
||
bool onlyFirstLaneUsed(const VPValue *Op) const override { | ||
assert(is_contained(operands(), Op) && | ||
"Op must be an operand of the recipe"); | ||
|
||
return true; | ||
} | ||
|
||
bool onlyFirstPartUsed(const VPValue *Op) const override { | ||
assert(is_contained(operands(), Op) && | ||
"Op must be an operand of the recipe"); | ||
|
||
return false; | ||
} | ||
}; | ||
|
||
/// VPWidenRecipe is a recipe for producing a copy of vector type its | ||
/// ingredient. This recipe covers most of the traditional vectorization cases | ||
/// where each ingredient transforms into a vectorized version of itself. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -351,24 +351,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { | |
Value *Op2 = State.get(getOperand(2), Part); | ||
return Builder.CreateSelect(Cond, Op1, Op2, Name); | ||
} | ||
case VPInstruction::ActiveLaneMask: { | ||
// Get first lane of vector induction variable. | ||
Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); | ||
// Get the original loop tripcount. | ||
Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0)); | ||
|
||
// If this part of the active lane mask is scalar, generate the CMP directly | ||
// to avoid unnecessary extracts. | ||
if (State.VF.isScalar()) | ||
return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, ScalarTC, | ||
Name); | ||
|
||
auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); | ||
auto *PredTy = VectorType::get(Int1Ty, State.VF); | ||
return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, | ||
{PredTy, ScalarTC->getType()}, | ||
{VIVElem0, ScalarTC}, nullptr, Name); | ||
} | ||
case VPInstruction::FirstOrderRecurrenceSplice: { | ||
// Generate code to combine the previous and current values in vector v3. | ||
// | ||
|
@@ -636,7 +619,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { | |
case VPInstruction::PtrAdd: | ||
// TODO: Cover additional opcodes. | ||
return vputils::onlyFirstLaneUsed(this); | ||
case VPInstruction::ActiveLaneMask: | ||
case VPInstruction::ExplicitVectorLength: | ||
case VPInstruction::CalculateTripCountMinusVF: | ||
case VPInstruction::CanonicalIVIncrementForPart: | ||
|
@@ -671,9 +653,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, | |
case VPInstruction::SLPStore: | ||
O << "combined store"; | ||
break; | ||
case VPInstruction::ActiveLaneMask: | ||
O << "active lane mask"; | ||
break; | ||
case VPInstruction::ExplicitVectorLength: | ||
O << "EXPLICIT-VECTOR-LENGTH"; | ||
break; | ||
|
@@ -713,8 +692,94 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, | |
DL.print(O); | ||
} | ||
} | ||
|
||
void VPActiveLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent, | ||
VPSlotTracker &SlotTracker) const { | ||
O << Indent << "EMIT "; | ||
|
||
printAsOperand(O, SlotTracker); | ||
O << " = active lane mask"; | ||
printFlags(O); | ||
printOperands(O, SlotTracker); | ||
|
||
if (auto DL = getDebugLoc()) { | ||
O << ", !dbg "; | ||
DL.print(O); | ||
} | ||
} | ||
|
||
#endif | ||
|
||
void VPActiveLaneMaskRecipe::execute(VPTransformState &State) { | ||
assert(!State.Instance && "VPInstruction executing an Instance"); | ||
|
||
IRBuilderBase &Builder = State.Builder; | ||
Builder.SetCurrentDebugLocation(getDebugLoc()); | ||
|
||
// If the active lane mask is scalar, generate the CMP directly | ||
// to avoid unnecessary extracts. | ||
if (State.VF.isScalar()) { | ||
for (int Part = State.UF - 1; Part >= 0; --Part) { | ||
// Get first lane of vector induction variable. | ||
Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); | ||
// Get the original loop tripcount. | ||
Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0)); | ||
|
||
Value *V = Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0, | ||
ScalarTC, Name); | ||
State.set(this, V, Part); | ||
} | ||
return; | ||
} | ||
|
||
auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); | ||
auto *PredTy = VectorType::get(Int1Ty, State.VF); | ||
|
||
unsigned MaxPred = std::min(State.MaxPred.getKnownMinValue(), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Conceptually the decision whether to widen the active lane mask or not shouldn't be taken at codegen (::execute), but instead performed as a transform (or possibly on construction, if it is simple to determine). This makes both codegen and cost-modeling based on the VPlan easier, as well as makes things more explicit in the representation itself. If this depends on the concrete chosen VF/UF, it can be transformed late in the pipeline (like |
||
State.UF * State.VF.getKnownMinValue()); | ||
if (State.UF <= 1 || MaxPred <= State.VF.getKnownMinValue() || | ||
MaxPred % State.VF.getKnownMinValue() != 0) { | ||
for (int Part = State.UF - 1; Part >= 0; --Part) { | ||
// Get first lane of vector induction variable. | ||
Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); | ||
// Get the original loop tripcount. | ||
Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0)); | ||
Value *V = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, | ||
{PredTy, ScalarTC->getType()}, | ||
{VIVElem0, ScalarTC}, nullptr, Name); | ||
State.set(this, V, Part); | ||
} | ||
return; | ||
} | ||
|
||
// Generate long active lane masks covering all the unrolled iterations. | ||
unsigned PartsPerMask = MaxPred / State.VF.getKnownMinValue(); | ||
auto *LongPredTy = VectorType::get(Int1Ty, MaxPred, State.VF.isScalable()); | ||
SmallVector<Value *> LongMask(State.UF / PartsPerMask, nullptr); | ||
for (int Part = State.UF - PartsPerMask; Part >= 0; Part -= PartsPerMask) { | ||
// Get first lane of vector induction variable. | ||
Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); | ||
// Get the original loop tripcount. | ||
Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0)); | ||
Value *V = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, | ||
{LongPredTy, ScalarTC->getType()}, | ||
{VIVElem0, ScalarTC}, nullptr, Name); | ||
LongMask[Part / PartsPerMask] = V; | ||
} | ||
|
||
for (int Part = State.UF - 1; Part >= 0; --Part) { | ||
Value *ALM = LongMask[Part / PartsPerMask]; | ||
const unsigned I = Part % PartsPerMask; | ||
Value *V = Builder.CreateIntrinsic( | ||
Intrinsic::vector_extract, {PredTy, ALM->getType()}, | ||
{ALM, ConstantInt::get(Type::getInt64Ty(Builder.getContext()), | ||
I * State.VF.getKnownMinValue())}, | ||
nullptr, Name); | ||
|
||
State.set(this, V, Part); | ||
} | ||
} | ||
|
||
void VPWidenCallRecipe::execute(VPTransformState &State) { | ||
assert(State.VF.isVector() && "not widening"); | ||
Function *CalledScalarFn = getCalledScalarFunction(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Can the profitability be determined in terms of the cost of active.lane.mask + different predicate vectors?