Skip to content

[LV] Vectorize histogram operations #99851

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,18 @@ class LoopVectorizationRequirements {
Instruction *ExactFPMathInst = nullptr;
};

/// This holds details about a histogram operation -- a load -> update -> store
/// sequence where each lane in a vector might be updating the same element as
/// another lane.
struct HistogramInfo {
  /// The (gather) load that reads the current bucket values.
  LoadInst *Load;
  /// The add/sub instruction that modifies the loaded bucket value.
  Instruction *Update;
  /// The (scatter) store that writes the updated values back.
  StoreInst *Store;

  HistogramInfo(LoadInst *L, Instruction *U, StoreInst *S)
      : Load(L), Update(U), Store(S) {}
};

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
Expand Down Expand Up @@ -408,6 +420,20 @@ class LoopVectorizationLegality {
unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }

/// Returns a HistogramInfo* for the given instruction if it was determined
/// to be part of a load -> update -> store sequence where multiple lanes
/// may be working on the same memory address.
std::optional<const HistogramInfo *> getHistogramInfo(Instruction *I) const {
  for (const HistogramInfo &HGram : Histograms) {
    // An instruction belongs to a histogram if it is any one of the three
    // operations (load, update, store) recorded for it.
    bool IsPartOfHistogram =
        HGram.Load == I || HGram.Update == I || HGram.Store == I;
    if (IsPartOfHistogram)
      return &HGram;
  }
  return std::nullopt;
}

/// Returns true if any histogram operations were identified in the loop.
bool hasHistograms() const { return !Histograms.empty(); }

PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
return &PSE;
}
Expand Down Expand Up @@ -472,6 +498,11 @@ class LoopVectorizationLegality {
/// Returns true if the loop is vectorizable
bool canVectorizeMemory();

/// If LAA cannot determine whether all dependences are safe, we may be able
/// to further analyse some IndirectUnsafe dependences and if they match a
/// certain pattern (like a histogram) then we may still be able to vectorize.
bool canVectorizeIndirectUnsafeDependences();

/// Return true if we can vectorize this loop using the IF-conversion
/// transformation.
bool canVectorizeWithIfConvert();
Expand Down Expand Up @@ -593,6 +624,11 @@ class LoopVectorizationLegality {
/// conditional assumes.
SmallPtrSet<const Instruction *, 8> MaskedOp;

/// Contains all identified histogram operations, which are sequences of
/// load -> update -> store instructions where multiple lanes in a vector
/// may work on the same memory location.
SmallVector<HistogramInfo, 1> Histograms;

/// BFI and PSI are used to check for profile guided size optimizations.
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;
Expand Down
133 changes: 132 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
"Scalable vectorization is available and favored when the "
"cost is inconclusive.")));

static cl::opt<bool> EnableHistogramVectorization(
"enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
cl::desc("Enables autovectorization of some loops containing histograms"));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

Expand Down Expand Up @@ -1051,6 +1055,133 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return true;
}

/// Find histogram operations that match high-level code in loops:
/// \code
/// buckets[indices[i]]+=step;
/// \endcode
///
/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
/// array the computed histogram. It uses a BinOp to sum all counts, storing
/// them using a loop-variant index Load from the 'indices' input array.
///
/// On a successful match it appends a HistogramInfo entry (the load, the
/// add/sub update, and the store) to \p Histograms and returns true;
/// otherwise it returns false and leaves \p Histograms untouched.
static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
const PredicatedScalarEvolution &PSE,
SmallVectorImpl<HistogramInfo> &Histograms) {

// Store value must come from a Binary Operation.
Instruction *HPtrInstr = nullptr;
BinaryOperator *HBinOp = nullptr;
if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr))))
return false;

// BinOp must be an Add or a Sub modifying the bucket value by a
// loop invariant amount.
// FIXME: We assume the loop invariant term is on the RHS.
// Fine for an immediate/constant, but maybe not a generic value?
Value *HIncVal = nullptr;
if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
!match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))))
return false;

// Make sure the increment value is loop invariant.
if (!TheLoop->isLoopInvariant(HIncVal))
return false;

// The address to store is calculated through a GEP Instruction.
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(HPtrInstr);
if (!GEP)
return false;

// Restrict address calculation to constant indices except for the last term.
Value *HIdx = nullptr;
for (Value *Index : GEP->indices()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are all cases exercised in tests? Looks like the tests are all using %arrayidx2 = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1? Should at least have some with additional constant indices?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had a test (histogram_array_3op_gep) which exercised this with global arrays, requiring one extra constant index argument.

I've added a second (histogram_array_4op_gep_nonzero_const_idx) with a struct as well, with one of the constant terms being nonzero.

Are there other cases you would like to see?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for adding the extra tests.

if (HIdx)
return false;
if (!isa<ConstantInt>(Index))
HIdx = Index;
}

if (!HIdx)
return false;

// Check that the index is calculated by loading from another array. Ignore
// any extensions.
// FIXME: Support indices from other sources than a linear load from memory?
// We're currently trying to match an operation looping over an array
// of indices, but there could be additional levels of indirection
// in place, or possibly some additional calculation to form the index
// from the loaded data.
Value *VPtrVal;
if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Load(m_Value(VPtrVal)))))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have any tests where the index isn't extended? Couldn't find any in sve2-histcnt.ll but might have missed them.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added simple_histogram_64b.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!

return false;

// Make sure the index address varies in this loop, not an outer loop.
const auto *AR = dyn_cast<SCEVAddRecExpr>(PSE.getSE()->getSCEV(VPtrVal));
if (!AR || AR->getLoop() != TheLoop)
return false;

// Ensure we'll have the same mask by checking that all parts of the histogram
// (gather load, update, scatter store) are in the same block.
LoadInst *IndexedLoad = cast<LoadInst>(HBinOp->getOperand(0));
BasicBlock *LdBB = IndexedLoad->getParent();
if (LdBB != HBinOp->getParent() || LdBB != HSt->getParent())
return false;

LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");

// Store the operations that make up the histogram.
Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
return true;
}

bool LoopVectorizationLegality::canVectorizeIndirectUnsafeDependences() {
// For now, we only support an IndirectUnsafe dependency that calculates
// a histogram
if (!EnableHistogramVectorization)
return false;

// Find a single IndirectUnsafe dependency.
const MemoryDepChecker::Dependence *IUDep = nullptr;
const MemoryDepChecker &DepChecker = LAI->getDepChecker();
const auto *Deps = DepChecker.getDependences();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deps may be nullptr here; LAA only records dependences up to MaxDependences (would be good to have a test using something like -max-dependences=2).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, sve2-histcnt-too-many-deps.ll

// If there were too many dependences, LAA abandons recording them. We can't
// proceed safely if we don't know what the dependences are.
if (!Deps)
return false;

for (const MemoryDepChecker::Dependence &Dep : *Deps) {
// Ignore dependencies that are either known to be safe or can be
// checked at runtime.
if (MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) !=
MemoryDepChecker::VectorizationSafetyStatus::Unsafe)
continue;

// We're only interested in IndirectUnsafe dependencies here, where the
// address might come from a load from memory. We also only want to handle
// one such dependency, at least for now.
if (Dep.Type != MemoryDepChecker::Dependence::IndirectUnsafe || IUDep)
return false;

IUDep = &Dep;
}
if (!IUDep)
return false;

// For now only normal loads and stores are supported.
LoadInst *LI = dyn_cast<LoadInst>(IUDep->getSource(DepChecker));
StoreInst *SI = dyn_cast<StoreInst>(IUDep->getDestination(DepChecker));

if (!LI || !SI)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just checking if we have a test where the source is a store and dest is a load, possibly for loops stepping backwards?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't.

I tried creating one but was never able to get LAA to recognize the store as a source -- it always gave up before that point, possibly due to the indirection. Maybe I'm missing something.

return false;

LLVM_DEBUG(dbgs() << "LV: Checking for a histogram on: " << *SI << "\n");
return findHistogram(LI, SI, TheLoop, LAI->getPSE(), Histograms);
}

bool LoopVectorizationLegality::canVectorizeMemory() {
LAI = &LAIs.getInfo(*TheLoop);
const OptimizationRemarkAnalysis *LAR = LAI->getReport();
Expand All @@ -1062,7 +1193,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
}

if (!LAI->canVectorizeMemory())
return false;
return canVectorizeIndirectUnsafeDependences();

if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
reportVectorizationFailure("We don't allow storing to uniform addresses",
Expand Down
68 changes: 67 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6508,8 +6508,33 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// We've proven all lanes safe to speculate, fall through.
[[fallthrough]];
case Instruction::Add:
case Instruction::Sub: {
auto Info = Legal->getHistogramInfo(I);
if (Info && VF.isVector()) {
const HistogramInfo *HGram = Info.value();
// Assume that a non-constant update value (or a constant != 1) requires
// a multiply, and add that into the cost.
InstructionCost MulCost = TTI::TCC_Free;
ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
if (!RHS || RHS->getZExtValue() != 1)
MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);

// Find the cost of the histogram operation itself.
Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
Type *ScalarTy = I->getType();
Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
Type::getVoidTy(I->getContext()),
{PtrTy, ScalarTy, MaskTy});

// Add the costs together with the add/sub operation.
return TTI.getIntrinsicInstrCost(
ICA, TargetTransformInfo::TCK_RecipThroughput) +
MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
}
[[fallthrough]];
}
Comment on lines +6511 to +6536
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this still needed in the latest version with `::computeCost`?

Suggested change
case Instruction::Sub: {
auto Info = Legal->getHistogramInfo(I);
if (Info && VF.isVector()) {
const HistogramInfo *HGram = Info.value();
// Assume that a non-constant update value (or a constant != 1) requires
// a multiply, and add that into the cost.
InstructionCost MulCost = TTI::TCC_Free;
ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
if (!RHS || RHS->getZExtValue() != 1)
MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
// Find the cost of the histogram operation itself.
Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
Type *ScalarTy = I->getType();
Type *MaskTy = VectorType::get(Type::getInt1Ty(I->getContext()), VF);
IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
Type::getVoidTy(I->getContext()),
{PtrTy, ScalarTy, MaskTy});
// Add the costs together with the add/sub operation.
return TTI.getIntrinsicInstrCost(
ICA, TargetTransformInfo::TCK_RecipThroughput) +
MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
}
[[fallthrough]];
}

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's still needed for now -- the two cost models disagree and assert if I remove that code.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for checking, I thought the assert wouldn't trigger due to planContainsAdditionalSimplifications.

case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
Expand Down Expand Up @@ -8426,6 +8451,30 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
};
}

/// Build a VPHistogramRecipe for the histogram described by \p HI, using the
/// already-created VPValue operands of the store in \p Operands.
VPHistogramRecipe *
VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
                                     ArrayRef<VPValue *> Operands) {
  // FIXME: Support other operations.
  unsigned Opcode = HI->Update->getOpcode();
  assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
         "Histogram update operation must be an Add or Sub");

  // The recipe takes the bucket address followed by the loop-invariant
  // increment value, in that order.
  SmallVector<VPValue *, 3> RecipeOps = {
      /*BucketAddr=*/Operands[1],
      /*IncAmt=*/getVPValueOrAddLiveIn(HI->Update->getOperand(1))};

  // In case of predicated execution (due to tail-folding, or conditional
  // execution, or both), pass the relevant mask.
  if (Legal->isMaskRequired(HI->Store))
    RecipeOps.push_back(getBlockInMask(HI->Store->getParent()));

  return new VPHistogramRecipe(Opcode,
                               make_range(RecipeOps.begin(), RecipeOps.end()),
                               HI->Store->getDebugLoc());
}

void VPRecipeBuilder::fixHeaderPhis() {
BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
for (VPHeaderPHIRecipe *R : PhisToFix) {
Expand Down Expand Up @@ -8549,6 +8598,10 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if (auto *CI = dyn_cast<CallInst>(Instr))
return tryToWidenCall(CI, Operands, Range);

if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
if (auto HistInfo = Legal->getHistogramInfo(SI))
return tryToWidenHistogram(*HistInfo, Operands);

if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);

Expand Down Expand Up @@ -9998,6 +10051,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
InterleaveLoop = false;
}

// If there is a histogram in the loop, do not just interleave without
// vectorizing. The order of operations will be incorrect without the
// histogram intrinsics, which are only used for recipes with VF > 1.
if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
<< "to histogram operations.\n");
IntDiagMsg = std::make_pair(
"HistogramPreventsScalarInterleaving",
"Unable to interleave without vectorization due to constraints on "
"the order of histogram operations");
InterleaveLoop = false;
}

// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;

Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ namespace llvm {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class TargetLibraryInfo;
struct HistogramInfo;

/// Helper class to create VPRecipies from IR instructions.
class VPRecipeBuilder {
Expand Down Expand Up @@ -103,6 +104,13 @@ class VPRecipeBuilder {
VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands,
VPBasicBlock *VPBB);

/// Makes Histogram count operations safe for vectorization, by emitting a
/// llvm.experimental.vector.histogram.add intrinsic in place of the
/// Load + Add|Sub + Store operations that perform the histogram in the
/// original scalar loop.
VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
ArrayRef<VPValue *> Operands);

public:
VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI,
LoopVectorizationLegality *Legal,
Expand Down
46 changes: 46 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
case VPRecipeBase::VPWidenStoreSC:
case VPRecipeBase::VPHistogramSC:
// TODO: Widened stores don't define a value, but widened loads do. Split
// the recipes to be able to make widened loads VPSingleDefRecipes.
return false;
Expand Down Expand Up @@ -1664,6 +1665,51 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
#endif
};

/// A recipe representing a sequence of load -> update -> store as part of
/// a histogram operation. This means there may be aliasing between vector
/// lanes, which is handled by the llvm.experimental.vector.histogram family
/// of intrinsics. The only update operations currently supported are
/// 'add' and 'sub' where the other term is loop-invariant.
class VPHistogramRecipe : public VPRecipeBase {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please document the new recipe

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

/// Opcode of the update operation, currently either add or sub.
unsigned Opcode;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: document that this is the opcode of the increment?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done


public:
template <typename IterT>
VPHistogramRecipe(unsigned Opcode, iterator_range<IterT> Operands,
DebugLoc DL = {})
: VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Opcode(Opcode) {}

~VPHistogramRecipe() override = default;

VPHistogramRecipe *clone() override {
return new VPHistogramRecipe(Opcode, operands(), getDebugLoc());
}

VP_CLASSOF_IMPL(VPDef::VPHistogramSC);

/// Produce a vectorized histogram operation.
void execute(VPTransformState &State) override;

/// Return the cost of this VPHistogramRecipe.
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override;

unsigned getOpcode() const { return Opcode; }

/// Return the mask operand if one was provided, or a null pointer if all
/// lanes should be executed unconditionally.
VPValue *getMask() const {
return getNumOperands() == 3 ? getOperand(2) : nullptr;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
};

/// A recipe for widening select instructions.
struct VPWidenSelectRecipe : public VPSingleDefRecipe {
template <typename IterT>
Expand Down
Loading