Skip to content

Commit 22f9ebd

Browse files
committed
WIP histogram autovec
Mostly functioning all-in-one intrinsic autovec
1 parent 8ad2f17 commit 22f9ebd

File tree

16 files changed

+386
-18
lines changed

16 files changed

+386
-18
lines changed

llvm/include/llvm/Analysis/LoopAccessAnalysis.h

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,8 @@ class MemoryDepChecker {
200200
bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
201201
const DenseMap<Value *, const SCEV *> &Strides,
202202
const DenseMap<Value *, SmallVector<const Value *, 16>>
203-
&UnderlyingObjects);
203+
&UnderlyingObjects,
204+
const SmallPtrSetImpl<const Value *> &HistogramPtrs);
204205

205206
/// No memory dependence was encountered that would inhibit
206207
/// vectorization.
@@ -338,7 +339,8 @@ class MemoryDepChecker {
338339
isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
339340
unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
340341
const DenseMap<Value *, SmallVector<const Value *, 16>>
341-
&UnderlyingObjects);
342+
&UnderlyingObjects,
343+
const SmallPtrSetImpl<const Value *> &HistogramPtrs);
342344

343345
/// Check whether the data dependence could prevent store-load
344346
/// forwarding.
@@ -402,6 +404,15 @@ struct PointerDiffInfo {
402404
NeedsFreeze(NeedsFreeze) {}
403405
};
404406

407+
/// Groups the three instructions that together form one histogram-style
/// update (load a bucket, modify it, store it back) found in a loop.
struct HistogramInfo {
408+
// The load that reads the current bucket value.
Instruction *Load;
409+
// The operation that computes the new bucket value from the loaded one.
Instruction *Update;
410+
// The store that writes the updated value back.
Instruction *Store;
411+
412+
// Plain member-wise constructor; performs no validation of its arguments.
HistogramInfo(Instruction *Load, Instruction *Update, Instruction *Store)
413+
: Load(Load), Update(Update), Store(Store) {}
414+
};
415+
405416
/// Holds information about the memory runtime legality checks to verify
406417
/// that a group of pointers do not overlap.
407418
class RuntimePointerChecking {
@@ -621,6 +632,10 @@ class LoopAccessInfo {
621632
unsigned getNumStores() const { return NumStores; }
622633
unsigned getNumLoads() const { return NumLoads;}
623634

635+
/// Returns a read-only view of the histogram operations recorded in the
/// Histograms member.
const SmallVectorImpl<HistogramInfo> &getHistograms() const {
636+
return Histograms;
637+
}
638+
624639
/// The diagnostics report generated for the analysis. E.g. why we
625640
/// couldn't analyze the loop.
626641
const OptimizationRemarkAnalysis *getReport() const { return Report.get(); }
@@ -733,6 +748,13 @@ class LoopAccessInfo {
733748
/// If an access has a symbolic strides, this maps the pointer value to
734749
/// the stride symbol.
735750
DenseMap<Value *, const SCEV *> SymbolicStrides;
751+
752+
/// Holds the load, update, and store instructions for all histogram-style
753+
/// operations found in the loop.
754+
SmallVector<HistogramInfo, 2> Histograms;
755+
756+
/// The bucket pointers through which the recorded histograms are updated.
757+
SmallPtrSet<const Value *, 2> HistogramPtrs;
736758
};
737759

738760
/// Return the SCEV corresponding to a pointer with the symbolic stride

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -985,6 +985,9 @@ class TargetTransformInfo {
985985
/// Return hardware support for population count.
986986
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
987987

988+
/// Returns the cost of generating a vector histogram.
989+
InstructionCost getHistogramCost(Type *Ty) const;
990+
988991
/// Return true if the hardware has a fast square-root instruction.
989992
bool haveFastSqrt(Type *Ty) const;
990993

@@ -1934,6 +1937,7 @@ class TargetTransformInfo::Concept {
19341937
unsigned *Fast) = 0;
19351938
virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
19361939
virtual bool haveFastSqrt(Type *Ty) = 0;
1940+
virtual InstructionCost getHistogramCost(Type *Ty) = 0;
19371941
virtual bool isExpensiveToSpeculativelyExecute(const Instruction *I) = 0;
19381942
virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
19391943
virtual InstructionCost getFPOpCost(Type *Ty) = 0;
@@ -2497,6 +2501,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
24972501
}
24982502
bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }
24992503

2504+
// Forwards the histogram cost query for \p Ty to the wrapped implementation.
InstructionCost getHistogramCost(Type *Ty) override {
2505+
return Impl.getHistogramCost(Ty);
2506+
}
2507+
25002508
bool isExpensiveToSpeculativelyExecute(const Instruction* I) override {
25012509
return Impl.isExpensiveToSpeculativelyExecute(I);
25022510
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,10 @@ class TargetTransformInfoImplBase {
418418

419419
bool haveFastSqrt(Type *Ty) const { return false; }
420420

421+
// Default histogram cost: always an invalid InstructionCost, independent of
// \p Ty.
InstructionCost getHistogramCost(Type *Ty) const {
422+
return InstructionCost::getInvalid();
423+
}
424+
421425
bool isExpensiveToSpeculativelyExecute(const Instruction *I) { return true; }
422426

423427
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return true; }

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
540540
TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
541541
}
542542

543+
// Histogram cost at this level: always an invalid InstructionCost,
// independent of \p Ty.
InstructionCost getHistogramCost(Type *Ty) {
544+
return InstructionCost::getInvalid();
545+
}
546+
543547
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
544548
return true;
545549
}

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,23 @@ class LoopVectorizationLegality {
387387
unsigned getNumStores() const { return LAI->getNumStores(); }
388388
unsigned getNumLoads() const { return LAI->getNumLoads(); }
389389

390+
/// Returns true if \p I is the Load or the Update instruction of any
/// histogram reported by LAI->getHistograms(). The Store member is not
/// considered here.
bool isHistogramLoadOrUpdate(Instruction *I) const {
391+
// Linear scan over the recorded histograms.
for (const HistogramInfo &HGram : LAI->getHistograms())
392+
if (HGram.Load == I || HGram.Update == I)
393+
return true;
394+
395+
return false;
396+
}
397+
398+
/// Looks up the histogram whose Store member is \p SI. Returns a pointer to
/// the matching entry of LAI->getHistograms(), or std::nullopt if no
/// recorded histogram uses \p SI as its store.
std::optional<const HistogramInfo *>
399+
getHistogramForStore(StoreInst *SI) const {
400+
// Linear scan; the first matching entry is returned.
for (const HistogramInfo &HGram : LAI->getHistograms())
401+
if (HGram.Store == SI)
402+
return &HGram;
403+
404+
return std::nullopt;
405+
}
406+
390407
PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
391408
return &PSE;
392409
}

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 134 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "llvm/ADT/SmallPtrSet.h"
2222
#include "llvm/ADT/SmallSet.h"
2323
#include "llvm/ADT/SmallVector.h"
24+
#include "llvm/ADT/Statistic.h"
2425
#include "llvm/Analysis/AliasAnalysis.h"
2526
#include "llvm/Analysis/AliasSetTracker.h"
2627
#include "llvm/Analysis/LoopAnalysisManager.h"
@@ -70,6 +71,8 @@ using namespace llvm::PatternMatch;
7071

7172
#define DEBUG_TYPE "loop-accesses"
7273

74+
STATISTIC(HistogramsDetected, "Number of Histograms detected");
75+
7376
static cl::opt<unsigned, true>
7477
VectorizationFactor("force-vector-width", cl::Hidden,
7578
cl::desc("Sets the SIMD width. Zero is autoselect."),
@@ -731,6 +734,23 @@ class AccessAnalysis {
731734
return UnderlyingObjects;
732735
}
733736

737+
/// Find Histogram counts that match high-level code in loops:
738+
/// \code
739+
/// buckets[indices[i]]+=step;
740+
/// \endcode
741+
///
742+
/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
743+
/// array the computed histogram. It uses a BinOp to sum all counts, storing
744+
/// them using a loop-variant index Load from the 'indices' input array.
745+
///
746+
/// On successful matches it updates the STATISTIC 'HistogramsDetected'; no
747+
/// hardware-support check is made here. It additionally appends the
748+
/// load/update/store instructions to \p Histograms, as well as the pointer
749+
/// used to update the histogram to \p HistogramPtrs.
750+
void findHistograms(StoreInst *HSt,
751+
SmallVectorImpl<HistogramInfo> &Histograms,
752+
SmallPtrSetImpl<const Value *> &HistogramPtrs);
753+
734754
private:
735755
typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
736756

@@ -1948,7 +1968,8 @@ getDependenceDistanceStrideAndSize(
19481968
const AccessAnalysis::MemAccessInfo &B, Instruction *BInst,
19491969
const DenseMap<Value *, const SCEV *> &Strides,
19501970
const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
1951-
PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) {
1971+
PredicatedScalarEvolution &PSE, const Loop *InnermostLoop,
1972+
const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
19521973
auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
19531974
auto &SE = *PSE.getSE();
19541975
auto [APtr, AIsWrite] = A;
@@ -1966,6 +1987,15 @@ getDependenceDistanceStrideAndSize(
19661987
BPtr->getType()->getPointerAddressSpace())
19671988
return MemoryDepChecker::Dependence::Unknown;
19681989

1990+
// Ignore Histogram count updates as they are handled by the Intrinsic. This
1991+
// happens when the same pointer is first used to read from and then is used
1992+
// to write to.
1993+
if (!AIsWrite && BIsWrite && APtr == BPtr && HistogramPtrs.contains(APtr)) {
1994+
LLVM_DEBUG(dbgs() << "LAA: Histogram: Update is safely ignored. Pointer: "
1995+
<< *APtr);
1996+
return MemoryDepChecker::Dependence::NoDep;
1997+
}
1998+
19691999
int64_t StrideAPtr =
19702000
getPtrStride(PSE, ATy, APtr, InnermostLoop, Strides, true).value_or(0);
19712001
int64_t StrideBPtr =
@@ -2022,15 +2052,15 @@ getDependenceDistanceStrideAndSize(
20222052
MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
20232053
const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B,
20242054
unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides,
2025-
const DenseMap<Value *, SmallVector<const Value *, 16>>
2026-
&UnderlyingObjects) {
2055+
const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
2056+
const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
20272057
assert(AIdx < BIdx && "Must pass arguments in program order");
20282058

20292059
// Get the dependence distance, stride, type size and what access writes for
20302060
// the dependence between A and B.
20312061
auto Res = getDependenceDistanceStrideAndSize(
20322062
A, InstMap[AIdx], B, InstMap[BIdx], Strides, UnderlyingObjects, PSE,
2033-
InnermostLoop);
2063+
InnermostLoop, HistogramPtrs);
20342064
if (std::holds_alternative<Dependence::DepType>(Res))
20352065
return std::get<Dependence::DepType>(Res);
20362066

@@ -2266,8 +2296,8 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
22662296
bool MemoryDepChecker::areDepsSafe(
22672297
DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
22682298
const DenseMap<Value *, const SCEV *> &Strides,
2269-
const DenseMap<Value *, SmallVector<const Value *, 16>>
2270-
&UnderlyingObjects) {
2299+
const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects,
2300+
const SmallPtrSetImpl<const Value *> &HistogramPtrs) {
22712301

22722302
MinDepDistBytes = -1;
22732303
SmallPtrSet<MemAccessInfo, 8> Visited;
@@ -2312,7 +2342,7 @@ bool MemoryDepChecker::areDepsSafe(
23122342

23132343
Dependence::DepType Type =
23142344
isDependent(*A.first, A.second, *B.first, B.second, Strides,
2315-
UnderlyingObjects);
2345+
UnderlyingObjects, HistogramPtrs);
23162346
mergeInStatus(Dependence::isSafeForVectorization(Type));
23172347

23182348
// Gather dependences unless we accumulated MaxDependences
@@ -2648,6 +2678,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
26482678
// check.
26492679
Accesses.buildDependenceSets();
26502680

2681+
for (StoreInst *ST : Stores)
2682+
Accesses.findHistograms(ST, Histograms, HistogramPtrs);
2683+
26512684
// Find pointers with computable bounds. We are going to use this information
26522685
// to place a runtime bound check.
26532686
Value *UncomputablePtr = nullptr;
@@ -2672,7 +2705,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
26722705
LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
26732706
CanVecMem = DepChecker->areDepsSafe(
26742707
DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
2675-
Accesses.getUnderlyingObjects());
2708+
Accesses.getUnderlyingObjects(), HistogramPtrs);
26762709

26772710
if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
26782711
LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
@@ -3127,6 +3160,99 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
31273160
return *I.first->second;
31283161
}
31293162

3163+
// Tries to recognise \p HSt as the store of a histogram-style update: a value
// is loaded through a pointer, changed by an add or sub, and stored back
// through the same pointer. On a match the three instructions are appended to
// \p Histograms, the pointer is inserted into \p HistogramPtrs, and the
// HistogramsDetected statistic is incremented. On any mismatch the function
// returns without modifying either container.
void AccessAnalysis::findHistograms(
3164+
StoreInst *HSt, SmallVectorImpl<HistogramInfo> &Histograms,
3165+
SmallPtrSetImpl<const Value *> &HistogramPtrs) {
3166+
LLVM_DEBUG(dbgs() << "LAA: Attempting to match histogram from " << *HSt
3167+
<< "\n");
3168+
// Store value must come from a Binary Operation.
3169+
Instruction *HPtrInstr = nullptr;
3170+
BinaryOperator *HBinOp = nullptr;
3171+
if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr)))) {
3172+
LLVM_DEBUG(dbgs() << "\tNo BinOp\n");
3173+
return;
3174+
}
3175+
3176+
// BinOp must be an Add or a Sub modifying the bucket value by a
3177+
// loop invariant amount.
3178+
// FIXME: We assume the loop invariant term is on the RHS.
3179+
// Fine for an immediate/constant, but maybe not a generic value?
3180+
// NOTE(review): HIncVal is only bound by the match below; nothing in this
// function checks that it is actually loop invariant.
Value *HIncVal = nullptr;
3181+
if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
3182+
!match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal)))) {
3183+
LLVM_DEBUG(dbgs() << "\tNo matching load\n");
3184+
return;
3185+
}
3186+
// Operand 0 was matched as a load above, so this is the bucket load.
Instruction *IndexedLoad = cast<Instruction>(HBinOp->getOperand(0));
3187+
3188+
// The address to store is calculated through a GEP Instruction.
3189+
// FIXME: Support GEPs with more operands.
3190+
GetElementPtrInst *HPtr = dyn_cast<GetElementPtrInst>(HPtrInstr);
3191+
if (!HPtr || HPtr->getNumOperands() > 2) {
3192+
// NOTE(review): this message is also printed when the address is not a GEP
// at all (HPtr is null).
LLVM_DEBUG(dbgs() << "\tToo many GEP operands\n");
3193+
return;
3194+
}
3195+
3196+
// Check that the index is calculated by loading from another array. Ignore
3197+
// any extensions.
3198+
// FIXME: Support indices from other sources than a linear load from memory?
3199+
Value *HIdx = HPtr->getOperand(1);
3200+
Instruction *IdxInst = nullptr;
3201+
// FIXME: Can this fail? Maybe if IdxInst isn't an instruction. Just need to
3202+
// look through extensions, find another way?
3203+
if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Instruction(IdxInst))))
3204+
return;
3205+
3206+
// Currently restricting this to linear addressing when loading indices.
3207+
LoadInst *VLoad = dyn_cast<LoadInst>(IdxInst);
3208+
// VPtrVal is only read after the match below has succeeded.
Value *VPtrVal;
3209+
if (!VLoad || !match(VLoad, m_Load(m_Value(VPtrVal)))) {
3210+
LLVM_DEBUG(dbgs() << "\tBad Index Load\n");
3211+
return;
3212+
}
3213+
3214+
// The address of the index load must be an add-recurrence SCEV.
if (!isa<SCEVAddRecExpr>(PSE.getSCEV(VPtrVal))) {
3215+
LLVM_DEBUG(dbgs() << "\tCannot determine index load stride\n");
3216+
return;
3217+
}
3218+
3219+
// FIXME: support smaller types of input arrays. Integers can be promoted
3220+
// for codegen.
3221+
// Only 32-bit and 64-bit integer index loads are accepted.
Type *VLoadTy = VLoad->getType();
3222+
if (!VLoadTy->isIntegerTy() || (VLoadTy->getScalarSizeInBits() != 32 &&
3223+
VLoadTy->getScalarSizeInBits() != 64)) {
3224+
// NOTE(review): VLoadTy is the type of the loaded index, although the
// message calls it the bucket type.
LLVM_DEBUG(dbgs() << "\tUnsupported bucket type: " << *VLoadTy << "\n");
3225+
return;
3226+
}
3227+
3228+
// Ensure we'll have the same mask by checking that all parts of the histogram
3229+
// are in the same block.
3230+
// FIXME: Could use dominance checks instead?
3231+
if (IndexedLoad->getParent() != HBinOp->getParent() ||
3232+
IndexedLoad->getParent() != HSt->getParent()) {
3233+
LLVM_DEBUG(dbgs() << "\tDifferent parent blocks\n");
3234+
return;
3235+
}
3236+
3237+
// A histogram pointer may only alias to itself, and must only have two uses,
3238+
// the load and the store.
3239+
// NOTE(review): the use-count test is nested inside the alias-set loop, so it
// is only evaluated when AST yields a set that passes the must/may check.
for (AliasSet &AS : AST)
3240+
if (AS.isMustAlias() || AS.isMayAlias())
3241+
if ((is_contained(AS.getPointers(), HPtr) && AS.size() > 1) ||
3242+
HPtr->getNumUses() != 2) {
3243+
LLVM_DEBUG(dbgs() << "\tAliasing problem\n");
3244+
return;
3245+
}
3246+
3247+
LLVM_DEBUG(dbgs() << "LAA: Found Histogram Operation: " << *HBinOp << "\n");
3248+
HistogramsDetected++;
3249+
3250+
// Store the operations that make up the histogram.
3251+
Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
3252+
// Store pointers used to write those counts in the computed histogram.
3253+
HistogramPtrs.insert(HPtr);
3254+
}
3255+
31303256
bool LoopAccessInfoManager::invalidate(
31313257
Function &F, const PreservedAnalyses &PA,
31323258
FunctionAnalysisManager::Invalidator &Inv) {

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,10 @@ bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
658658
return TTIImpl->haveFastSqrt(Ty);
659659
}
660660

661+
// Forwards the histogram cost query for \p Ty to the TTIImpl implementation.
InstructionCost TargetTransformInfo::getHistogramCost(Type *Ty) const {
662+
return TTIImpl->getHistogramCost(Ty);
663+
}
664+
661665
bool TargetTransformInfo::isExpensiveToSpeculativelyExecute(
662666
const Instruction *I) const {
663667
return TTIImpl->isExpensiveToSpeculativelyExecute(I);

0 commit comments

Comments
 (0)