Skip to content

Commit c48446e

Browse files
alexey-bataev authored and DanielCChen committed
[SLP]Initial support for interleaved loads
Adds initial support for interleaved loads, which allows emission of segmented loads for RISCV RVV. Vectorizes extra code for RISCV CFP2006/447.dealII, CFP2006/453.povray, CFP2017rate/510.parest_r, CFP2017rate/511.povray_r, CFP2017rate/526.blender_r, CFP2017rate/538.imagick_r, CINT2006/403.gcc, CINT2006/473.astar, CINT2017rate/502.gcc_r, CINT2017rate/525.x264_r.
Reviewers: RKSimon, preames
Reviewed By: preames
Pull Request: llvm#112042
1 parent 95605ab commit c48446e

File tree

7 files changed

+379
-277
lines changed

7 files changed

+379
-277
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,12 @@ class TargetTransformInfo {
803803
/// Return true if the target supports strided load.
804804
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const;
805805

806+
/// Return true if the target supports interleaved access for the given vector
807+
/// type \p VTy, interleave factor \p Factor, alignment \p Alignment and
808+
/// address space \p AddrSpace.
809+
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
810+
Align Alignment, unsigned AddrSpace) const;
811+
806812
// Return true if the target supports masked vector histograms.
807813
bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const;
808814

@@ -1934,6 +1940,10 @@ class TargetTransformInfo::Concept {
19341940
virtual bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) = 0;
19351941
virtual bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) = 0;
19361942
virtual bool isLegalStridedLoadStore(Type *DataType, Align Alignment) = 0;
1943+
// Pure-virtual hook behind TargetTransformInfo::isLegalInterleavedAccessType;
// takes the same vector type, factor, alignment and address space arguments.
virtual bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
1944+
Align Alignment,
1945+
unsigned AddrSpace) = 0;
1946+
19371947
virtual bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) = 0;
19381948
virtual bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
19391949
unsigned Opcode1,
@@ -2456,6 +2466,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
24562466
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) override {
24572467
return Impl.isLegalStridedLoadStore(DataType, Alignment);
24582468
}
2469+
// Concept override: forwards the interleaved-access legality query, with all
// four arguments unchanged, to the wrapped Impl object.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
2470+
Align Alignment,
2471+
unsigned AddrSpace) override {
2472+
return Impl.isLegalInterleavedAccessType(VTy, Factor, Alignment, AddrSpace);
2473+
}
24592474
bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) override {
24602475
return Impl.isLegalMaskedVectorHistogram(AddrType, DataType);
24612476
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,11 @@ class TargetTransformInfoImplBase {
332332
return false;
333333
}
334334

335+
/// Default implementation: reports interleaved access as not legal for every
/// vector type, factor, alignment and address space. Declared const so it
/// matches the isLegalMaskedVectorHistogram hook that follows it and can be
/// called on a const object.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
336+
Align Alignment, unsigned AddrSpace) const {
337+
return false;
338+
}
339+
335340
bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const {
336341
return false;
337342
}

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,13 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType,
525525
return TTIImpl->isLegalStridedLoadStore(DataType, Alignment);
526526
}
527527

528+
// Public entry point: delegates the interleaved-access legality query to the
// TTIImpl object, passing all four arguments through unchanged.
bool TargetTransformInfo::isLegalInterleavedAccessType(
529+
VectorType *VTy, unsigned Factor, Align Alignment,
530+
unsigned AddrSpace) const {
531+
return TTIImpl->isLegalInterleavedAccessType(VTy, Factor, Alignment,
532+
AddrSpace);
533+
}
534+
528535
bool TargetTransformInfo::isLegalMaskedVectorHistogram(Type *AddrType,
529536
Type *DataType) const {
530537
return TTIImpl->isLegalMaskedVectorHistogram(AddrType, DataType);

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,12 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
295295
return TLI->isLegalStridedLoadStore(DataTypeVT, Alignment);
296296
}
297297

298+
// RISC-V answer to the interleaved-access legality query: delegates to TLI,
// forwarding all four arguments and additionally passing DL.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
299+
Align Alignment, unsigned AddrSpace) {
300+
return TLI->isLegalInterleavedAccessType(VTy, Factor, Alignment, AddrSpace,
301+
DL);
302+
}
303+
298304
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment);
299305

300306
bool isVScaleKnownToBeAPowerOfTwo() const {

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 127 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2922,7 +2922,7 @@ class BoUpSLP {
29222922

29232923
/// This is the recursive part of buildTree.
29242924
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2925-
const EdgeInfo &EI);
2925+
const EdgeInfo &EI, unsigned InterleaveFactor = 0);
29262926

29272927
/// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
29282928
/// be vectorized to use the original vector (or aggregate "bitcast" to a
@@ -3226,7 +3226,15 @@ class BoUpSLP {
32263226
Instruction *MainOp = nullptr;
32273227
Instruction *AltOp = nullptr;
32283228

3229+
/// Interleave factor of a Vectorize node that represents interleaved loads
/// (0 means the node is not interleaved).
3230+
unsigned InterleaveFactor = 0;
3231+
32293232
public:
3233+
/// Returns interleave factor for interleave nodes.
3234+
unsigned getInterleaveFactor() const { return InterleaveFactor; }
3235+
/// Sets interleaving factor for the interleaving nodes.
3236+
void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3237+
32303238
/// Set this bundle's \p OpIdx'th operand to \p OpVL.
32313239
void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
32323240
if (Operands.size() < OpIdx + 1)
@@ -3390,7 +3398,12 @@ class BoUpSLP {
33903398
dbgs() << "State: ";
33913399
switch (State) {
33923400
case Vectorize:
3393-
dbgs() << "Vectorize\n";
3401+
if (InterleaveFactor > 0) {
3402+
dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3403+
<< "\n";
3404+
} else {
3405+
dbgs() << "Vectorize\n";
3406+
}
33943407
break;
33953408
case ScatterVectorize:
33963409
dbgs() << "ScatterVectorize\n";
@@ -3460,11 +3473,15 @@ class BoUpSLP {
34603473
const InstructionsState &S,
34613474
const EdgeInfo &UserTreeIdx,
34623475
ArrayRef<int> ReuseShuffleIndices = {},
3463-
ArrayRef<unsigned> ReorderIndices = {}) {
3476+
ArrayRef<unsigned> ReorderIndices = {},
3477+
unsigned InterleaveFactor = 0) {
34643478
TreeEntry::EntryState EntryState =
34653479
Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3466-
return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3467-
ReuseShuffleIndices, ReorderIndices);
3480+
TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3481+
ReuseShuffleIndices, ReorderIndices);
3482+
if (E && InterleaveFactor > 0)
3483+
E->setInterleave(InterleaveFactor);
3484+
return E;
34683485
}
34693486

34703487
TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
@@ -6849,7 +6866,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
68496866
return Results;
68506867
};
68516868
auto ProcessGatheredLoads =
6852-
[&](ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
6869+
[&, &TTI = *TTI](
6870+
ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
68536871
bool Final = false) {
68546872
SmallVector<LoadInst *> NonVectorized;
68556873
for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
@@ -6932,11 +6950,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
69326950
// distance between scalar loads in these nodes.
69336951
unsigned MaxVF = Slice.size();
69346952
unsigned UserMaxVF = 0;
6953+
unsigned InterleaveFactor = 0;
69356954
if (MaxVF == 2) {
69366955
UserMaxVF = MaxVF;
69376956
} else {
6957+
// Found distance between segments of the interleaved loads.
6958+
std::optional<unsigned> InterleavedLoadsDistance = 0;
6959+
unsigned Order = 0;
69386960
std::optional<unsigned> CommonVF = 0;
69396961
DenseMap<const TreeEntry *, unsigned> EntryToPosition;
6962+
SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
69406963
for (auto [Idx, V] : enumerate(Slice)) {
69416964
for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
69426965
UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
@@ -6951,12 +6974,59 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
69516974
if (*CommonVF != E->Scalars.size())
69526975
CommonVF.reset();
69536976
}
6977+
// Check if the load is the part of the interleaved load.
6978+
if (Pos != Idx && InterleavedLoadsDistance) {
6979+
if (!DeinterleavedNodes.contains(E) &&
6980+
any_of(E->Scalars, [&, Slice = Slice](Value *V) {
6981+
if (isa<Constant>(V))
6982+
return false;
6983+
if (getTreeEntry(V))
6984+
return true;
6985+
const auto &Nodes = ValueToGatherNodes.at(V);
6986+
return (Nodes.size() != 1 || !Nodes.contains(E)) &&
6987+
!is_contained(Slice, V);
6988+
})) {
6989+
InterleavedLoadsDistance.reset();
6990+
continue;
6991+
}
6992+
DeinterleavedNodes.insert(E);
6993+
if (*InterleavedLoadsDistance == 0) {
6994+
InterleavedLoadsDistance = Idx - Pos;
6995+
continue;
6996+
}
6997+
if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
6998+
(Idx - Pos) / *InterleavedLoadsDistance < Order)
6999+
InterleavedLoadsDistance.reset();
7000+
Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7001+
}
7002+
}
7003+
}
7004+
DeinterleavedNodes.clear();
7005+
// Check if the large load represents interleaved load operation.
7006+
if (InterleavedLoadsDistance.value_or(0) > 1 &&
7007+
CommonVF.value_or(0) != 0) {
7008+
InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
7009+
unsigned VF = *CommonVF;
7010+
OrdersType Order;
7011+
SmallVector<Value *> PointerOps;
7012+
// Segmented load detected - vectorize at maximum vector factor.
7013+
if (TTI.isLegalInterleavedAccessType(
7014+
getWidenedType(Slice.front()->getType(), VF),
7015+
InterleaveFactor,
7016+
cast<LoadInst>(Slice.front())->getAlign(),
7017+
cast<LoadInst>(Slice.front())
7018+
->getPointerAddressSpace()) &&
7019+
canVectorizeLoads(Slice, Slice.front(), Order,
7020+
PointerOps) == LoadsState::Vectorize) {
7021+
UserMaxVF = InterleaveFactor * VF;
7022+
} else {
7023+
InterleaveFactor = 0;
69547024
}
69557025
}
69567026
// Cannot represent the loads as consecutive vectorizable nodes -
69577027
// just exit.
69587028
unsigned ConsecutiveNodesSize = 0;
6959-
if (!LoadEntriesToVectorize.empty() &&
7029+
if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
69607030
any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
69617031
[&, Slice = Slice](const auto &P) {
69627032
const auto *It = find_if(Slice, [&](Value *V) {
@@ -6976,7 +7046,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
69767046
continue;
69777047
// Try to build long masked gather loads.
69787048
UserMaxVF = bit_ceil(UserMaxVF);
6979-
if (any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7049+
if (InterleaveFactor == 0 &&
7050+
any_of(seq<unsigned>(Slice.size() / UserMaxVF),
69807051
[&, Slice = Slice](unsigned Idx) {
69817052
OrdersType Order;
69827053
SmallVector<Value *> PointerOps;
@@ -7008,9 +7079,15 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
70087079
}))
70097080
continue;
70107081
unsigned Sz = VectorizableTree.size();
7011-
buildTree_rec(SubSlice, 0, EdgeInfo());
7082+
buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
70127083
if (Sz == VectorizableTree.size()) {
70137084
IsVectorized = false;
7085+
// Try non-interleaved vectorization with smaller vector
7086+
// factor.
7087+
if (InterleaveFactor > 0) {
7088+
VF = 2 * (MaxVF / InterleaveFactor);
7089+
InterleaveFactor = 0;
7090+
}
70147091
continue;
70157092
}
70167093
}
@@ -7374,6 +7451,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
73747451
}
73757452
return TreeEntry::ScatterVectorize;
73767453
case LoadsState::StridedVectorize:
7454+
if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7455+
// Delay slow vectorized nodes for better vectorization attempts.
7456+
LoadEntriesToVectorize.insert(VectorizableTree.size());
7457+
return TreeEntry::NeedToGather;
7458+
}
73777459
return TreeEntry::StridedVectorize;
73787460
case LoadsState::Gather:
73797461
#ifndef NDEBUG
@@ -7707,7 +7789,8 @@ class PHIHandler {
77077789
} // namespace
77087790

77097791
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
7710-
const EdgeInfo &UserTreeIdx) {
7792+
const EdgeInfo &UserTreeIdx,
7793+
unsigned InterleaveFactor) {
77117794
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
77127795

77137796
SmallVector<int> ReuseShuffleIndices;
@@ -8185,7 +8268,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
81858268
switch (State) {
81868269
case TreeEntry::Vectorize:
81878270
TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8188-
ReuseShuffleIndices, CurrentOrder);
8271+
ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
81898272
if (CurrentOrder.empty())
81908273
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
81918274
else
@@ -9895,6 +9978,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
98959978
Idx = EMask[Idx];
98969979
}
98979980
CommonVF = E->Scalars.size();
9981+
} else if (unsigned Factor = E->getInterleaveFactor();
9982+
// getInterleaveFactor() returns a plain unsigned (0 for non-interleaved
// nodes), so compare against zero explicitly; wrapping the value in
// std::optional would make the presence check always true.
Factor > 0 && E->Scalars.size() != Mask.size() &&
9983+
ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
9984+
Factor)) {
9985+
// Deinterleaved nodes are free.
9986+
std::iota(CommonMask.begin(), CommonMask.end(), 0);
98989987
}
98999988
ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
99009989
V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
@@ -10968,23 +11057,38 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1096811057
auto *LI0 = cast<LoadInst>(VL0);
1096911058
auto GetVectorCost = [&](InstructionCost CommonCost) {
1097011059
InstructionCost VecLdCost;
10971-
if (E->State == TreeEntry::Vectorize) {
10972-
VecLdCost = TTI->getMemoryOpCost(
10973-
Instruction::Load, VecTy, LI0->getAlign(),
10974-
LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
10975-
} else if (E->State == TreeEntry::StridedVectorize) {
11060+
switch (E->State) {
11061+
case TreeEntry::Vectorize:
11062+
if (unsigned Factor = E->getInterleaveFactor()) {
11063+
VecLdCost = TTI->getInterleavedMemoryOpCost(
11064+
Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11065+
LI0->getPointerAddressSpace(), CostKind);
11066+
11067+
} else {
11068+
VecLdCost = TTI->getMemoryOpCost(
11069+
Instruction::Load, VecTy, LI0->getAlign(),
11070+
LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
11071+
}
11072+
break;
11073+
case TreeEntry::StridedVectorize: {
1097611074
Align CommonAlignment =
1097711075
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
1097811076
VecLdCost = TTI->getStridedMemoryOpCost(
1097911077
Instruction::Load, VecTy, LI0->getPointerOperand(),
1098011078
/*VariableMask=*/false, CommonAlignment, CostKind);
10981-
} else {
10982-
assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
11079+
break;
11080+
}
11081+
case TreeEntry::ScatterVectorize: {
1098311082
Align CommonAlignment =
1098411083
computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
1098511084
VecLdCost = TTI->getGatherScatterOpCost(
1098611085
Instruction::Load, VecTy, LI0->getPointerOperand(),
1098711086
/*VariableMask=*/false, CommonAlignment, CostKind);
11087+
break;
11088+
}
11089+
case TreeEntry::CombinedVectorize:
11090+
case TreeEntry::NeedToGather:
11091+
llvm_unreachable("Unexpected vectorization state.");
1098811092
}
1098911093
return VecLdCost + CommonCost;
1099011094
};
@@ -11397,6 +11501,11 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
1139711501
}))
1139811502
return false;
1139911503

11504+
if (VectorizableTree.back()->isGather() &&
11505+
VectorizableTree.back()->isAltShuffle() &&
11506+
VectorizableTree.back()->getVectorFactor() > 2)
11507+
return false;
11508+
1140011509
assert(VectorizableTree.empty()
1140111510
? ExternalUses.empty()
1140211511
: true && "We shouldn't have any external users");

0 commit comments

Comments
 (0)