
Commit 9c388f1

[SLP]Prefer segmented/deinterleaved loads to strided and fix codegen
We need to estimate which form is preferable: deinterleaved/segmented loads or strided loads. Segmented loads can be combined, improving the overall performance.

Reviewers: RKSimon, hiraditya

Reviewed By: hiraditya, RKSimon

Pull Request: #135058
1 parent a5a6ae1 commit 9c388f1
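For context, a rough C++ equivalent of the stride-2 pattern exercised by the RISC-V test below is sketched here; the function name and signature are illustrative, not taken from the commit, and the loop stands in for the fully unrolled straight-line IR that SLP actually sees. Before this change SLP covered such a group of loads with an llvm.experimental.vp.strided.load; with this change, when the target reports the deinterleaved (segmented) form as legal and cheaper, it emits one wide (possibly masked) contiguous load followed by a deinterleaving shufflevector instead, as the updated CHECK lines show.

// Illustrative only: a scalar "sum of abs, stride 2" pattern similar to the
// sum_of_abs_stride_2 test below. SLP vectorizes the eight loads at
// a[0], a[2], ..., a[14]; after this commit they become one contiguous
// (masked) load plus a deinterleaving shuffle rather than a strided load.
#include <cstdint>
#include <cstdlib>

int sum_of_abs_stride_2(const int8_t *a) {
  int sum = 0;
  for (int i = 0; i < 8; ++i) // unrolled in the actual test IR
    sum += std::abs(static_cast<int>(a[2 * i]));
  return sum;
}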

File tree: 3 files changed (+49, -40 lines)


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 36 additions & 29 deletions
@@ -5923,9 +5923,9 @@ static bool isMaskedLoadCompress(
   // Check for very large distances between elements.
   if (*Diff / Sz >= MaxRegSize / 8)
     return false;
-  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
   LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
   auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
+  Align CommonAlignment = LI->getAlign();
   IsMasked = !isSafeToLoadUnconditionally(
       Ptr0, LoadVecTy, CommonAlignment, DL,
       cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
@@ -5964,26 +5964,28 @@ static bool isMaskedLoadCompress(
         TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
                                   LI->getPointerAddressSpace(), CostKind);
   } else {
-    CommonAlignment = LI->getAlign();
     LoadCost =
         TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
                             LI->getPointerAddressSpace(), CostKind);
   }
-  if (IsStrided) {
+  if (IsStrided && !IsMasked) {
     // Check for potential segmented(interleaved) loads.
-    if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1],
+    auto *AlignedLoadVecTy = getWidenedType(
+        ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
+    if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
                                          CommonAlignment,
                                          LI->getPointerAddressSpace())) {
       InstructionCost InterleavedCost =
           VectorGEPCost + TTI.getInterleavedMemoryOpCost(
-                              Instruction::Load, LoadVecTy, CompressMask[1],
-                              std::nullopt, CommonAlignment,
+                              Instruction::Load, AlignedLoadVecTy,
+                              CompressMask[1], std::nullopt, CommonAlignment,
                               LI->getPointerAddressSpace(), CostKind, IsMasked);
       if (!Mask.empty())
         InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                                             VecTy, Mask, CostKind);
       if (InterleavedCost < GatherCost) {
         InterleaveFactor = CompressMask[1];
+        LoadVecTy = AlignedLoadVecTy;
         return true;
       }
     }
@@ -6001,6 +6003,24 @@ static bool isMaskedLoadCompress(
   return TotalVecCost < GatherCost;
 }

+/// Checks if the \p VL can be transformed to a (masked)load + compress or
+/// (masked) interleaved load.
+static bool
+isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+                     ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+                     const DataLayout &DL, ScalarEvolution &SE,
+                     AssumptionCache &AC, const DominatorTree &DT,
+                     const TargetLibraryInfo &TLI,
+                     const function_ref<bool(Value *)> AreAllUsersVectorized) {
+  bool IsMasked;
+  unsigned InterleaveFactor;
+  SmallVector<int> CompressMask;
+  VectorType *LoadVecTy;
+  return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
+                              AreAllUsersVectorized, IsMasked, InterleaveFactor,
+                              CompressMask, LoadVecTy);
+}
+
 /// Checks if strided loads can be generated out of \p VL loads with pointers \p
 /// PointerOps:
 /// 1. Target with strided load support is detected.
@@ -6137,6 +6157,12 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       // Check that the sorted loads are consecutive.
       if (static_cast<unsigned>(*Diff) == Sz - 1)
         return LoadsState::Vectorize;
+      if (isMaskedLoadCompress(VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT,
+                               *TLI, [&](Value *V) {
+                                 return areAllUsersVectorized(
+                                     cast<Instruction>(V), UserIgnoreList);
+                               }))
+        return LoadsState::CompressVectorize;
       // Simple check if not a strided access - clear order.
       bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
       // Try to generate strided load node.
@@ -6150,18 +6176,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
           isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
                         IsAnyPointerUsedOutGraph, *Diff))
         return LoadsState::StridedVectorize;
-      bool IsMasked;
-      unsigned InterleaveFactor;
-      SmallVector<int> CompressMask;
-      VectorType *LoadVecTy;
-      if (isMaskedLoadCompress(
-              VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
-              [&](Value *V) {
-                return areAllUsersVectorized(cast<Instruction>(V),
-                                             UserIgnoreList);
-              },
-              IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
-        return LoadsState::CompressVectorize;
     }
     if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
         TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
@@ -13439,11 +13453,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       assert(IsVectorized && "Expected to be vectorized");
       CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                       InterleaveFactor, IsMasked);
-      Align CommonAlignment;
-      if (IsMasked)
-        CommonAlignment = computeCommonAlignment<LoadInst>(VL);
-      else
-        CommonAlignment = LI0->getAlign();
+      Align CommonAlignment = LI0->getAlign();
       if (InterleaveFactor) {
         VecLdCost = TTI->getInterleavedMemoryOpCost(
             Instruction::Load, LoadVecTy, InterleaveFactor, std::nullopt,
@@ -18049,14 +18059,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
       auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
           CompressEntryToData.at(E);
-      Align CommonAlignment;
-      if (IsMasked)
-        CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
-      else
-        CommonAlignment = LI->getAlign();
+      Align CommonAlignment = LI->getAlign();
       if (IsMasked) {
+        unsigned VF = getNumElements(LoadVecTy);
         SmallVector<Constant *> MaskValues(
-            getNumElements(LoadVecTy) / getNumElements(LI->getType()),
+            VF / getNumElements(LI->getType()),
            ConstantInt::getFalse(VecTy->getContext()));
         for (int I : CompressMask)
           MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
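A minimal sketch, assuming simplified placeholder predicates, of the ordering that BoUpSLP::canVectorizeLoads follows after this change: the (masked) load + compress / deinterleaved form is now evaluated before the strided form, so a segmented load wins whenever both would be legal and profitable. This is not LLVM code; the enumerator names follow the LoadsState values seen in the diff above, and the predicate bodies are stand-ins for the real TTI cost-model queries.

// Hedged sketch only: stand-in predicates with placeholder bodies so the
// example compiles; the real decisions come from TTI and the SLP cost model.
enum class LoadsState { Gather, Vectorize, CompressVectorize, StridedVectorize };

static bool loadsAreConsecutive() { return false; }           // *Diff == Sz - 1
static bool compressOrInterleavedIsCheaper() { return true; } // isMaskedLoadCompress(...)
static bool stridedLoadIsProfitable() { return true; }        // isStridedLoad(...)

static LoadsState classifySortedLoads() {
  if (loadsAreConsecutive())
    return LoadsState::Vectorize;
  // Moved ahead of the strided check by this commit.
  if (compressOrInterleavedIsCheaper())
    return LoadsState::CompressVectorize;
  if (stridedLoadIsProfitable())
    return LoadsState::StridedVectorize;
  return LoadsState::Gather; // fall back to gather / scalarization
}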

llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads-simple.ll

Lines changed: 12 additions & 10 deletions
@@ -5,11 +5,12 @@ define i32 @sum_of_abs_stride_2(ptr noalias %a, ptr noalias %b) {
 ; CHECK-LABEL: define i32 @sum_of_abs_stride_2
 ; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 [[A]], i64 2, <8 x i1> splat (i1 true), i32 8)
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP0]], i1 false)
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <15 x i8> @llvm.masked.load.v15i8.p0(ptr [[A]], i32 1, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x i8> poison)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <15 x i8> [[TMP0]], <15 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP1]], i1 false)
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
+; CHECK-NEXT:    ret i32 [[TMP4]]
 ;
 entry:
   %0 = load i8, ptr %a, align 1
@@ -57,11 +58,12 @@ define i32 @sum_of_abs_stride_3(ptr noalias %a, ptr noalias %b) {
 ; CHECK-LABEL: define i32 @sum_of_abs_stride_3
 ; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 [[A]], i64 3, <8 x i1> splat (i1 true), i32 8)
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP0]], i1 false)
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; CHECK-NEXT:    ret i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <22 x i8> @llvm.masked.load.v22i8.p0(ptr [[A]], i32 1, <22 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i8> poison)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <22 x i8> [[TMP0]], <22 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP1]], i1 false)
+; CHECK-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
+; CHECK-NEXT:    ret i32 [[TMP4]]
 ;
 entry:
   %0 = load i8, ptr %a, align 1

llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 define void @test(ptr noalias %0, ptr %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 4, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <16 x float> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 16, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <16 x float> poison)
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 15, i32 4, i32 5, i32 0, i32 2, i32 6, i32 7, i32 8>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> <i32 15, i32 4, i32 5, i32 15, i32 4, i32 5, i32 15, i32 0, i32 5, i32 2, i32 6, i32 7, i32 8, i32 6, i32 7, i32 8>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 4, i32 24, i32 15, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
