Skip to content

Commit 3918ef3

Browse files
[SLP]Fix the analysis for masked compress loads
Remove the check for Order in the interleaved-load analysis and estimate the shuffle cost before the reordering is applied, so that the costs of masked compress loads are handled correctly. Reviewers: hiraditya, HanKuanChen, RKSimon. Reviewed By: HanKuanChen, RKSimon. Pull Request: #140647
1 parent 48a2836 commit 3918ef3

File tree

4 files changed

+25
-26
lines changed

4 files changed

+25
-26
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5944,10 +5944,9 @@ static bool isMaskedLoadCompress(
59445944
// Check for potential segmented(interleaved) loads.
59455945
VectorType *AlignedLoadVecTy = getWidenedType(
59465946
ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
5947-
if (!isSafeToLoadUnconditionally(
5948-
Ptr0, AlignedLoadVecTy, CommonAlignment, DL,
5949-
cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC,
5950-
&DT, &TLI))
5947+
if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
5948+
DL, cast<LoadInst>(VL.back()), &AC, &DT,
5949+
&TLI))
59515950
AlignedLoadVecTy = LoadVecTy;
59525951
if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
59535952
CommonAlignment,
@@ -5957,25 +5956,22 @@ static bool isMaskedLoadCompress(
59575956
Instruction::Load, AlignedLoadVecTy,
59585957
CompressMask[1], std::nullopt, CommonAlignment,
59595958
LI->getPointerAddressSpace(), CostKind, IsMasked);
5960-
if (!Mask.empty())
5961-
InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
5962-
VecTy, Mask, CostKind);
59635959
if (InterleavedCost < GatherCost) {
59645960
InterleaveFactor = CompressMask[1];
59655961
LoadVecTy = AlignedLoadVecTy;
59665962
return true;
59675963
}
59685964
}
59695965
}
5966+
InstructionCost CompressCost = ::getShuffleCost(
5967+
TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
59705968
if (!Order.empty()) {
59715969
SmallVector<int> NewMask(Sz, PoisonMaskElem);
59725970
for (unsigned I : seq<unsigned>(Sz)) {
59735971
NewMask[I] = CompressMask[Mask[I]];
59745972
}
59755973
CompressMask.swap(NewMask);
59765974
}
5977-
InstructionCost CompressCost = ::getShuffleCost(
5978-
TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
59795975
InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
59805976
return TotalVecCost < GatherCost;
59815977
}
@@ -13553,10 +13549,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1355313549
SmallVector<Value *> PointerOps(Scalars.size());
1355413550
for (auto [I, V] : enumerate(Scalars))
1355513551
PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
13556-
(void)isMaskedLoadCompress(
13552+
[[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
1355713553
Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
1355813554
*TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
1355913555
CompressMask, LoadVecTy);
13556+
assert(IsVectorized && "Failed to vectorize load");
1356013557
CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
1356113558
InterleaveFactor, IsMasked);
1356213559
Align CommonAlignment = LI0->getAlign();

llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@ define i16 @test() {
1515
; CHECK-NEXT: [[PEDGE_061_I:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ null, [[ENTRY]] ]
1616
; CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr [[S]], ptr [[PEDGE_061_I]], i64 -1
1717
; CHECK-NEXT: [[PPREV_0_I]] = getelementptr [[S]], ptr [[PPREV_062_I]], i64 -1
18-
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[PPREV_0_I]], i64 4, <2 x i1> splat (i1 true), i32 2)
19-
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
20-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
21-
; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP3]], [[TMP2]]
18+
; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i16> @llvm.masked.load.v3i16.p0(ptr [[PPREV_0_I]], i32 2, <3 x i1> <i1 true, i1 false, i1 true>, <3 x i16> poison)
19+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i16> [[TMP1]], <3 x i16> poison, <2 x i32> <i32 0, i32 2>
20+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
21+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
22+
; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP4]], [[TMP3]]
2223
; CHECK-NEXT: br label [[WHILE_BODY_I]]
2324
;
2425
entry:

llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,20 @@ define void @test(ptr %mdct_forward_x) {
99
; CHECK: [[FOR_COND]]:
1010
; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[MDCT_FORWARD_X]], align 8
1111
; CHECK-NEXT: [[ARRAYIDX2_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32
12+
; CHECK-NEXT: [[ARRAYIDX5_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
1213
; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 24
1314
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0
1415
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer
1516
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP2]], <4 x i64> <i64 28, i64 36, i64 24, i64 28>
16-
; CHECK-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[ARRAYIDX2_I_I]], i64 -8, <2 x i1> splat (i1 true), i32 2)
17-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <2 x i32> zeroinitializer
18-
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 48, i64 40>
19-
; CHECK-NEXT: [[TMP7:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP6]], i32 4, <2 x i1> splat (i1 true), <2 x float> poison)
17+
; CHECK-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ADD_PTR_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison)
18+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <2 x i32> <i32 2, i32 0>
19+
; CHECK-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ARRAYIDX5_I_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison)
20+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <2 x i32> <i32 2, i32 0>
2021
; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x float> poison)
21-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
22-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
23-
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> <float poison, float poison, float 0.000000e+00, float poison>, <4 x float> [[TMP10]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 4>
22+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32> <i32 2, i32 0, i32 2, i32 2>
23+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
24+
; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
25+
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> <float poison, float poison, float 0.000000e+00, float poison>, <4 x float> [[TMP22]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
2426
; CHECK-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP4]], i64 0)
2527
; CHECK-NEXT: [[TMP13:%.*]] = fsub <4 x float> [[TMP9]], [[TMP12]]
2628
; CHECK-NEXT: [[TMP14:%.*]] = fadd <4 x float> [[TMP9]], [[TMP12]]

llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,16 @@ define void @test() {
99
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
1010
; CHECK-NEXT: [[ENTRY:.*:]]
1111
; CHECK-NEXT: [[M1:%.*]] = alloca [[STRUCT_AE:%.*]], align 8
12-
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[M1]], i64 8
12+
; CHECK-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 48
1313
; CHECK-NEXT: [[ARRAYIDX_I4:%.*]] = getelementptr i8, ptr null, i64 16
14-
; CHECK-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 40
1514
; CHECK-NEXT: [[TMP1:%.*]] = load <5 x double>, ptr [[M1]], align 8
1615
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 4>
16+
; CHECK-NEXT: [[TMP4:%.*]] = load <6 x double>, ptr [[M1]], align 8
17+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP4]], <6 x double> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
1718
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX_I5_I]], align 8
18-
; CHECK-NEXT: [[TMP4:%.*]] = load <7 x double>, ptr [[TMP0]], align 8
19-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <7 x double> [[TMP4]], <7 x double> poison, <4 x i32> <i32 5, i32 0, i32 3, i32 6>
2019
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
2120
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <5 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison>
22-
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> [[TMP7]], <4 x i32> <i32 0, i32 3, i32 4, i32 5>
21+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <5 x double> [[TMP7]], <5 x double> [[TMP1]], <4 x i32> <i32 0, i32 6, i32 9, i32 1>
2322
; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], [[TMP5]]
2423
; CHECK-NEXT: [[TMP10:%.*]] = fptosi <4 x double> [[TMP9]] to <4 x i32>
2524
; CHECK-NEXT: [[TMP11:%.*]] = sitofp <4 x i32> [[TMP10]] to <4 x double>

0 commit comments

Comments
 (0)