[SLP]Fix the analysis for masked compress loads #140647
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Need to remove the check for Orders in interleaved loads analysis and estimate shuffle cost without the reordering to correctly handle the costs of masked compress loads.

Full diff: https://github.com/llvm/llvm-project/pull/140647.diff

4 Files Affected:
- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
- llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
- llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
- llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index db4a5713a49a2..2ff93a02fb5c9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5985,10 +5985,9 @@ static bool isMaskedLoadCompress(
// Check for potential segmented(interleaved) loads.
VectorType *AlignedLoadVecTy = getWidenedType(
ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
- if (!isSafeToLoadUnconditionally(
- Ptr0, AlignedLoadVecTy, CommonAlignment, DL,
- cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC,
- &DT, &TLI))
+ if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
+ DL, cast<LoadInst>(VL.back()), &AC, &DT,
+ &TLI))
AlignedLoadVecTy = LoadVecTy;
if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
CommonAlignment,
@@ -5998,9 +5997,6 @@ static bool isMaskedLoadCompress(
Instruction::Load, AlignedLoadVecTy,
CompressMask[1], std::nullopt, CommonAlignment,
LI->getPointerAddressSpace(), CostKind, IsMasked);
- if (!Mask.empty())
- InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
- VecTy, Mask, CostKind);
if (InterleavedCost < GatherCost) {
InterleaveFactor = CompressMask[1];
LoadVecTy = AlignedLoadVecTy;
@@ -6008,6 +6004,8 @@ static bool isMaskedLoadCompress(
}
}
}
+ InstructionCost CompressCost = ::getShuffleCost(
+ TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
if (!Order.empty()) {
SmallVector<int> NewMask(Sz, PoisonMaskElem);
for (unsigned I : seq<unsigned>(Sz)) {
@@ -6015,8 +6013,6 @@ static bool isMaskedLoadCompress(
}
CompressMask.swap(NewMask);
}
- InstructionCost CompressCost = ::getShuffleCost(
- TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
return TotalVecCost < GatherCost;
}
@@ -13626,10 +13622,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallVector<Value *> PointerOps(Scalars.size());
for (auto [I, V] : enumerate(Scalars))
PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
- (void)isMaskedLoadCompress(
+ [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
*TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
CompressMask, LoadVecTy);
+ assert(IsVectorized && "Failed to vectorize load");
CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
InterleaveFactor, IsMasked);
Align CommonAlignment = LI0->getAlign();
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
index bce0884e92925..07094c642f8da 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
@@ -15,10 +15,11 @@ define i16 @test() {
; CHECK-NEXT: [[PEDGE_061_I:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ null, [[ENTRY]] ]
; CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr [[S]], ptr [[PEDGE_061_I]], i64 -1
; CHECK-NEXT: [[PPREV_0_I]] = getelementptr [[S]], ptr [[PPREV_062_I]], i64 -1
-; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[PPREV_0_I]], i64 4, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
-; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i16> @llvm.masked.load.v3i16.p0(ptr [[PPREV_0_I]], i32 2, <3 x i1> <i1 true, i1 false, i1 true>, <3 x i16> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i16> [[TMP1]], <3 x i16> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
+; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP4]], [[TMP3]]
; CHECK-NEXT: br label [[WHILE_BODY_I]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
index 1b65a7ac1c311..4dd659a7ae802 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
@@ -9,18 +9,20 @@ define void @test(ptr %mdct_forward_x) {
; CHECK: [[FOR_COND]]:
; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[MDCT_FORWARD_X]], align 8
; CHECK-NEXT: [[ARRAYIDX2_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32
+; CHECK-NEXT: [[ARRAYIDX5_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 24
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP2]], <4 x i64> <i64 28, i64 36, i64 24, i64 28>
-; CHECK-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[ARRAYIDX2_I_I]], i64 -8, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 48, i64 40>
-; CHECK-NEXT: [[TMP7:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP6]], i32 4, <2 x i1> splat (i1 true), <2 x float> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ADD_PTR_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ARRAYIDX5_I_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison)
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <2 x i32> <i32 2, i32 0>
; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x float> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> <float poison, float poison, float 0.000000e+00, float poison>, <4 x float> [[TMP10]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 4>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32> <i32 2, i32 0, i32 2, i32 2>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> <float poison, float poison, float 0.000000e+00, float poison>, <4 x float> [[TMP22]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
; CHECK-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP4]], i64 0)
; CHECK-NEXT: [[TMP13:%.*]] = fsub <4 x float> [[TMP9]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = fadd <4 x float> [[TMP9]], [[TMP12]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
index 843d1cf46ffcc..7d65fe1bcde76 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
@@ -9,17 +9,16 @@ define void @test() {
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[M1:%.*]] = alloca [[STRUCT_AE:%.*]], align 8
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[M1]], i64 8
+; CHECK-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 48
; CHECK-NEXT: [[ARRAYIDX_I4:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 40
; CHECK-NEXT: [[TMP1:%.*]] = load <5 x double>, ptr [[M1]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = load <6 x double>, ptr [[M1]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP4]], <6 x double> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX_I5_I]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <7 x double>, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <7 x double> [[TMP4]], <7 x double> poison, <4 x i32> <i32 5, i32 0, i32 3, i32 6>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <5 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> [[TMP7]], <4 x i32> <i32 0, i32 3, i32 4, i32 5>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <5 x double> [[TMP7]], <5 x double> [[TMP1]], <4 x i32> <i32 0, i32 6, i32 9, i32 1>
; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = fptosi <4 x double> [[TMP9]] to <4 x i32>
; CHECK-NEXT: [[TMP11:%.*]] = sitofp <4 x i32> [[TMP10]] to <4 x double>
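For intuition, here is a standalone sketch of what the pass calls a compress mask: the scalars are loaded as one widened contiguous (masked) load, and the demanded lanes are then gathered by a single shuffle, as in the `@llvm.masked.load` plus `shufflevector` pairs in the CHECK lines above. Everything here is illustration only, not LLVM code; the element offsets are made up.

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical example: four scalar loads at element offsets 0, 2, 3 and 5
  // within a widened 6-element vector (gaps at offsets 1 and 4).
  std::vector<int> Offsets = {0, 2, 3, 5};

  // Mask for the wide masked load: true for the demanded lanes only.
  std::vector<bool> LoadMask(6, false);
  for (int O : Offsets)
    LoadMask[O] = true;

  // The compress shuffle picks the demanded lanes in ascending offset order,
  // e.g. <i32 0, i32 2, i32 3, i32 5> in shufflevector terms.
  std::vector<int> CompressMask(Offsets.begin(), Offsets.end());

  printf("load mask:     ");
  for (bool B : LoadMask)
    printf("%d ", (int)B);
  printf("\ncompress mask: ");
  for (int M : CompressMask)
    printf("%d ", M);
  printf("\n");
}
```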
LGTM
Need to remove the check for Orders in interleaved loads analysis and estimate shuffle cost without the reordering to correctly handle the costs of masked compress loads.

Reviewers: hiraditya, HanKuanChen, RKSimon

Reviewed By: HanKuanChen, RKSimon

Pull Request: llvm/llvm-project#140647
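To see why the patch computes `CompressCost` before the `Order` remap rather than after it, here is a standalone C++ sketch with a toy per-lane cost model; the cost model and the remap body are assumptions for illustration only (the actual loop body is elided in the hunk above), not the LLVM implementation.

```cpp
#include <cstdio>
#include <vector>

// Toy cost model (assumption): one unit per lane that moves.
static int shuffleCost(const std::vector<int> &Mask) {
  int Cost = 0;
  for (size_t I = 0; I < Mask.size(); ++I)
    if (Mask[I] != (int)I)
      ++Cost;
  return Cost;
}

int main() {
  std::vector<int> CompressMask = {0, 2, 3, 5}; // demanded lanes
  std::vector<unsigned> Order = {1, 0, 3, 2};   // hypothetical reordering

  // The patch takes the shuffle cost on the un-reordered compress mask, so
  // the estimate reflects the pure compress shuffle.
  printf("compress cost: %d\n", shuffleCost(CompressMask));

  // Only afterwards is the reordering folded into the mask (a guess at the
  // shape of the elided remap: NewMask[I] = CompressMask[Order[I]]).
  std::vector<int> NewMask(CompressMask.size());
  for (size_t I = 0; I < Order.size(); ++I)
    NewMask[I] = CompressMask[Order[I]];
  CompressMask.swap(NewMask);

  printf("reordered mask: ");
  for (int M : CompressMask)
    printf("%d ", M);
  printf("\n");
}
```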