-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SLP]Buildvector for alternate instructions with non-profitable gather operands. #84978
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Buildvector for alternate instructions with non-profitable gather operands. #84978
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) Changesoperands. If the operands of the potentially alternate node are going to produce Patch is 24.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/84978.diff 8 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b8b67609d755fd..085a3b356cf506 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2235,7 +2235,7 @@ class BoUpSLP {
/// of the cost, considered to be good enough score.
std::optional<int>
findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
- int Limit = LookAheadHeuristics::ScoreFail) {
+ int Limit = LookAheadHeuristics::ScoreFail) const {
LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
RootLookAheadMaxDepth);
int BestScore = Limit;
@@ -6019,6 +6019,100 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return TreeEntry::NeedToGather;
}
+ // Check that the operand node does not generate buildvector sequence. If it
+ // is, then probably not worth it to build alternate shuffle, if number of
+ // buildvector operands + alternate instruction > than the number of
+ // buildvector instructions.
+ SmallVector<ValueList> Operands;
+ for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
+ Operands.emplace_back();
+ // Prepare the operand vector.
+ for (Value *V : VL)
+ Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
+ }
+ if (Operands.size() == 2) {
+ // Try find best operands candidates.
+ for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
+ SmallVector<std::pair<Value *, Value *>> Candidates(3);
+ Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
+ Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
+ Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
+ std::optional<int> Res = findBestRootPair(Candidates);
+ switch (Res.value_or(0)) {
+ case 0:
+ break;
+ case 1:
+ std::swap(Operands[0][I + 1], Operands[1][I + 1]);
+ break;
+ case 2:
+ std::swap(Operands[0][I], Operands[1][I]);
+ break;
+ default:
+ llvm_unreachable("Unexpected index.");
+ }
+ }
+ }
+ DenseSet<unsigned> UniqueOpcodes;
+ constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
+ unsigned NonInstCnt = 0;
+ unsigned UndefCnt = 0;
+ unsigned ExtraShuffleInsts = 0;
+ if (Operands.size() == 2) {
+ // Do not count same operands twice.
+ if (Operands.front() == Operands.back()) {
+ Operands.erase(Operands.begin());
+ } else if (!allConstant(Operands.front()) &&
+ all_of(Operands.front(), [&](Value *V) {
+ return is_contained(Operands.back(), V);
+ })) {
+ Operands.erase(Operands.begin());
+ ++ExtraShuffleInsts;
+ }
+ }
+ const Loop *L = LI->getLoopFor(VL0->getParent());
+ if (any_of(Operands,
+ [&](ArrayRef<Value *> Op) {
+ if (allConstant(Op) ||
+ (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
+ getSameOpcode(Op, *TLI).MainOp))
+ return false;
+ DenseMap<Value *, unsigned> Uniques;
+ for (Value *V : Op) {
+ if (isa<Constant, ExtractElementInst>(V) ||
+ getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
+ if (isa<UndefValue>(V))
+ ++UndefCnt;
+ continue;
+ }
+ auto Res = Uniques.try_emplace(V, 0);
+ // Found first duplicate - need to add shuffle.
+ if (!Res.second && Res.first->second == 1)
+ ++ExtraShuffleInsts;
+ ++Res.first->getSecond();
+ if (auto *I = dyn_cast<Instruction>(V))
+ UniqueOpcodes.insert(I->getOpcode());
+ else if (Res.second)
+ ++NonInstCnt;
+ }
+ if (any_of(Uniques, [&](const auto &P) {
+ return P.first->hasNUsesOrMore(P.second + 1) &&
+ none_of(P.first->users(), [&](User *U) {
+ return getTreeEntry(U) || Uniques.contains(U);
+ });
+ }))
+ return false;
+ return true;
+ }) &&
+ (UndefCnt >= (VL.size() - 1) * VL0->getNumOperands() ||
+ (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
+ NumAltInsts) >= VL0->getNumOperands() * VL.size())) {
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: ShuffleVector not vectorized, operands are buildvector and "
+ "the whole alt sequence is not profitable.\n");
+ return TreeEntry::NeedToGather;
+ }
+
return TreeEntry::Vectorize;
}
default:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index 44542f32bf145d..5e0cd92caf9258 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -103,16 +103,16 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
; CHECK-NEXT: [[AND95:%.*]] = and i32 [[B_0278]], 1
; CHECK-NEXT: [[SHR96]] = lshr i32 [[A_0279]], 1
; CHECK-NEXT: [[SHR97]] = lshr i32 [[B_0278]], 1
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[AND94]], i32 0
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <2 x i32> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[AND95]], i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <2 x i32> [[TMP28]], zeroinitializer
-; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <2 x i32> [[TMP28]], zeroinitializer
-; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[AND94]], 0
+; CHECK-NEXT: [[TOBOOL98:%.*]] = icmp ne i32 [[AND95]], 0
+; CHECK-NEXT: [[TOBOOL100:%.*]] = icmp eq i32 [[AND94]], 0
+; CHECK-NEXT: [[TOBOOL103:%.*]] = icmp eq i32 [[AND95]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL100]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TOBOOL]], i32 1
+; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL98]], i32 0
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i1> [[TMP25]], i1 [[TOBOOL103]], i32 1
+; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i1> [[TMP27]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32>
; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]]
@@ -148,16 +148,16 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
; CHECK-NEXT: [[AND134:%.*]] = and i32 [[B_1300]], 1
; CHECK-NEXT: [[SHR135]] = lshr i32 [[A_1301]], 1
; CHECK-NEXT: [[SHR136]] = lshr i32 [[B_1300]], 1
-; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i32> poison, i32 [[AND133]], i32 0
-; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <2 x i32> [[TMP40]], zeroinitializer
-; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <2 x i32> [[TMP40]], zeroinitializer
-; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
-; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[AND134]], i32 0
-; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <2 x i32> [[TMP45]], zeroinitializer
-; CHECK-NEXT: [[TMP47:%.*]] = icmp eq <2 x i32> [[TMP45]], zeroinitializer
-; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
+; CHECK-NEXT: [[TOBOOL137:%.*]] = icmp ne i32 [[AND133]], 0
+; CHECK-NEXT: [[TOBOOL139:%.*]] = icmp ne i32 [[AND134]], 0
+; CHECK-NEXT: [[TOBOOL144:%.*]] = icmp eq i32 [[AND133]], 0
+; CHECK-NEXT: [[TOBOOL147:%.*]] = icmp eq i32 [[AND134]], 0
+; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL144]], i32 0
+; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i1> [[TMP40]], i1 [[TOBOOL137]], i32 1
+; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <4 x i1> [[TMP41]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL139]], i32 0
+; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP42]], i1 [[TOBOOL147]], i32 1
+; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <4 x i1> [[TMP39]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i1> [[TMP49]] to <4 x i32>
; CHECK-NEXT: [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
index 78fc3a60f05142..3c3dea3f1ea886 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
@@ -4,14 +4,12 @@
define void @test(i16 %0) {
; CHECK-LABEL: @test(
; CHECK-NEXT: for.body92.preheader:
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> <i16 0, i16 poison>, i16 [[TMP0:%.*]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>, <4 x i32> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 6, i32 3>
; CHECK-NEXT: br label [[FOR_BODY92:%.*]]
; CHECK: for.body92:
+; CHECK-NEXT: [[CONV177_I:%.*]] = sext i16 0 to i32
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[TMP0:%.*]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>, i32 [[CONV177_I]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP1]], i32 2
; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP6]]
; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr undef, align 8
; CHECK-NEXT: br label [[FOR_BODY92]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
index 16ede231c200ec..19a8aa9b618156 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
@@ -6,21 +6,19 @@ define i64 @foo() {
; CHECK-NEXT: bb:
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i64> [ [[TMP5:%.*]], [[BB3]] ]
+; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[ADD:%.*]], [[BB3]] ]
+; CHECK-NEXT: [[PHI2:%.*]] = phi i64 [ [[TMP9:%.*]], [[BB3]] ]
; CHECK-NEXT: ret i64 0
; CHECK: bb3:
; CHECK-NEXT: [[PHI5:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ 0, [[BB3]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, [[BB]] ], [ [[TMP7:%.*]], [[BB3]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[PHI5]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> <i64 poison, i64 0>, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP7]] = add <2 x i64> [[TMP6]], [[TMP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1
-; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+; CHECK-NEXT: [[ADD]] = add i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 0
+; CHECK-NEXT: [[TMP9]] = or i64 [[PHI5]], 0
; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0
+; CHECK-NEXT: [[TMP7]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[ADD]], i32 0
; CHECK-NEXT: br i1 false, label [[BB3]], label [[BB1:%.*]]
;
bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
index 3a9eca2bf2e6b6..59cd1c0ccddf8c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
@@ -4,22 +4,22 @@
define void @foo() {
; CHECK-LABEL: define void @foo() {
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP2]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[SHL]], i32 0
; CHECK-NEXT: [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
; CHECK-NEXT: [[CALL:%.*]] = call i64 null(i32 [[TMP7]])
; CHECK-NEXT: br label [[BB4]]
; CHECK: bb4:
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
; CHECK-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1]]
; CHECK: bb5:
-; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[TMP4]], [[BB4]] ]
+; CHECK-NEXT: [[PHI6:%.*]] = phi i32 [ [[SHL]], [[BB4]] ]
+; CHECK-NEXT: [[PHI7:%.*]] = phi i32 [ [[TMP8]], [[BB4]] ]
; CHECK-NEXT: ret void
;
bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
index 09b3d25fd6dc03..65560422da0b74 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
@@ -111,11 +111,10 @@ define void @addsub_and_external_users(ptr %A, ptr %ptr) {
; CHECK-LABEL: @addsub_and_external_users(
; CHECK-NEXT: bb1:
; CHECK-NEXT: [[LD:%.*]] = load double, ptr undef, align 8
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[SHUFFLE]], <double 1.100000e+00, double 1.200000e+00>
-; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[SHUFFLE]], <double 1.100000e+00, double 1.200000e+00>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[SUB1:%.*]] = fsub double [[LD]], 1.100000e+00
+; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[LD]], 1.200000e+00
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[SUB1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP0]], double [[ADD2]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], <double 2.100000e+00, double 2.200000e+00>
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], <double 3.100000e+00, double 3.200000e+00>
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
@@ -158,11 +157,10 @@ define void @subadd_and_external_users(ptr %A, ptr %ptr) {
; CHECK-LABEL: @subadd_and_external_users(
; CHECK-NEXT: bb1:
; CHECK-NEXT: [[LD:%.*]] = load double, ptr undef, align 8
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[SHUFFLE]], <double 1.200000e+00, double 1.100000e+00>
-; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[SHUFFLE]], <double 1.200000e+00, double 1.100000e+00>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[LD]], 1.100000e+00
+; CHECK-NEXT: [[SUB2:%.*]] = fsub double [[LD]], 1.200000e+00
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[SUB2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP0]], double [[ADD1]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], <double 2.200000e+00, double 2.100000e+00>
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], <double 3.200000e+00, double 3.100000e+00>
; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[A:%.*]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
index 17f9f371ff6ef9..813e94ab83adcc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
@@ -18,12 +18,10 @@ define void @foo() {
; CHECK: bb4:
; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double>
; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[CONV2]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[TMP3]], [[CONV2]]
+; CHECK-NEXT: [[SUB1:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement...
[truncated]
|
Ping! |
…re profitable Created using spr 1.3.5
Ping! |
Can you provide some performance data (SPEC/llvm-test-suite)? |
Created using spr 1.3.5
Done |
Ping! |
Ping! |
2 similar comments
Ping! |
Ping! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with one minor
} | ||
} | ||
const Loop *L = LI->getLoopFor(S.MainOp->getParent()); | ||
return none_of(Operands, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a comment describing this
Created using spr 1.3.5
@@ -6074,6 +6194,14 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( | |||
LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); | |||
return TreeEntry::NeedToGather; | |||
} | |||
if (!areAltOperandsProfitable(S, VL)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Alexey. Can you please put this under an option (let it be true by default)?
Here is the problem: sometime we create LIT tests deliberately short and force to go through via just making slp-threshold very low. We merely need SLP vectorizer to execute specific path even though it could be unprofitable in a test case scenario. But since this new staff does not check cost threshold at all, we observe changing some tests behavior and they no longer serve their purpose. Having the option we could fix that.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Valery, you can add this option in the downstream compiler, should not be a very big change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Of course we can. That will just adds another piece of divergence for things that should not really have existed from the very beginning. I believe similar problem may exist in llvm-project.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here is the example (in this patch): gather-move-out-of-loop
The test intent was to check that if we vectorize, we place gather outside of the loop. Note that there is slp-threshold is there exactly for reason to force slp vectorizer to go further. But after this patch this test no longer serves its purpose. Does that sound like enough justification now?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Feel free to prepare the patch
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll prepare it. Thanks.
If the operands of the potentially alternate node are going to produce
buildvector sequences, which result in more instructions, than the
original code, then suhinstructions should be vectorized as alternate
node, better to end up with the buildvector node.
Left column - experimental, Right - reference.
Metric: size..text
Program size..text
results results0 diff
test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test 413680.00 416272.00 0.6%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12351788.00 12354844.00 0.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 664901.00 664949.00 0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 664901.00 664949.00 0.0%
test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 1171371.00 1171355.00 -0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1036396.00 1036284.00 -0.0%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg.test 111280.00 111248.00 -0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1392113.00 1391361.00 -0.1%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1392113.00 1391361.00 -0.1%
test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test 281676.00 281452.00 -0.1%
test-suite :: MultiSource/Benchmarks/VersaBench/ecbdes/ecbdes.test 3025.00 3019.00 -0.2%
test-suite :: MultiSource/Benchmarks/Prolangs-C/plot2fig/plot2fig.test 6351.00 6335.00 -0.3%
Metric: SLP.NumVectorInstructions
Program SLP.NumVectorInstructions
results results0 diff
test-suite :: MultiSource/Benchmarks/VersaBench/ecbdes/ecbdes.test 15.00 16.00 6.7%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 1703.00 1707.00 0.2%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 1703.00 1707.00 0.2%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 26241.00 26239.00 -0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 11761.00 11754.00 -0.1%
test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test 824.00 822.00 -0.2%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 5668.00 5654.00 -0.2%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 5668.00 5654.00 -0.2%
test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test 792.00 790.00 -0.3%
test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test 792.00 790.00 -0.3%
test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test 1389.00 1384.00 -0.4%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 596.00 590.00 -1.0%
test-suite :: MultiSource/Benchmarks/Prolangs-C/plot2fig/plot2fig.test 6.00 5.00 -16.7%
Metric: exec_time
Program exec_time
results results0 diff
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 99.14 100.00 0.9%
Other changes are not significant (less than 0.1% percent with exectime
less 5 secs).
SingleSource/Benchmarks/Adobe-C++/loop_unroll - same small patterns
remain scalar, smaller code.
External/SPEC/CFP2017rate/526.blender_r/526.blender_r - many small
changes, some extra stores gets vectorized.
External/SPEC/CINT2017speed/625.x264_s/625.x264_s
External/SPEC/CINT2017rate/525.x264_r/525.x264_r
x264 has one change in a loop body, in function ssim_end4, some code
remain scalar, resulting in less code size.
External/SPEC/CFP2017rate/511.povray_r/511.povray_r - some extra code
gets vectorized, looks like some other patterns were matched.
MultiSource/Benchmarks/7zip/7zip-benchmark - extra stores were
vectorized (looks like the graphs become profitable)
MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg - small
changes in vectorized code (some small part remain scalar).
External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r
External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s
Many changes cause by the fact that the code of one function becomes
smaller (onvertLCHabToRGB) and this functions gets inlined after that.
MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc - some small
changes here and there, some extra code is vectorized, some remain
scalar (2 x vectors)
MultiSource/Benchmarks/VersaBench/ecbdes/ecbdes - emits 2 scalars
total 5 insts)
MultiSource/Benchmarks/Prolangs-C/plot2fig/plot2fig - small graph
becomes profitable and gets vectorized.
External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r
External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s
Some small graph becomes profitable and gets vectorized.
MultiSource/Benchmarks/FreeBench/pifft/pifft - no changes in final code.