-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SLP]Try to vectorize small graph with extractelements, used in buildvector. #83581
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Try to vectorize small graph with extractelements, used in buildvector. #83581
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) ChangesIf the graph incudes only single "gather" node with only Patch is 78.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83581.diff 6 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3e75b5ed485770..f49ca17a67cb40 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8985,6 +8985,20 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
if (isFullyVectorizableTinyTree(ForReduction))
return false;
+ // Check if any of the gather node forms an insertelement buildvector
+ // somewhere.
+ if (any_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->State == TreeEntry::NeedToGather &&
+ all_of(TE->Scalars, [](Value *V) {
+ return isa<ExtractElementInst, UndefValue>(V) ||
+ (!V->hasNUsesOrMore(UsesLimit) &&
+ any_of(V->users(), [](User *U) {
+ return isa<InsertElementInst>(U);
+ }));
+ });
+ }))
+ return false;
+
assert(VectorizableTree.empty()
? ExternalUses.empty()
: true && "We shouldn't have any external users");
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 97008ac6785961..ed73f7b134465a 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -1,375 +1,362 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-40 | FileCheck %s
+; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-20 | FileCheck %s
define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) {
; CHECK-LABEL: define i32 @test(
; CHECK-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX1]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x ptr> [[TMP3]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <2 x ptr> [[TMP4]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP4]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, <2 x ptr> [[TMP4]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr i8, ptr [[PIX1]], i64 2
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x ptr> [[TMP4]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i64> <i64 5, i64 7>
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 5, i64 7>
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 2
+; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
+; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP10]] to i32
; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]]
; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ADD_PTR3]], align 1
-; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP9]] to i32
-; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ADD_PTR644]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ADD_PTR3]], align 1
+; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP11]] to i32
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ADD_PTR644]], align 1
; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1
; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1
; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
-; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP12]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
+; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP14]] to i32
; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
+; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
+; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32>
+; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
+; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP26:%.*]] = add <2 x i32> [[TMP25]], [[TMP19]]
; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1
; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
-; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i8> poison, i8 [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[TMP15]], i32 1
-; CHECK-NEXT: [[TMP19:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32>
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i8> poison, i8 [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i8> [[TMP20]], i8 [[TMP16]], i32 1
-; CHECK-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32>
-; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP19]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR_1]], i32 0
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x ptr> [[TMP24]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, <2 x ptr> [[TMP25]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP27:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP26]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6
+; CHECK-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6
+; CHECK-NEXT: [[TMP27:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
; CHECK-NEXT: [[TMP28:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR64_1]], i32 0
-; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x ptr> [[TMP29]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, <2 x ptr> [[TMP30]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT: [[TMP32:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP31]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP29:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1
+; CHECK-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
+; CHECK-NEXT: [[TMP31:%.*]] = sub <2 x i32> [[TMP28]], [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1
; CHECK-NEXT: [[TMP33:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; CHECK-NEXT: [[TMP34:%.*]] = sub <2 x i32> [[TMP28]], [[TMP33]]
-; CHECK-NEXT: [[TMP35:%.*]] = shl <2 x i32> [[TMP34]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i32> [[TMP35]], [[TMP23]]
-; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i8, <2 x ptr> [[TMP25]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT: [[TMP38:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP37]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
-; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i8, <2 x ptr> [[TMP30]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP40]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP42:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT: [[TMP43:%.*]] = sub <2 x i32> [[TMP39]], [[TMP42]]
-; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i8, <2 x ptr> [[TMP25]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP45:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP44]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP46:%.*]] = zext <2 x i8> [[TMP45]] to <2 x i32>
-; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i8, <2 x ptr> [[TMP30]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT: [[TMP48:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP47]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP48]] to <2 x i32>
-; CHECK-NEXT: [[TMP50:%.*]] = sub <2 x i32> [[TMP46]], [[TMP49]]
-; CHECK-NEXT: [[TMP51:%.*]] = shl <2 x i32> [[TMP50]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP52:%.*]] = add <2 x i32> [[TMP51]], [[TMP43]]
-; CHECK-NEXT: [[TMP53:%.*]] = sub <2 x i32> [[TMP36]], [[TMP52]]
-; CHECK-NEXT: [[TMP54:%.*]] = extractelement <2 x i32> [[TMP53]], i32 0
-; CHECK-NEXT: [[TMP55:%.*]] = extractelement <2 x i32> [[TMP53]], i32 1
-; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP54]], [[TMP55]]
-; CHECK-NEXT: [[TMP56:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[TMP34:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1
+; CHECK-NEXT: [[TMP35:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32>
+; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP33]], [[TMP35]]
+; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP38:%.*]] = add <2 x i32> [[TMP37]], [[TMP31]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1
+; CHECK-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP40]], [[TMP39]]
+; CHECK-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP39]], [[TMP40]]
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
+; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
+; CHECK-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP42]], [[TMP41]]
+; CHECK-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP41]], [[TMP42]]
+; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]]
+; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1
; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
-; CHECK-NEXT: [[TMP57:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT: [[TMP58:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX20_3]], i32 0
-; CHECK-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP58]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT: [[TMP45:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX20_3]], i32 0
+; CHECK-NEXT: [[TMP46:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP45]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP47:%.*]] = zext <2 x i8> [[TMP46]] to <2 x i32>
+; CHECK-NEXT: [[TMP48:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
+; CHECK-NEXT: [[TMP49:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP48]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
+; CHECK-NEXT: [[TMP51:%.*]] = sub <2 x i32> [[TMP47]], [[TMP50]]
+; CHECK-NEXT: [[TMP52:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
+; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 6, i64 4>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP53]], [[TMP55]]
+; CHECK-NEXT: [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], <i32 16, i32 16>
+; CHECK-NEXT: [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP51]]
+; CHECK-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
; CHECK-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32>
-; CHECK-NEXT: [[TMP61:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
-; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP61]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
-; CHECK-NEXT: [[TMP64:%.*]] = sub <2 x i32> [[TMP60]], [[TMP63]]
-; CHECK-NEXT: [[TMP65:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
+; CHECK-NEXT: [[TMP63:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
+; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
+; CHECK-NEXT: [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
; CHECK-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 6, i64 4>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 7, i64 5>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
; CHECK-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]]
-; CHECK-NEXT: [[TMP72:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP73:%.*]] = zext <2 x i8> [[TMP72]] to <2 x i32>
-; CHECK-NEXT: [[TMP74:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32>
-; CHECK-NEXT: [[TMP76:%.*]] = sub <2 x i32> [[TMP73]], [[TMP75]]
-; CHECK-NEXT: [[TMP77:%.*]] = insertelement <2 x i8> poison, i8 [[TMP57]], i32 0
-; CHECK-NEXT: [[TMP78:%.*]] = insertelement <2 x i8> [[TMP77]], i8 [[TMP56]], i32 1
-; CHECK-NEXT: [[TMP79:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
-; CHECK-NEXT: [[TMP80:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 7, i64 5>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT: [[TMP81:%.*]] = zext <2 x i8> [[TMP80]] to <2 x i32>
-; CHECK-NEXT: [[TMP82:%.*]] = sub <2 x i32> [[TMP79]], [[TMP81]]
-; CHECK-NEXT: [[TMP83:%.*]] = shl <2 x i32> [[TMP82]], <i32 16, i32 16>
-; CHECK-NEXT: [[TMP84:%.*]] = add <2 x i32> [[TMP83]], [[TMP76]]
-; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP71]], [[TMP84]]
-; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <2 x i32> [[TMP84]], <2 x i32> [[TMP52]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP87:%.*]] = shufflevector <2 x i32> [[TMP71]], <2 x i32> [[TMP36]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP86]], [[TMP87]]
-; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <2 x i32> [[TMP84]], <2 x i32> [[TMP52]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP90:%.*]] = shufflevector <2 x i32> [[TMP71]], <2 x i32> [[TMP36]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP91:%.*]] = add <2 x i32> [[TMP89]], [[TMP90]]
-; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP91]], [[TMP88]]
-; CHECK-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP88]], [[TMP91]]
-; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0
-; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
-; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP95]], [[TMP94]]
-; CHECK-NEXT: [[TMP96:%.*]] = extractelement <2 x i32> [[TMP92]], i32 0
-; CHECK-NEXT: [[TMP97:%.*]] = extractelement <2 x i32> [[TMP92]], i32 1
-; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP96]], [[TMP97]]
-; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[TMP97]], [[TMP96]]
-; CHECK-NEXT: [[TMP98:%.*]] = extractelement <2 x i32> [[TMP60]], i32 1
-; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP98]], 15
+; CHECK-NEXT: [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP63]]
+; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
+; CHECK-NEXT: [[TMP73:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
+; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP74]], [[TMP75]]
+; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
+; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
+; CHECK-NEXT: [[TMP76:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
+; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP76]], 15
; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
; CHECK-NEXT: [[MUL_I:%.*]] = mul...
[truncated]
|
; AVX512-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) | ||
; AVX512-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]] | ||
; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] | ||
; AVX512-NEXT: ret i32 [[OP_RDX1]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
regression? AVX512 can more naturally handle 512-bit vectors than AVX2 -
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No. Actually there is no difference between AVX2/AVX512. AVX2 just emits very long vectors, but they will be split into 256 bit vectors. So, when the cost model calculates the costs for AVX2, it operates with 512 vectors, but knows, that they will be split into 2 vectors. And uses this knowledge.
The problem with this test that there is a permutation of 4 vectors. AVX2 sees, that it permutes 2 vectors in 2 different 256bit vector registers, while AVX512 sees that it tries to permute 4 vector register in a single 512bit register. But SLP does not handle more than 2 vector registers permutation yet.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
If the graph incudes only single "gather" node with only
extractelements/undefs, which used only in insertelement-based
buildvector sequences, it still might be profitable to vectorize it.
Need to rely on the cost model, not throw this graph away immediately.