Skip to content

Commit 335b5b8

Browse files
committed
[SLP] no need to generate extract for in-tree uses for original scalar instruction.
Before 77a609b, we always skipped in-tree uses of the vectorized scalars in `buildExternalUses()`. That commit handles the case where the in-tree use is a scalar operand in a vectorized instruction, for which we need to generate an extract. In-tree uses that remain scalar in vectorized instructions fall into 3 cases: - The pointer operand of a vectorized LoadInst uses an in-tree scalar - The pointer operand of a vectorized StoreInst uses an in-tree scalar - The scalar argument of a vector-form intrinsic uses an in-tree scalar Generating extracts for in-tree uses for vectorized instructions is implemented in `BoUpSLP::vectorizeTree()`: - https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11497-L11506 - https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11542-L11551 - https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11657-L11667 However, 77a609b not only generates extracts for vectorized instructions, but also generates extracts for original scalar instructions. There is no need to generate extracts for original scalar instructions, as these scalar instructions will be replaced by vector instructions and get erased later. This patch replaces extracts for original scalar instructions with corresponding vectorized instructions, and removes the extracts generated at: - https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11497-L11506 - https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11542-L11551 - https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11657-L11667
1 parent 66922a5 commit 335b5b8

File tree

3 files changed

+57
-80
lines changed

3 files changed

+57
-80
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 31 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -4925,36 +4925,34 @@ void BoUpSLP::buildExternalUses(
49254925
LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
49264926

49274927
Instruction *UserInst = dyn_cast<Instruction>(U);
4928-
if (!UserInst)
4928+
if (!UserInst || isDeleted(UserInst))
49294929
continue;
49304930

4931-
if (isDeleted(UserInst))
4931+
// Ignore users in the user ignore list.
4932+
if (UserIgnoreList && UserIgnoreList->contains(UserInst))
49324933
continue;
49334934

49344935
// Skip in-tree scalars that become vectors
49354936
if (TreeEntry *UseEntry = getTreeEntry(U)) {
4936-
Value *UseScalar = UseEntry->Scalars[0];
49374937
// Some in-tree scalars will remain as scalar in vectorized
4938-
// instructions. If that is the case, the one in Lane 0 will
4938+
// instructions. If that is the case, the one in FoundLane will
49394939
// be used.
4940-
if (UseScalar != U ||
4941-
UseEntry->State == TreeEntry::ScatterVectorize ||
4940+
if (UseEntry->State == TreeEntry::ScatterVectorize ||
49424941
UseEntry->State == TreeEntry::PossibleStridedVectorize ||
4943-
!doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
4942+
!doesInTreeUserNeedToExtract(
4943+
Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
49444944
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
49454945
<< ".\n");
49464946
assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
49474947
continue;
49484948
}
4949+
U = nullptr;
49494950
}
49504951

4951-
// Ignore users in the user ignore list.
4952-
if (UserIgnoreList && UserIgnoreList->contains(UserInst))
4953-
continue;
4954-
4955-
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
4956-
<< Lane << " from " << *Scalar << ".\n");
4957-
ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
4952+
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
4953+
<< " from lane " << Lane << " from " << *Scalar
4954+
<< ".\n");
4955+
ExternalUses.emplace_back(Scalar, U, FoundLane);
49584956
}
49594957
}
49604958
}
@@ -11493,17 +11491,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
1149311491
Value *PO = LI->getPointerOperand();
1149411492
if (E->State == TreeEntry::Vectorize) {
1149511493
NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
11496-
11497-
// The pointer operand uses an in-tree scalar so we add the new
11498-
// LoadInst to ExternalUses list to make sure that an extract will
11499-
// be generated in the future.
11500-
if (isa<Instruction>(PO)) {
11501-
if (TreeEntry *Entry = getTreeEntry(PO)) {
11502-
// Find which lane we need to extract.
11503-
unsigned FoundLane = Entry->findLaneForValue(PO);
11504-
ExternalUses.emplace_back(PO, NewLI, FoundLane);
11505-
}
11506-
}
1150711494
} else {
1150811495
assert((E->State == TreeEntry::ScatterVectorize ||
1150911496
E->State == TreeEntry::PossibleStridedVectorize) &&
@@ -11539,17 +11526,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
1153911526
StoreInst *ST =
1154011527
Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
1154111528

11542-
// The pointer operand uses an in-tree scalar, so add the new StoreInst to
11543-
// ExternalUses to make sure that an extract will be generated in the
11544-
// future.
11545-
if (isa<Instruction>(Ptr)) {
11546-
if (TreeEntry *Entry = getTreeEntry(Ptr)) {
11547-
// Find which lane we need to extract.
11548-
unsigned FoundLane = Entry->findLaneForValue(Ptr);
11549-
ExternalUses.push_back(ExternalUser(Ptr, ST, FoundLane));
11550-
}
11551-
}
11552-
1155311529
Value *V = propagateMetadata(ST, E->Scalars);
1155411530

1155511531
E->VectorizedValue = V;
@@ -11654,18 +11630,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
1165411630
CI->getOperandBundlesAsDefs(OpBundles);
1165511631
Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
1165611632

11657-
// The scalar argument uses an in-tree scalar so we add the new vectorized
11658-
// call to ExternalUses list to make sure that an extract will be
11659-
// generated in the future.
11660-
if (isa_and_present<Instruction>(ScalarArg)) {
11661-
if (TreeEntry *Entry = getTreeEntry(ScalarArg)) {
11662-
// Find which lane we need to extract.
11663-
unsigned FoundLane = Entry->findLaneForValue(ScalarArg);
11664-
ExternalUses.push_back(
11665-
ExternalUser(ScalarArg, cast<User>(V), FoundLane));
11666-
}
11667-
}
11668-
1166911633
propagateIRFlags(V, E->Scalars, VL0);
1167011634
V = FinalShuffle(V, E, VecTy, IsSigned);
1167111635

@@ -11877,6 +11841,7 @@ Value *BoUpSLP::vectorizeTree(
1187711841
DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
1187811842
SmallDenseSet<Value *, 4> UsedInserts;
1187911843
DenseMap<Value *, Value *> VectorCasts;
11844+
SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
1188011845
// Extract all of the elements with the external uses.
1188111846
for (const auto &ExternalUse : ExternalUses) {
1188211847
Value *Scalar = ExternalUse.Scalar;
@@ -11947,13 +11912,25 @@ Value *BoUpSLP::vectorizeTree(
1194711912
VectorToInsertElement.try_emplace(Vec, IE);
1194811913
return Vec;
1194911914
};
11950-
// If User == nullptr, the Scalar is used as extra arg. Generate
11951-
// ExtractElement instruction and update the record for this scalar in
11952-
// ExternallyUsedValues.
11915+
// If User == nullptr, the Scalar remains as scalar in vectorized
11916+
// instructions or is used as extra arg. Generate ExtractElement instruction
11917+
// and update the record for this scalar in ExternallyUsedValues.
1195311918
if (!User) {
11954-
assert(ExternallyUsedValues.count(Scalar) &&
11955-
"Scalar with nullptr as an external user must be registered in "
11956-
"ExternallyUsedValues map");
11919+
if (!ScalarsWithNullptrUser.insert(Scalar).second)
11920+
continue;
11921+
assert((ExternallyUsedValues.count(Scalar) ||
11922+
any_of(Scalar->users(),
11923+
[this, Scalar](llvm::User *U) {
11924+
TreeEntry *UseEntry = getTreeEntry(U);
11925+
return UseEntry &&
11926+
doesInTreeUserNeedToExtract(
11927+
Scalar,
11928+
cast<Instruction>(UseEntry->Scalars.front()),
11929+
TLI);
11930+
})) &&
11931+
"Scalar with nullptr User must be registered in "
11932+
"ExternallyUsedValues map or remain as scalar in vectorized "
11933+
"instructions");
1195711934
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
1195811935
if (auto *PHI = dyn_cast<PHINode>(VecI))
1195911936
Builder.SetInsertPoint(PHI->getParent(),

llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ define i32 @fn1() {
1111
; CHECK-NEXT: entry:
1212
; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @a, align 8
1313
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
14-
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
15-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, <2 x ptr> [[SHUFFLE]], <2 x i64> <i64 11, i64 56>
16-
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint <2 x ptr> [[TMP2]] to <2 x i64>
17-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 0
18-
; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
14+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
15+
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 11, i64 56>
16+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
17+
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
18+
; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP4]], align 8
1919
; CHECK-NEXT: ret i32 undef
2020
;
2121
entry:
@@ -34,13 +34,13 @@ declare float @llvm.powi.f32.i32(float, i32)
3434
define void @fn2(ptr %a, ptr %b, ptr %c) {
3535
; CHECK-LABEL: @fn2(
3636
; CHECK-NEXT: entry:
37-
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
38-
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
39-
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
40-
; CHECK-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float>
41-
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
42-
; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP5]], i32 [[TMP6]])
43-
; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[C:%.*]], align 4
37+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
38+
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
39+
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP0]], [[TMP1]]
40+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
41+
; CHECK-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
42+
; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP4]], i32 [[TMP3]])
43+
; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4
4444
; CHECK-NEXT: ret void
4545
;
4646
entry:
@@ -90,12 +90,12 @@ define void @externally_used_ptrs() {
9090
; CHECK-NEXT: entry:
9191
; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @a, align 8
9292
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
93-
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
94-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, <2 x ptr> [[SHUFFLE]], <2 x i64> <i64 56, i64 11>
95-
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint <2 x ptr> [[TMP2]] to <2 x i64>
96-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 1
93+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
94+
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 56, i64 11>
95+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1
96+
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
9797
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
98-
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]]
98+
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP5]], [[TMP6]]
9999
; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP4]], align 8
100100
; CHECK-NEXT: ret void
101101
;

llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,15 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 {
99
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0
1010
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
1111
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> <i64 8, i64 12, i64 28, i64 24>
12-
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
13-
; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
14-
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
15-
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP6]], align 4
16-
; CHECK-NEXT: [[TMP10:%.*]] = fmul <8 x float> [[TMP8]], [[TMP9]]
17-
; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x float> [[TMP10]], zeroinitializer
18-
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> poison, <8 x i32> <i32 0, i32 5, i32 2, i32 7, i32 4, i32 1, i32 6, i32 3>
19-
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr addrspace(1)> [[TMP5]], i32 0
20-
; CHECK-NEXT: store <8 x float> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
12+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr addrspace(1)> [[TMP5]], i32 0
13+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
14+
; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
15+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
16+
; CHECK-NEXT: [[TMP10:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP7]], align 4
17+
; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[TMP9]], [[TMP10]]
18+
; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x float> [[TMP11]], zeroinitializer
19+
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> <i32 0, i32 5, i32 2, i32 7, i32 4, i32 1, i32 6, i32 3>
20+
; CHECK-NEXT: store <8 x float> [[TMP13]], ptr addrspace(1) [[TMP6]], align 4
2121
; CHECK-NEXT: ret void
2222
;
2323
%3 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8

0 commit comments

Comments
 (0)