
Commit a51c2f3

[SLP] no need to generate extract for in-tree uses for original scalar instruction. (#76077)

Before 77a609b, we always skipped in-tree uses of the vectorized scalars in `buildExternalUses()`; that commit added handling for the case where an in-tree use remains a scalar operand of a vectorized instruction, so an extract has to be generated for it. An in-tree use can remain scalar in a vectorized instruction in three cases:

- The pointer operand of a vectorized LoadInst uses an in-tree scalar
- The pointer operand of a vectorized StoreInst uses an in-tree scalar
- The scalar argument of a vector-form intrinsic uses an in-tree scalar

Generating extracts for in-tree uses in vectorized instructions is implemented in `BoUpSLP::vectorizeTree()`:

- https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11497-L11506
- https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11542-L11551
- https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11657-L11667

However, 77a609b generated extracts not only for the vectorized instructions but also for the original scalar instructions. There is no need to generate extracts for the original scalar instructions, since they are replaced by vector instructions and erased later.

This patch records no exact user (a nullptr user) for in-tree scalars that remain scalar in vectorized instructions when building external uses; in that case, all uses of such a scalar are automatically replaced by an extractelement. It also removes the extracts generated at the three locations listed above.
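To make the StoreInst pointer-operand case concrete, here is a minimal sketch of the kind of input involved. This is hypothetical IR written to match the shape of fn1 in the updated extract_in_tree_user.ll checks below; the names and offsets are illustrative, not copied from the test. The two getelementptrs are vectorized into a single <2 x ptr> GEP, yet the first one is also needed, as a scalar, as the pointer operand of the store, so the vectorizer must extract lane 0 from the vector GEP for the vectorized store. The original scalar store is erased after vectorization, which is why additionally recording an extract for it (as 77a609b did) was redundant.

@a = common global ptr null, align 8

define void @ptr_operand_in_tree() {
entry:
  %base = load ptr, ptr @a, align 8
  ; %gep11 and %gep56 form the vectorizable <2 x ptr> GEP tree entry.
  %gep11 = getelementptr inbounds i64, ptr %base, i64 11
  %i0 = ptrtoint ptr %gep11 to i64
  ; %gep11 is also the scalar pointer operand of this store, i.e. an
  ; in-tree scalar use that needs an extractelement after vectorization.
  store i64 %i0, ptr %gep11, align 8
  %gep56 = getelementptr inbounds i64, ptr %base, i64 56
  %i1 = ptrtoint ptr %gep56 to i64
  %gep12 = getelementptr inbounds i64, ptr %base, i64 12
  store i64 %i1, ptr %gep12, align 8
  ret void
}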
1 parent 3ddf368 commit a51c2f3

File tree: 3 files changed (+59, -80 lines)


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 33 additions & 54 deletions
@@ -4925,36 +4925,34 @@ void BoUpSLP::buildExternalUses(
         LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
 
         Instruction *UserInst = dyn_cast<Instruction>(U);
-        if (!UserInst)
+        if (!UserInst || isDeleted(UserInst))
           continue;
 
-        if (isDeleted(UserInst))
+        // Ignore users in the user ignore list.
+        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
           continue;
 
         // Skip in-tree scalars that become vectors
         if (TreeEntry *UseEntry = getTreeEntry(U)) {
-          Value *UseScalar = UseEntry->Scalars[0];
           // Some in-tree scalars will remain as scalar in vectorized
-          // instructions. If that is the case, the one in Lane 0 will
+          // instructions. If that is the case, the one in FoundLane will
           // be used.
-          if (UseScalar != U ||
-              UseEntry->State == TreeEntry::ScatterVectorize ||
+          if (UseEntry->State == TreeEntry::ScatterVectorize ||
               UseEntry->State == TreeEntry::PossibleStridedVectorize ||
-              !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+              !doesInTreeUserNeedToExtract(
+                  Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
             LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                               << ".\n");
             assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
             continue;
           }
+          U = nullptr;
         }
 
-        // Ignore users in the user ignore list.
-        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
-          continue;
-
-        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
-                          << Lane << " from " << *Scalar << ".\n");
-        ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
+        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
+                          << " from lane " << Lane << " from " << *Scalar
+                          << ".\n");
+        ExternalUses.emplace_back(Scalar, U, FoundLane);
       }
     }
   }
@@ -11516,17 +11514,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Value *PO = LI->getPointerOperand();
       if (E->State == TreeEntry::Vectorize) {
         NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
-
-        // The pointer operand uses an in-tree scalar so we add the new
-        // LoadInst to ExternalUses list to make sure that an extract will
-        // be generated in the future.
-        if (isa<Instruction>(PO)) {
-          if (TreeEntry *Entry = getTreeEntry(PO)) {
-            // Find which lane we need to extract.
-            unsigned FoundLane = Entry->findLaneForValue(PO);
-            ExternalUses.emplace_back(PO, NewLI, FoundLane);
-          }
-        }
       } else {
         assert((E->State == TreeEntry::ScatterVectorize ||
                 E->State == TreeEntry::PossibleStridedVectorize) &&
@@ -11562,17 +11549,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       StoreInst *ST =
           Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
 
-      // The pointer operand uses an in-tree scalar, so add the new StoreInst to
-      // ExternalUses to make sure that an extract will be generated in the
-      // future.
-      if (isa<Instruction>(Ptr)) {
-        if (TreeEntry *Entry = getTreeEntry(Ptr)) {
-          // Find which lane we need to extract.
-          unsigned FoundLane = Entry->findLaneForValue(Ptr);
-          ExternalUses.push_back(ExternalUser(Ptr, ST, FoundLane));
-        }
-      }
-
       Value *V = propagateMetadata(ST, E->Scalars);
 
       E->VectorizedValue = V;
@@ -11677,18 +11653,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       CI->getOperandBundlesAsDefs(OpBundles);
       Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
 
-      // The scalar argument uses an in-tree scalar so we add the new vectorized
-      // call to ExternalUses list to make sure that an extract will be
-      // generated in the future.
-      if (isa_and_present<Instruction>(ScalarArg)) {
-        if (TreeEntry *Entry = getTreeEntry(ScalarArg)) {
-          // Find which lane we need to extract.
-          unsigned FoundLane = Entry->findLaneForValue(ScalarArg);
-          ExternalUses.push_back(
-              ExternalUser(ScalarArg, cast<User>(V), FoundLane));
-        }
-      }
-
       propagateIRFlags(V, E->Scalars, VL0);
       V = FinalShuffle(V, E, VecTy, IsSigned);
 
@@ -11900,6 +11864,7 @@ Value *BoUpSLP::vectorizeTree(
   DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
   SmallDenseSet<Value *, 4> UsedInserts;
   DenseMap<Value *, Value *> VectorCasts;
+  SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
   // Extract all of the elements with the external uses.
   for (const auto &ExternalUse : ExternalUses) {
     Value *Scalar = ExternalUse.Scalar;
@@ -11970,13 +11935,27 @@ Value *BoUpSLP::vectorizeTree(
       VectorToInsertElement.try_emplace(Vec, IE);
       return Vec;
     };
-    // If User == nullptr, the Scalar is used as extra arg. Generate
-    // ExtractElement instruction and update the record for this scalar in
-    // ExternallyUsedValues.
+    // If User == nullptr, the Scalar remains as scalar in vectorized
+    // instructions or is used as extra arg. Generate ExtractElement instruction
+    // and update the record for this scalar in ExternallyUsedValues.
     if (!User) {
-      assert(ExternallyUsedValues.count(Scalar) &&
-             "Scalar with nullptr as an external user must be registered in "
-             "ExternallyUsedValues map");
+      if (!ScalarsWithNullptrUser.insert(Scalar).second)
+        continue;
+      assert((ExternallyUsedValues.count(Scalar) ||
+              any_of(Scalar->users(),
+                     [&](llvm::User *U) {
+                       TreeEntry *UseEntry = getTreeEntry(U);
+                       return UseEntry &&
+                              UseEntry->State == TreeEntry::Vectorize &&
+                              E->State == TreeEntry::Vectorize &&
+                              doesInTreeUserNeedToExtract(
+                                  Scalar,
+                                  cast<Instruction>(UseEntry->Scalars.front()),
+                                  TLI);
+                     })) &&
+             "Scalar with nullptr User must be registered in "
+             "ExternallyUsedValues map or remain as scalar in vectorized "
+             "instructions");
       if (auto *VecI = dyn_cast<Instruction>(Vec)) {
         if (auto *PHI = dyn_cast<PHINode>(VecI))
           Builder.SetInsertPoint(PHI->getParent(),

llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll

Lines changed: 17 additions & 17 deletions
@@ -11,11 +11,11 @@ define i32 @fn1() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr @a, align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, <2 x ptr> [[SHUFFLE]], <2 x i64> <i64 11, i64 56>
-; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint <2 x ptr> [[TMP2]] to <2 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 0
-; CHECK-NEXT:    store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 11, i64 56>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[TMP4]], align 8
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:
@@ -34,13 +34,13 @@ declare float @llvm.powi.f32.i32(float, i32)
 define void @fn2(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @fn2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP5]], i32 [[TMP6]])
-; CHECK-NEXT:    store <4 x float> [[TMP7]], ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP4]], i32 [[TMP3]])
+; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -90,12 +90,12 @@ define void @externally_used_ptrs() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr @a, align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, <2 x ptr> [[SHUFFLE]], <2 x i64> <i64 56, i64 11>
-; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint <2 x ptr> [[TMP2]] to <2 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 56, i64 11>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[TMP4]], align 8
 ; CHECK-NEXT:    ret void
 ;

llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll

Lines changed: 9 additions & 9 deletions
@@ -9,15 +9,15 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> <i64 8, i64 12, i64 28, i64 24>
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
-; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
-; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <8 x float> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <8 x float> [[TMP10]], zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> poison, <8 x i32> <i32 0, i32 5, i32 2, i32 7, i32 4, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x ptr addrspace(1)> [[TMP5]], i32 0
-; CHECK-NEXT:    store <8 x float> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x ptr addrspace(1)> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
+; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <8 x float> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <8 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> <i32 0, i32 5, i32 2, i32 7, i32 4, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    store <8 x float> [[TMP13]], ptr addrspace(1) [[TMP6]], align 4
 ; CHECK-NEXT:    ret void
 ;
 %3 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8
