
Commit f8c9f75

[SLP] No need to generate extracts for in-tree uses by the original scalar instructions.
Before 77a609b, `buildExternalUses()` always skipped in-tree uses of the vectorized scalars. That commit handled the case where an in-tree use is a scalar operand of a vectorized instruction, for which an extract must be generated. An in-tree use can remain scalar in a vectorized instruction in three cases:

- the pointer operand of a vectorized LoadInst uses an in-tree scalar
- the pointer operand of a vectorized StoreInst uses an in-tree scalar
- a scalar argument of a vector-form intrinsic uses an in-tree scalar

Generating extracts for in-tree uses by vectorized instructions is already implemented in `BoUpSLP::vectorizeTree()`:

- https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11497-L11506
- https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11542-L11551
- https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp#L11657-L11667

However, 77a609b generated extracts not only for the vectorized instructions but also for the original scalar instructions. The latter are unnecessary: the original scalar instructions are replaced by their vector counterparts and erased later. These extracts were generated because, in `BoUpSLP::buildExternalUses()`, whenever `doesInTreeUserNeedToExtract()` returned true, the pair <in-tree scalar, scalar instruction using the in-tree scalar> was pushed to `ExternalUses`.

To stop generating extracts for the original scalar instructions, this patch removes the `doesInTreeUserNeedToExtract()` check, folding the following condition to always true:

```
if (UseScalar != U ||
    UseEntry->State == TreeEntry::ScatterVectorize ||
    UseEntry->State == TreeEntry::PossibleStridedVectorize ||
    !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI))
```

With this change, vectorization also becomes more likely to be profitable, since the unneeded `ExternalUses` entries are gone and the estimated extraction cost drops. For example, llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll is updated because its `test2()` function is now successfully vectorized.
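For intuition, here is a hand-written IR sketch of the first case, mirroring the updated `test2()` CHECK lines in the diff below. The function and value names are illustrative, not taken from the patch:

```llvm
; Illustrative sketch: the two scalar GEPs form one vectorized tree
; entry, and lane 0 of that entry is also needed as the *scalar*
; pointer operand of the vectorized load.
define <2 x i64> @sketch(ptr %a, ptr %b) {
  %vp0 = insertelement <2 x ptr> poison, ptr %a, i32 0
  %vp1 = insertelement <2 x ptr> %vp0, ptr %b, i32 1
  %vec.gep = getelementptr i64, <2 x ptr> %vp1, <2 x i64> <i64 1, i64 3>
  ; Extract generated for the vectorized user (the load below); this
  ; is the extract that vectorizeTree() already takes care of.
  %p0 = extractelement <2 x ptr> %vec.gep, i32 0
  %v = load <2 x i64>, ptr %p0, align 8
  ; The original scalar GEPs and loads are erased after vectorization,
  ; so no extract is needed on their behalf.
  ret <2 x i64> %v
}
```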
1 parent: 2c257cf

File tree: 2 files changed, +12 −51 lines

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 4 additions & 40 deletions
```diff
@@ -771,33 +771,6 @@ static bool allSameType(ArrayRef<Value *> VL) {
   return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
 }
 
-/// \returns True if in-tree use also needs extract. This refers to
-/// possible scalar operand in vectorized instruction.
-static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
-                                        TargetLibraryInfo *TLI) {
-  unsigned Opcode = UserInst->getOpcode();
-  switch (Opcode) {
-  case Instruction::Load: {
-    LoadInst *LI = cast<LoadInst>(UserInst);
-    return (LI->getPointerOperand() == Scalar);
-  }
-  case Instruction::Store: {
-    StoreInst *SI = cast<StoreInst>(UserInst);
-    return (SI->getPointerOperand() == Scalar);
-  }
-  case Instruction::Call: {
-    CallInst *CI = cast<CallInst>(UserInst);
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
-      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
-             Arg.value().get() == Scalar;
-    });
-  }
-  default:
-    return false;
-  }
-}
-
 /// \returns the AA location that is being access by the instruction.
 static MemoryLocation getLocation(Instruction *I) {
   if (StoreInst *SI = dyn_cast<StoreInst>(I))
@@ -4933,19 +4906,10 @@ void BoUpSLP::buildExternalUses(
 
       // Skip in-tree scalars that become vectors
      if (TreeEntry *UseEntry = getTreeEntry(U)) {
-        Value *UseScalar = UseEntry->Scalars[0];
-        // Some in-tree scalars will remain as scalar in vectorized
-        // instructions. If that is the case, the one in Lane 0 will
-        // be used.
-        if (UseScalar != U ||
-            UseEntry->State == TreeEntry::ScatterVectorize ||
-            UseEntry->State == TreeEntry::PossibleStridedVectorize ||
-            !doesInTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
-          LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
-                            << ".\n");
-          assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
-          continue;
-        }
+        LLVM_DEBUG(dbgs()
+                   << "SLP: \tInternal user will be removed:" << *U << ".\n");
+        assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
+        continue;
       }
 
       // Ignore users in the user ignore list.
```
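To see why the removed branch was redundant, consider a hypothetical mid-rewrite snapshot (hand-written, assuming a tree shaped like `test2` below): the old `ExternalUses` entry for the original scalar user materializes an extract whose only user is the scalar load, and that load is erased once the vector load replaces it:

```llvm
; Hypothetical snapshot under the old check; all names are illustrative.
define void @old_behavior_sketch(ptr %a, ptr %b) {
  %vp0 = insertelement <2 x ptr> poison, ptr %a, i32 0
  %vp1 = insertelement <2 x ptr> %vp0, ptr %b, i32 1
  %vec.gep = getelementptr i64, <2 x ptr> %vp1, <2 x i64> <i64 1, i64 3>
  ; Extract requested on behalf of the *original* scalar load below.
  %dead = extractelement <2 x ptr> %vec.gep, i32 0
  ; The scalar load is replaced by the vector load and erased, leaving
  ; %dead with no users, even though its extraction cost was already
  ; charged against the vectorization tree.
  %old = load i64, ptr %dead, align 8
  ret void
}
```

Dropping these entries removes exactly that extraction cost from the profitability estimate, which is why `test2()` in the test below now vectorizes.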

llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll

Lines changed: 8 additions & 11 deletions
```diff
@@ -52,17 +52,14 @@ define void @test(ptr %r, ptr %p, ptr %q) #0 {
 
 define void @test2(i64* %a, i64* %b) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 1
-; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2
-; CHECK-NEXT:    [[I1:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT:    [[B3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 3
-; CHECK-NEXT:    [[I2:%.*]] = ptrtoint ptr [[B3]] to i64
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[A1]], align 8
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 8
-; CHECK-NEXT:    [[ADD1:%.*]] = add i64 [[I1]], [[V1]]
-; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[I2]], [[V2]]
-; CHECK-NEXT:    store i64 [[ADD1]], ptr [[A1]], align 8
-; CHECK-NEXT:    store i64 [[ADD2]], ptr [[A2]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    ret void
 ;
 %a1 = getelementptr inbounds i64, i64* %a, i64 1
```
