
Commit 77a609b

Author: Yi Jiang
Generate an extract for in-tree uses if the use is a scalar operand in a vectorized instruction. radar://18144665
llvm-svn: 216946
1 parent bf041d9 commit 77a609b
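
For context, here is a rough source-level reduction of the pointer-operand case this commit handles. It corresponds loosely to @fn1 in the new test below; the function and variable names are mine, not part of the commit.

// Hedged sketch, not code from the commit: the two i64 stores below form an
// SLP bundle whose stored values are the addresses p + 11 and p + 56, so the
// GEPs get vectorized as <2 x i64*>. But p + 11 is also the scalar pointer
// operand of the first store, so the vectorizer now records that use and
// extracts lane 0 back out instead of treating the in-tree user as removable.
// (p + 11 has two uses, which is presumably why the old single-use
// restriction on GEP bundles is dropped in this commit as well.)
#include <stdint.h>

int64_t *a;

int fn1(void) {
  int64_t *p = a;
  p[11] = (int64_t)(intptr_t)(p + 11); // stored value (lane 0) and store address (scalar)
  p[12] = (int64_t)(intptr_t)(p + 56);
  return 0;
}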

2 files changed, +139 -18 lines

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 69 additions & 18 deletions
@@ -342,6 +342,33 @@ static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
   }
 }
 
+/// \returns True if in-tree use also needs extract. This refers to
+/// possible scalar operand in vectorized instruction.
+static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+                                    TargetLibraryInfo *TLI) {
+
+  unsigned Opcode = UserInst->getOpcode();
+  switch (Opcode) {
+  case Instruction::Load: {
+    LoadInst *LI = cast<LoadInst>(UserInst);
+    return (LI->getPointerOperand() == Scalar);
+  }
+  case Instruction::Store: {
+    StoreInst *SI = cast<StoreInst>(UserInst);
+    return (SI->getPointerOperand() == Scalar);
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(UserInst);
+    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+    if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+      return (CI->getArgOperand(1) == Scalar);
+    }
+  }
+  default:
+    return false;
+  }
+}
+
 /// Bottom Up SLP Vectorizer.
 class BoUpSLP {
 public:
@@ -864,18 +891,27 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
       for (User *U : Scalar->users()) {
         DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
 
-        // Skip in-tree scalars that become vectors.
-        if (ScalarToTreeEntry.count(U)) {
-          DEBUG(dbgs() << "SLP: \tInternal user will be removed:" <<
-                *U << ".\n");
-          int Idx = ScalarToTreeEntry[U]; (void) Idx;
-          assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
-          continue;
-        }
         Instruction *UserInst = dyn_cast<Instruction>(U);
         if (!UserInst)
           continue;
 
+        // Skip in-tree scalars that become vectors
+        if (ScalarToTreeEntry.count(U)) {
+          int Idx = ScalarToTreeEntry[U];
+          TreeEntry *UseEntry = &VectorizableTree[Idx];
+          Value *UseScalar = UseEntry->Scalars[0];
+          // Some in-tree scalars will remain as scalar in vectorized
+          // instructions. If that is the case, the one in Lane 0 will
+          // be used.
+          if (UseScalar != U ||
+              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+                         << ".\n");
+            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
+            continue;
+          }
+        }
+
         // Ignore users in the user ignore list.
         if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
             UserIgnoreList.end())
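
The Lane 0 rule in the comment above is easiest to see with the llvm.powi case, which the new test exercises as @fn2. A rough C++ reduction follows; the function name is mine, and __builtin_powif is the clang/GCC builtin that typically lowers to llvm.powi.f32.

// Hedged sketch, not code from the commit: the four powi calls vectorize into
// one <4 x float> llvm.powi call, but its i32 exponent must stay scalar. The
// exponent e is a[0] + b[0], i.e. lane 0 of the vectorized <4 x i32> add, so
// the in-tree user cannot simply be dropped: an extract of lane 0 must be
// generated to keep feeding the scalar operand.
void fn2(const int *a, const int *b, float *c) {
  int e = a[0] + b[0];                 // lane 0 of the vectorized add
  c[0] = __builtin_powif((float)e, e); // every call reuses e as its scalar exponent
  c[1] = __builtin_powif((float)(a[1] + b[1]), e);
  c[2] = __builtin_powif((float)(a[2] + b[2]), e);
  c[3] = __builtin_powif((float)(a[3] + b[3]), e);
}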
@@ -1190,16 +1226,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         }
       }
 
-      // We combine only GEPs with a single use.
-      for (unsigned j = 0; j < VL.size(); ++j) {
-        if (cast<Instruction>(VL[j])->getNumUses() > 1) {
-          DEBUG(dbgs() << "SLP: not-vectorizable GEP (multiple uses).\n");
-          BS.cancelScheduling(VL);
-          newTreeEntry(VL, false);
-          return;
-        }
-      }
-
       // We can't combine several GEPs into one vector if they operate on
       // different types.
       Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
@@ -2023,6 +2049,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
 
       Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
                                             VecTy->getPointerTo(AS));
+
+      // The pointer operand uses an in-tree scalar so we add the new BitCast to
+      // ExternalUses list to make sure that an extract will be generated in the
+      // future.
+      if (ScalarToTreeEntry.count(LI->getPointerOperand()))
+        ExternalUses.push_back(
+            ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
+
       unsigned Alignment = LI->getAlignment();
       LI = Builder.CreateLoad(VecPtr);
       if (!Alignment)
@@ -2047,6 +2081,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
                                             VecTy->getPointerTo(AS));
       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+
+      // The pointer operand uses an in-tree scalar so we add the new BitCast to
+      // ExternalUses list to make sure that an extract will be generated in the
+      // future.
+      if (ScalarToTreeEntry.count(SI->getPointerOperand()))
+        ExternalUses.push_back(
+            ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
+
       if (!Alignment)
         Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
       S->setAlignment(Alignment);
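
Judging from the constructor calls above (a Value, a User, and a lane index of 0), ExternalUses collects small records along the lines of the sketch below. This is an assumption about the record's shape inferred from this diff, not a copy of the real struct, whose definition lives elsewhere in SLPVectorizer.cpp.

// Assumed shape only, inferred from ExternalUser(Scalar, cast<User>(VecPtr), 0)
// above; the real definition may differ in detail.
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"

struct ExternalUserSketch {
  llvm::Value *Scalar; // in-tree scalar that still has a scalar consumer
  llvm::User *User;    // that consumer: here the new BitCast of the pointer
  int Lane;            // lane to extract later; 0 here, the Lane 0 scalar
};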
@@ -2088,6 +2130,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       setInsertPointAfterBundle(E->Scalars);
       Function *FI;
       Intrinsic::ID IID = Intrinsic::not_intrinsic;
+      Value *ScalarArg = nullptr;
       if (CI && (FI = CI->getCalledFunction())) {
         IID = (Intrinsic::ID) FI->getIntrinsicID();
       }
@@ -2098,6 +2141,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         // a scalar. This argument should not be vectorized.
         if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
           CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+          ScalarArg = CEI->getArgOperand(j);
           OpVecs.push_back(CEI->getArgOperand(j));
           continue;
         }
@@ -2116,6 +2160,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
       Value *V = Builder.CreateCall(CF, OpVecs);
+
+      // The scalar argument uses an in-tree scalar so we add the new vectorized
+      // call to ExternalUses list to make sure that an extract will be
+      // generated in the future.
+      if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
+        ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+
       E->VectorizedValue = V;
       ++NumVectorInstructions;
       return V;
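
All three additions above defer the actual work: each recorded entry is later turned into an extractelement of the noted lane, outside this diff. Below is a minimal sketch of that later step, using only the public IRBuilder API; the helper name and parameters are mine, not the pass's.

// Minimal sketch of how a recorded external use is materialized; IRBuilder and
// CreateExtractElement are real LLVM APIs, the helper itself is illustrative.
#include "llvm/IR/IRBuilder.h"

// Pull the recorded lane out of the vectorized value so the scalar consumer
// (a pointer operand or the powi exponent) can keep using it as a scalar.
static llvm::Value *extractLane(llvm::IRBuilder<> &Builder, llvm::Value *Vec,
                                unsigned Lane) {
  return Builder.CreateExtractElement(Vec, Builder.getInt32(Lane));
}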
New test file: Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=i386-apple-macosx10.9.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@a = common global i64* null, align 8
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @fn1() {
+entry:
+  %0 = load i64** @a, align 8
+  %add.ptr = getelementptr inbounds i64* %0, i64 11
+  %1 = ptrtoint i64* %add.ptr to i64
+  store i64 %1, i64* %add.ptr, align 8
+  %add.ptr1 = getelementptr inbounds i64* %0, i64 56
+  %2 = ptrtoint i64* %add.ptr1 to i64
+  %arrayidx2 = getelementptr inbounds i64* %0, i64 12
+  store i64 %2, i64* %arrayidx2, align 8
+  ret i32 undef
+; CHECK-LABEL: @fn1(
+; CHECK: extractelement <2 x i64*>
+; CHECK: ret
+}
+
+
+declare float @llvm.powi.f32(float, i32)
+define void @fn2(i32* %a, i32* %b, float* %c) {
+entry:
+  %i0 = load i32* %a, align 4
+  %i1 = load i32* %b, align 4
+  %add1 = add i32 %i0, %i1
+  %fp1 = sitofp i32 %add1 to float
+  %call1 = tail call float @llvm.powi.f32(float %fp1,i32 %add1) nounwind readnone
+
+  %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+  %i2 = load i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+  %i3 = load i32* %arrayidx3, align 4
+  %add2 = add i32 %i2, %i3
+  %fp2 = sitofp i32 %add2 to float
+  %call2 = tail call float @llvm.powi.f32(float %fp2,i32 %add1) nounwind readnone
+
+  %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+  %i4 = load i32* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+  %i5 = load i32* %arrayidx5, align 4
+  %add3 = add i32 %i4, %i5
+  %fp3 = sitofp i32 %add3 to float
+  %call3 = tail call float @llvm.powi.f32(float %fp3,i32 %add1) nounwind readnone
+
+  %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+  %i6 = load i32* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+  %i7 = load i32* %arrayidx7, align 4
+  %add4 = add i32 %i6, %i7
+  %fp4 = sitofp i32 %add4 to float
+  %call4 = tail call float @llvm.powi.f32(float %fp4,i32 %add1) nounwind readnone
+
+  store float %call1, float* %c, align 4
+  %arrayidx8 = getelementptr inbounds float* %c, i32 1
+  store float %call2, float* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds float* %c, i32 2
+  store float %call3, float* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds float* %c, i32 3
+  store float %call4, float* %arrayidx10, align 4
+  ret void
+
+; CHECK-LABEL: @fn2(
+; CHECK: extractelement <4 x i32>
+; CHECK: ret
+}
