Skip to content

Commit 961f51f

Browse files
committed
[LoopVectorize][CostModel] Choose smaller VFs for in-loop reductions without loads/stores
For loops that contain in-loop reductions but no loads or stores, large VFs are chosen because LoopVectorizationCostModel::getSmallestAndWidestTypes has no element types to check and so returns the default widths (-1U for the smallest and 8 for the widest). As a result, the widest VF is chosen for the following example: `float s = 0; for (int i = 0; i < N; ++i) s += (float) i*i;`. For more computationally intensive loops, this leads to large loop sizes when the operations end up being scalarized. In this patch, when ElementTypesInLoop is empty, the widest type is determined by finding the smallest type used by recurrences in the loop instead of falling back to the default value of 8 bits. This results in the cost model choosing a more sensible VF for loops like the one above. Differential Revision: https://reviews.llvm.org/D113973
1 parent c9dbf0f commit 961f51f

File tree

5 files changed

+144
-29
lines changed

5 files changed

+144
-29
lines changed

llvm/include/llvm/Analysis/IVDescriptors.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,12 @@ class RecurrenceDescriptor {
7777
RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K,
7878
FastMathFlags FMF, Instruction *ExactFP, Type *RT,
7979
bool Signed, bool Ordered,
80-
SmallPtrSetImpl<Instruction *> &CI)
80+
SmallPtrSetImpl<Instruction *> &CI,
81+
unsigned MinWidthCastToRecurTy)
8182
: StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF),
8283
ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed),
83-
IsOrdered(Ordered) {
84+
IsOrdered(Ordered),
85+
MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) {
8486
CastInsts.insert(CI.begin(), CI.end());
8587
}
8688

@@ -251,6 +253,11 @@ class RecurrenceDescriptor {
251253
/// recurrence.
252254
const SmallPtrSet<Instruction *, 8> &getCastInsts() const { return CastInsts; }
253255

256+
/// Returns the minimum source width, in bits, of the casts to the recurrence
/// type feeding the recurrence, or -1U if no such cast was found.
257+
unsigned getMinWidthCastToRecurrenceTypeInBits() const {
258+
return MinWidthCastToRecurrenceType;
259+
}
260+
254261
/// Returns true if all source operands of the recurrence are SExtInsts.
255262
bool isSigned() const { return IsSigned; }
256263

@@ -291,6 +298,8 @@ class RecurrenceDescriptor {
291298
bool IsOrdered = false;
292299
// Instructions used for type-promoting the recurrence.
293300
SmallPtrSet<Instruction *, 8> CastInsts;
301+
// The minimum width used by the recurrence.
302+
unsigned MinWidthCastToRecurrenceType;
294303
};
295304

296305
/// A struct for saving information about induction variables.

llvm/lib/Analysis/IVDescriptors.cpp

Lines changed: 37 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -161,27 +161,39 @@ static std::pair<Type *, bool> computeRecurrenceType(Instruction *Exit,
161161

162162
/// Collect cast instructions that can be ignored in the vectorizer's cost
163163
/// model, given a reduction exit value and the minimal type in which the
164-
/// reduction can be represented.
165-
static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
166-
Type *RecurrenceType,
167-
SmallPtrSetImpl<Instruction *> &Casts) {
164+
/// reduction can be represented. Also search casts to the recurrence type
165+
/// to find the minimum width used by the recurrence.
166+
static void collectCastInstrs(Loop *TheLoop, Instruction *Exit,
167+
Type *RecurrenceType,
168+
SmallPtrSetImpl<Instruction *> &Casts,
169+
unsigned &MinWidthCastToRecurTy) {
168170

169171
SmallVector<Instruction *, 8> Worklist;
170172
SmallPtrSet<Instruction *, 8> Visited;
171173
Worklist.push_back(Exit);
174+
MinWidthCastToRecurTy = -1U;
172175

173176
while (!Worklist.empty()) {
174177
Instruction *Val = Worklist.pop_back_val();
175178
Visited.insert(Val);
176-
if (auto *Cast = dyn_cast<CastInst>(Val))
179+
if (auto *Cast = dyn_cast<CastInst>(Val)) {
177180
if (Cast->getSrcTy() == RecurrenceType) {
178181
// If the source type of a cast instruction is equal to the recurrence
179182
// type, it will be eliminated, and should be ignored in the vectorizer
180183
// cost model.
181184
Casts.insert(Cast);
182185
continue;
183186
}
184-
187+
if (Cast->getDestTy() == RecurrenceType) {
188+
// The minimum width used by the recurrence is found by checking for
189+
// casts on its operands. The minimum width is used by the vectorizer
190+
// when finding the widest type for in-loop reductions without any
191+
// loads/stores.
192+
MinWidthCastToRecurTy = std::min<unsigned>(
193+
MinWidthCastToRecurTy, Cast->getSrcTy()->getScalarSizeInBits());
194+
continue;
195+
}
196+
}
185197
// Add all operands to the work list if they are loop-varying values that
186198
// we haven't yet visited.
187199
for (Value *O : cast<User>(Val)->operands())
@@ -265,6 +277,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
265277
// Data used for determining if the recurrence has been type-promoted.
266278
Type *RecurrenceType = Phi->getType();
267279
SmallPtrSet<Instruction *, 4> CastInsts;
280+
unsigned MinWidthCastToRecurrenceType;
268281
Instruction *Start = Phi;
269282
bool IsSigned = false;
270283

@@ -500,21 +513,24 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
500513
computeRecurrenceType(ExitInstruction, DB, AC, DT);
501514
if (ComputedType != RecurrenceType)
502515
return false;
503-
504-
// The recurrence expression will be represented in a narrower type. If
505-
// there are any cast instructions that will be unnecessary, collect them
506-
// in CastInsts. Note that the 'and' instruction was already included in
507-
// this list.
508-
//
509-
// TODO: A better way to represent this may be to tag in some way all the
510-
// instructions that are a part of the reduction. The vectorizer cost
511-
// model could then apply the recurrence type to these instructions,
512-
// without needing a white list of instructions to ignore.
513-
// This may also be useful for the inloop reductions, if it can be
514-
// kept simple enough.
515-
collectCastsToIgnore(TheLoop, ExitInstruction, RecurrenceType, CastInsts);
516516
}
517517

518+
// Collect cast instructions and the minimum width used by the recurrence.
519+
// If the starting value is not the same as the phi node and the computed
520+
// recurrence type is equal to the recurrence type, the recurrence expression
521+
// will be represented in a narrower or wider type. If there are any cast
522+
// instructions that will be unnecessary, collect them in CastInsts.
523+
// Note that the 'and' instruction was already included in this list.
524+
//
525+
// TODO: A better way to represent this may be to tag in some way all the
526+
// instructions that are a part of the reduction. The vectorizer cost
527+
// model could then apply the recurrence type to these instructions,
528+
// without needing a white list of instructions to ignore.
529+
// This may also be useful for the inloop reductions, if it can be
530+
// kept simple enough.
531+
collectCastInstrs(TheLoop, ExitInstruction, RecurrenceType, CastInsts,
532+
MinWidthCastToRecurrenceType);
533+
518534
// We found a reduction var if we have reached the original phi node and we
519535
// only have a single instruction with out-of-loop users.
520536

@@ -524,7 +540,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
524540
// Save the description of this reduction variable.
525541
RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF,
526542
ReduxDesc.getExactFPMathInst(), RecurrenceType,
527-
IsSigned, IsOrdered, CastInsts);
543+
IsSigned, IsOrdered, CastInsts,
544+
MinWidthCastToRecurrenceType);
528545
RedDes = RD;
529546

530547
return true;

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5960,11 +5960,29 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
59605960
unsigned MinWidth = -1U;
59615961
unsigned MaxWidth = 8;
59625962
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5963-
for (Type *T : ElementTypesInLoop) {
5964-
MinWidth = std::min<unsigned>(
5965-
MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5966-
MaxWidth = std::max<unsigned>(
5967-
MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5963+
// For in-loop reductions, no element types are added to ElementTypesInLoop
5964+
// if there are no loads/stores in the loop. In this case, check through the
5965+
// reduction variables to determine the maximum width.
5966+
if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5967+
// Reset MaxWidth so that we can find the smallest type used by recurrences
5968+
// in the loop.
5969+
MaxWidth = -1U;
5970+
for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5971+
const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5972+
// When finding the min width used by the recurrence we need to account
5973+
// for casts on the input operands of the recurrence.
5974+
MaxWidth = std::min<unsigned>(
5975+
MaxWidth, std::min<unsigned>(
5976+
RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5977+
RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5978+
}
5979+
} else {
5980+
for (Type *T : ElementTypesInLoop) {
5981+
MinWidth = std::min<unsigned>(
5982+
MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5983+
MaxWidth = std::max<unsigned>(
5984+
MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5985+
}
59685986
}
59695987
return {MinWidth, MaxWidth};
59705988
}

llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; REQUIRES: asserts
2-
; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
2+
; RUN: opt < %s -loop-vectorize -force-target-instruction-cost=1 -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
33

44
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
55
target triple = "aarch64--linux-gnu"
@@ -31,3 +31,74 @@ for.body:
3131
for.end:
3232
ret void
3333
}
34+
35+
; For in-loop reductions with no loads or stores in the loop the widest type is
36+
; determined by looking through the recurrences, which allows a sensible VF to be
37+
; chosen. The following 3 cases check different combinations of widths.
38+
39+
; CHECK-LABEL: Checking a loop in "no_loads_stores_32"
40+
; CHECK: The Smallest and Widest types: 4294967295 / 32 bits
41+
; CHECK: Selecting VF: 4
42+
43+
define double @no_loads_stores_32(i32 %n) {
44+
entry:
45+
br label %for.body
46+
47+
for.body:
48+
%s.09 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
49+
%i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
50+
%conv = sitofp i32 %i.08 to float
51+
%conv1 = fpext float %conv to double
52+
%add = fadd double %s.09, %conv1
53+
%inc = add nuw i32 %i.08, 1
54+
%exitcond.not = icmp eq i32 %inc, %n
55+
br i1 %exitcond.not, label %for.end, label %for.body
56+
57+
for.end:
58+
%.lcssa = phi double [ %add, %for.body ]
59+
ret double %.lcssa
60+
}
61+
62+
; CHECK-LABEL: Checking a loop in "no_loads_stores_16"
63+
; CHECK: The Smallest and Widest types: 4294967295 / 16 bits
64+
; CHECK: Selecting VF: 8
65+
66+
define double @no_loads_stores_16() {
67+
entry:
68+
br label %for.body
69+
70+
for.body:
71+
%s.09 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
72+
%i.08 = phi i16 [ 0, %entry ], [ %inc, %for.body ]
73+
%conv = sitofp i16 %i.08 to double
74+
%add = fadd double %s.09, %conv
75+
%inc = add nuw nsw i16 %i.08, 1
76+
%exitcond.not = icmp eq i16 %inc, 12345
77+
br i1 %exitcond.not, label %for.end, label %for.body
78+
79+
for.end:
80+
%.lcssa = phi double [ %add, %for.body ]
81+
ret double %.lcssa
82+
}
83+
84+
; CHECK-LABEL: Checking a loop in "no_loads_stores_8"
85+
; CHECK: The Smallest and Widest types: 4294967295 / 8 bits
86+
; CHECK: Selecting VF: 16
87+
88+
define float @no_loads_stores_8() {
89+
entry:
90+
br label %for.body
91+
92+
for.body:
93+
%s.09 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
94+
%i.08 = phi i8 [ 0, %entry ], [ %inc, %for.body ]
95+
%conv = sitofp i8 %i.08 to float
96+
%add = fadd float %s.09, %conv
97+
%inc = add nuw nsw i8 %i.08, 1
98+
%exitcond.not = icmp eq i8 %inc, 12345
99+
br i1 %exitcond.not, label %for.end, label %for.body
100+
101+
for.end:
102+
%.lcssa = phi float [ %add, %for.body ]
103+
ret float %.lcssa
104+
}

llvm/test/Transforms/LoopVectorize/X86/funclet.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ unreachable: ; preds = %entry
3333

3434
; CHECK-LABEL: define void @test1(
3535
; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null]
36-
; CHECK: call <16 x double> @llvm.floor.v16f64(<16 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]
36+
; CHECK: call <8 x double> @llvm.floor.v8f64(<8 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]
3737

3838
declare x86_stdcallcc void @_CxxThrowException(i8*, i8*)
3939

0 commit comments

Comments
 (0)