Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit c7a8ddb

Browse files
committed
[IAI,LV] Avoid creating a scalar epilogue due to gaps in interleave-groups when
optimizing for size LV is careful to respect -Os and not to create a scalar epilog in all cases (runtime tests, trip-counts that require a remainder loop) except for peeling due to gaps in interleave-groups. This patch fixes that; -Os will now have us invalidate such interleave-groups and vectorize without an epilog. The patch also removes a related FIXME comment that is now obsolete, and was also inaccurate: "FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a smaller MaxVF that does not require a scalar epilog." (requiresScalarEpilog() has nothing to do with VF). Reviewers: Ayal, hsaito, dcaballe, fhahn Reviewed By: Ayal Differential Revision: https://reviews.llvm.org/D53420 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344883 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent e01c86d commit c7a8ddb

File tree

4 files changed

+166
-4
lines changed

4 files changed

+166
-4
lines changed

include/llvm/Analysis/VectorUtils.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,23 @@ class InterleaveGroup {
308308
propagateMetadata(NewInst, VL);
309309
}
310310

311+
/// Returns true if this Group requires a scalar iteration to handle gaps.
312+
bool requiresScalarEpilogue() const {
313+
// If Group has no gaps, or has gaps but the last member exists, then a
314+
// scalar epilog is not needed for this group.
315+
if (getNumMembers() == getFactor() || getMember(getFactor() - 1))
316+
return false;
317+
318+
// We have a group with gaps. It therefore cannot be a group of stores,
319+
// and it can't be a reversed access, because such groups get invalidated.
320+
assert(!getMember(0)->mayWriteToMemory() &&
321+
"Group should have been invalidated");
322+
assert(!isReverse() && "Group should have been invalidated");
323+
324+
// This is a group of loads, with gaps, and without a last-member
325+
return true;
326+
}
327+
311328
private:
312329
unsigned Factor; // Interleave Factor.
313330
bool Reverse;
@@ -388,6 +405,11 @@ class InterleavedAccessInfo {
388405
/// out-of-bounds requires a scalar epilogue iteration for correctness.
389406
bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
390407

408+
/// Invalidate groups that require a scalar epilogue (due to gaps). This can
409+
/// happen when we optimize for size and don't allow creating a scalar
410+
/// epilogue.
411+
void invalidateGroupsRequiringScalarEpilogue();
412+
391413
private:
392414
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
393415
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.

lib/Analysis/VectorUtils.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -919,3 +919,27 @@ void InterleavedAccessInfo::analyzeInterleaving(
919919
}
920920
}
921921
}
922+
923+
void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
924+
// If no group had triggered the requirement to create an epilogue loop,
925+
// there is nothing to do.
926+
if (!requiresScalarEpilogue())
927+
return;
928+
929+
// Avoid releasing a Group twice.
930+
SmallPtrSet<InterleaveGroup *, 4> DelSet;
931+
for (auto &I : InterleaveGroupMap) {
932+
InterleaveGroup *Group = I.second;
933+
if (Group->requiresScalarEpilogue())
934+
DelSet.insert(Group);
935+
}
936+
for (auto *Ptr : DelSet) {
937+
LLVM_DEBUG(
938+
dbgs()
939+
<< "LV: Invalidate candidate interleaved group due to gaps that "
940+
"require a scalar epilogue.\n");
941+
releaseGroup(Ptr);
942+
}
943+
944+
RequiresScalarEpilogue = false;
945+
}

lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4599,6 +4599,14 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
45994599
return None;
46004600
}
46014601

4602+
// Record that scalar epilogue is not allowed.
4603+
LLVM_DEBUG(dbgs() << "LV: Not inserting scalar epilogue for access with gaps "
4604+
"due to -Os/-Oz.\n");
4605+
4606+
// We don't create an epilogue when optimizing for size.
4607+
// Invalidate interleave groups that require an epilogue.
4608+
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4609+
46024610
unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
46034611

46044612
if (TC > 0 && TC % MaxVF == 0) {
@@ -4610,8 +4618,6 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
46104618
// found modulo the vectorization factor is not zero, try to fold the tail
46114619
// by masking.
46124620
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4613-
// FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
4614-
// smaller MaxVF that does not require a scalar epilog.
46154621
if (Legal->canFoldTailByMasking()) {
46164622
FoldTailByMasking = true;
46174623
return MaxVF;

test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll

Lines changed: 112 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
2-
; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
1+
; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
2+
; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
33

44
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
55
target triple = "i386-unknown-linux-gnu"
@@ -9,9 +9,13 @@ target triple = "i386-unknown-linux-gnu"
99
; interleaved-group but rather as a scalarized accesses.
1010
; (For SKX, Gather is not supported by the compiler for chars, therefore
1111
; the only remaining alternative is to scalarize).
12+
; In this case a scalar epilogue is not needed.
13+
;
1214
; When masked-interleave-group is enabled we expect to find the proper mask
1315
; shuffling code, feeding the wide masked load for an interleave-group (with
1416
; a single member).
17+
; Since the last (second) member of the load-group is a gap, peeling is used,
18+
; so we also expect to find a scalar epilogue loop.
1519
;
1620
; void masked_strided1(const unsigned char* restrict p,
1721
; unsigned char* restrict q,
@@ -38,6 +42,8 @@ target triple = "i386-unknown-linux-gnu"
3842
;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
3943
;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load.
4044
;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
45+
;DISABLED_MASKED_STRIDED-NOT: for.body:
46+
;DISABLED_MASKED_STRIDED: for.end:
4147

4248
;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
4349
;ENABLED_MASKED_STRIDED: vector.body:
@@ -47,6 +53,7 @@ target triple = "i386-unknown-linux-gnu"
4753
;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
4854
;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
4955
;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
56+
;ENABLED_MASKED_STRIDED: for.body:
5057

5158
define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
5259
entry:
@@ -75,6 +82,109 @@ for.end:
7582
ret void
7683
}
7784

85+
; Exactly the same scenario except we are now optimizing for size, therefore
86+
; we check that no scalar epilogue is created. Since we can't create an epilog
87+
; the interleave-group is invalidated because it has gaps, so we end up
88+
; scalarizing.
89+
; (Before the fix that this test checks, we used to create an epilogue despite
90+
; optsize, and vectorized the access as an interleaved-group. This is now fixed,
91+
; and we make sure that a scalar epilogue does not exist).
92+
93+
;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
94+
;ENABLED_MASKED_STRIDED: vector.body:
95+
;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
96+
;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
97+
;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
98+
;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
99+
;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
100+
;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
101+
;ENABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
102+
;ENABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
103+
;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
104+
;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
105+
;ENABLED_MASKED_STRIDED-NOT: for.body:
106+
;ENABLED_MASKED_STRIDED: for.end:
107+
108+
define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
109+
entry:
110+
%conv = zext i8 %guard to i32
111+
br label %for.body
112+
113+
for.body:
114+
%ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
115+
%cmp1 = icmp ugt i32 %ix.09, %conv
116+
br i1 %cmp1, label %if.then, label %for.inc
117+
118+
if.then:
119+
%mul = shl nuw nsw i32 %ix.09, 1
120+
%arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
121+
%0 = load i8, i8* %arrayidx, align 1
122+
%arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
123+
store i8 %0, i8* %arrayidx3, align 1
124+
br label %for.inc
125+
126+
for.inc:
127+
%inc = add nuw nsw i32 %ix.09, 1
128+
%exitcond = icmp eq i32 %inc, 1024
129+
br i1 %exitcond, label %for.end, label %for.body
130+
131+
for.end:
132+
ret void
133+
}
134+
135+
; Same, but the load/store are not predicated. The interleave-group is
136+
; invalidated here as well because we have gaps and we can't create an epilog.
137+
; The access is thus scalarized.
138+
; (Before the fix that this test checks, we used to create an epilogue despite
139+
; optsize, and vectorized the access as an interleaved-group. This is now fixed,
140+
; and we make sure that a scalar epilogue does not exist).
141+
; Since enable-masked-interleaved-accesses currently only affects predicated
142+
; accesses, the behavior is the same with this switch set/unset.
143+
144+
145+
; void unconditional_strided1_optsize(const unsigned char* restrict p,
146+
; unsigned char* restrict q,
147+
; unsigned char guard) {
148+
; for(ix=0; ix < 1024; ++ix) {
149+
; char t = p[2*ix];
150+
; q[ix] = t;
151+
; }
152+
; }
153+
154+
;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
155+
;DISABLED_MASKED_STRIDED: vector.body:
156+
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
157+
;DISABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
158+
;DISABLED_MASKED_STRIDED-NOT: for.body:
159+
;DISABLED_MASKED_STRIDED: for.end:
160+
161+
;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
162+
;ENABLED_MASKED_STRIDED: vector.body:
163+
;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
164+
;ENABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
165+
;ENABLED_MASKED_STRIDED-NOT: for.body:
166+
;ENABLED_MASKED_STRIDED: for.end:
167+
168+
define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
169+
entry:
170+
br label %for.body
171+
172+
for.body:
173+
%ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
174+
%mul = shl nuw nsw i32 %ix.06, 1
175+
%arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
176+
%0 = load i8, i8* %arrayidx, align 1
177+
%arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
178+
store i8 %0, i8* %arrayidx1, align 1
179+
%inc = add nuw nsw i32 %ix.06, 1
180+
%exitcond = icmp eq i32 %inc, 1024
181+
br i1 %exitcond, label %for.end, label %for.body
182+
183+
for.end:
184+
ret void
185+
}
186+
187+
78188
; Check also a scenario with full interleave-groups (no gaps) as well as both
79189
; load and store groups. We check that when masked-interleave-group is disabled
80190
; the predicated loads (and stores) are not vectorized as an

0 commit comments

Comments
 (0)