Skip to content

Commit 9fa9457

Browse files
committed
[LV] Split checking if tail-folding is possible, collecting masked ops.
Introduce a new canFoldTailByMasking helper which only checks whether tail-folding is possible, without modifying MaskedOp. Just because tail-folding is possible doesn't mean the tail will be folded; that's up to the cost-model to decide. Separating the check for whether tail-folding is possible from the preparation for tail-folding ensures that MaskedOp is only populated when tail-folding is actually selected. This allows creating the header mask only if needed after #76635.
1 parent 11f7c89 commit 9fa9457

File tree

4 files changed

+65
-29
lines changed

4 files changed

+65
-29
lines changed

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -276,9 +276,12 @@ class LoopVectorizationLegality {
276276
bool canVectorizeFPMath(bool EnableStrictReductions);
277277

278278
/// Return true if we can vectorize this loop while folding its tail by
279-
/// masking, and mark all respective loads/stores for masking.
280-
/// This object's state is only modified iff this function returns true.
281-
bool prepareToFoldTailByMasking();
279+
/// masking.
280+
bool canFoldTailByMasking() const;
281+
282+
/// Mark all respective loads/stores for masking. Must only be called when
283+
/// tail-folding is possible.
284+
void prepareToFoldTailByMasking();
282285

283286
/// Returns the primary induction variable.
284287
PHINode *getPrimaryInduction() { return PrimaryInduction; }

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1543,7 +1543,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
15431543
return Result;
15441544
}
15451545

1546-
bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
1546+
bool LoopVectorizationLegality::canFoldTailByMasking() const {
15471547

15481548
LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
15491549

@@ -1601,8 +1601,24 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
16011601

16021602
LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
16031603

1604-
MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end());
16051604
return true;
16061605
}
16071606

1607+
void LoopVectorizationLegality::prepareToFoldTailByMasking() {
1608+
// The list of pointers that we can safely read and write to remains empty.
1609+
SmallPtrSet<Value *, 8> SafePointers;
1610+
1611+
// Collect masked ops in temporary set first to avoid partially populating
1612+
// MaskedOp if a block cannot be predicated.
1613+
SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
1614+
1615+
// Check and mark all blocks for predication, including those that ordinarily
1616+
// do not need predication such as the header block.
1617+
for (BasicBlock *BB : TheLoop->blocks()) {
1618+
bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
1619+
(void)R;
1620+
assert(R && "Must be able to predicate block when tail-folding.");
1621+
}
1622+
}
1623+
16081624
} // namespace llvm

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1502,7 +1502,7 @@ class LoopVectorizationCostModel {
15021502
/// \param UserIC User specific interleave count.
15031503
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
15041504
assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1505-
if (!Legal->prepareToFoldTailByMasking()) {
1505+
if (!Legal->canFoldTailByMasking()) {
15061506
ChosenTailFoldingStyle =
15071507
std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
15081508
return;
@@ -7226,6 +7226,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
72267226
CM.invalidateCostModelingDecisions();
72277227
}
72287228

7229+
if (CM.foldTailByMasking())
7230+
Legal->prepareToFoldTailByMasking();
7231+
72297232
ElementCount MaxUserVF =
72307233
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
72317234
bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll

Lines changed: 37 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -96,35 +96,49 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
9696
;
9797
; NO-VP-LABEL: @interleave(
9898
; NO-VP-NEXT: entry:
99-
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
99+
; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
100+
; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
101+
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
100102
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
101103
; NO-VP: vector.ph:
102-
; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
104+
; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
105+
; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
106+
; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
103107
; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
108+
; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
109+
; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
104110
; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
105111
; NO-VP: vector.body:
106112
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
107-
; NO-VP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
108-
; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
109-
; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP10]], i32 0
110-
; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP1]], i32 0
111-
; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
112-
; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
113-
; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
114-
; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
115-
; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
116-
; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
117-
; NO-VP-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
118-
; NO-VP-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
119-
; NO-VP-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
120-
; NO-VP-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
121-
; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
122-
; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
123-
; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
124-
; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 8
125-
; NO-VP-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP12]], align 4
126-
; NO-VP-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP11]], align 4
127-
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
113+
; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
114+
; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
115+
; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
116+
; NO-VP-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0
117+
; NO-VP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
118+
; NO-VP-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
119+
; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP6]], i32 0
120+
; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP11]], i32 0
121+
; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
122+
; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
123+
; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4
124+
; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP15]], align 4
125+
; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
126+
; NO-VP-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
127+
; NO-VP-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
128+
; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
129+
; NO-VP-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
130+
; NO-VP-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
131+
; NO-VP-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP17]], [[TMP16]]
132+
; NO-VP-NEXT: [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[TMP19]], [[TMP18]]
133+
; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
134+
; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
135+
; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
136+
; NO-VP-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
137+
; NO-VP-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4
138+
; NO-VP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i64 [[TMP26]]
139+
; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP20]], ptr [[TMP24]], align 4
140+
; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP27]], align 4
141+
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
128142
; NO-VP-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
129143
; NO-VP-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
130144
; NO-VP: middle.block:

0 commit comments

Comments
 (0)