Skip to content

Commit 6ed9cef

Browse files
committed
[LV] Scalar with predication must not be uniform
Fix PR40816: avoid considering scalar-with-predication instructions as also uniform-after-vectorization. Instructions identified as "scalar with predication" are "vectorized" using a replicating region. If such an instruction is also optimized as "uniform after vectorization", namely when only the first of its VF lanes is used, the replicating region becomes erroneous: only the first instance of the region can and should be formed. Fix such cases by not considering these instructions as "uniform after vectorization". Differential Revision: https://reviews.llvm.org/D70298
1 parent 96c8024 commit 6ed9cef

File tree

2 files changed

+105
-17
lines changed

2 files changed

+105
-17
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4668,14 +4668,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
46684668
SetVector<Instruction *> Worklist;
46694669
BasicBlock *Latch = TheLoop->getLoopLatch();
46704670

4671+
// Instructions that are scalar with predication must not be considered
4672+
// uniform after vectorization, because that would create an erroneous
4673+
// replicating region where only a single instance out of VF should be formed.
4674+
// TODO: optimize such rare cases if found important, see PR40816.
4675+
auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4676+
if (isScalarWithPredication(I, VF)) {
4677+
LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4678+
<< *I << "\n");
4679+
return;
4680+
}
4681+
LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4682+
Worklist.insert(I);
4683+
};
4684+
46714685
// Start with the conditional branch. If the branch condition is an
46724686
// instruction contained in the loop that is only used by the branch, it is
46734687
// uniform.
46744688
auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4675-
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4676-
Worklist.insert(Cmp);
4677-
LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4678-
}
4689+
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4690+
addToWorklistIfAllowed(Cmp);
46794691

46804692
// Holds consecutive and consecutive-like pointers. Consecutive-like pointers
46814693
// are pointers that are treated like consecutive pointers during
@@ -4734,10 +4746,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
47344746
// Add to the Worklist all consecutive and consecutive-like pointers that
47354747
// aren't also identified as possibly non-uniform.
47364748
for (auto *V : ConsecutiveLikePtrs)
4737-
if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4738-
LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4739-
Worklist.insert(V);
4740-
}
4749+
if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4750+
addToWorklistIfAllowed(V);
47414751

47424752
// Expand Worklist in topological order: whenever a new instruction
47434753
// is added, its users should already be inside Worklist. It ensures
@@ -4763,10 +4773,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
47634773
return Worklist.count(J) ||
47644774
(OI == getLoadStorePointerOperand(J) &&
47654775
isUniformDecision(J, VF));
4766-
})) {
4767-
Worklist.insert(OI);
4768-
LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4769-
}
4776+
}))
4777+
addToWorklistIfAllowed(OI);
47704778
}
47714779
}
47724780

@@ -4808,11 +4816,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
48084816
continue;
48094817

48104818
// The induction variable and its update instruction will remain uniform.
4811-
Worklist.insert(Ind);
4812-
Worklist.insert(IndUpdate);
4813-
LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4814-
LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4815-
<< "\n");
4819+
addToWorklistIfAllowed(Ind);
4820+
addToWorklistIfAllowed(IndUpdate);
48164821
}
48174822

48184823
Uniforms[VF].insert(Worklist.begin(), Worklist.end());

llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; REQUIRES: asserts
22
; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
3+
; RUN: opt < %s -loop-vectorize -force-vector-width=2 -S | FileCheck %s -check-prefix=FORCE
34

45
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
56
target triple = "x86_64-unknown-linux-gnu"
@@ -65,3 +66,85 @@ for.end:
6566
}
6667

6768
attributes #0 = { "target-cpu"="knl" }
69+
70+
; CHECK-LABEL: PR40816
71+
;
72+
; Check that scalar with predication instructions are not considered uniform
73+
; after vectorization, because that results in replicating a region instead of
74+
; having a single instance (out of VF). The predication stems from a tiny count
75+
; of 3 leading to folding the tail by masking using icmp ule <i, i+1>, <2, 2>.
76+
;
77+
; CHECK: LV: Found trip count: 3
78+
; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i32 {{%.*}}, 0
79+
; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i32, i32* {{%.*}}, align 1
80+
; CHECK: LV: Found not uniform being ScalarWithPredication: {{%.*}} = load i32, i32* {{%.*}}, align 1
81+
; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 {{%.*}}
82+
;
83+
; FORCE-LABEL: @PR40816(
84+
; FORCE-NEXT: entry:
85+
; FORCE-NEXT: br i1 false, label {{%.*}}, label [[VECTOR_PH:%.*]]
86+
; FORCE: vector.ph:
87+
; FORCE-NEXT: br label [[VECTOR_BODY:%.*]]
88+
; FORCE: vector.body:
89+
; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ]
90+
; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ]
91+
; FORCE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
92+
; FORCE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
93+
; FORCE-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 2, i32 2>
94+
; FORCE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
95+
; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
96+
; FORCE: pred.store.if:
97+
; FORCE-NEXT: store i32 [[TMP0]], i32* @b, align 1
98+
; FORCE-NEXT: br label [[PRED_STORE_CONTINUE]]
99+
; FORCE: pred.store.continue:
100+
; FORCE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
101+
; FORCE-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
102+
; FORCE: pred.store.if1:
103+
; FORCE-NEXT: store i32 [[TMP1]], i32* @b, align 1
104+
; FORCE-NEXT: br label [[PRED_STORE_CONTINUE2]]
105+
; FORCE: pred.store.continue2:
106+
; FORCE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
107+
; FORCE-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
108+
; FORCE: pred.load.if:
109+
; FORCE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP0]]
110+
; FORCE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 1
111+
; FORCE-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[TMP7]], i32 0
112+
; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE]]
113+
; FORCE: pred.load.continue:
114+
; FORCE-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ undef, [[PRED_STORE_CONTINUE2]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
115+
; FORCE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
116+
; FORCE-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
117+
; FORCE: pred.load.if3:
118+
; FORCE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP1]]
119+
; FORCE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 1
120+
; FORCE-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1
121+
; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE4]]
122+
; FORCE: pred.load.continue4:
123+
; FORCE-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF3]] ]
124+
; FORCE-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
125+
; FORCE-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
126+
; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
127+
; FORCE-NEXT: br i1 [[TMP15]], label {{%.*}}, label [[VECTOR_BODY]]
128+
;
129+
@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1
130+
@b = external global i32, align 1
131+
132+
define void @PR40816() #1 {
133+
134+
entry:
135+
br label %for.body
136+
137+
for.body: ; preds = %for.body, %entry
138+
%0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
139+
store i32 %0, i32* @b, align 1
140+
%arrayidx1 = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 %0
141+
%1 = load i32, i32* %arrayidx1, align 1
142+
%cmp2 = icmp eq i32 %1, 0
143+
%inc = add nuw nsw i32 %0, 1
144+
br i1 %cmp2, label %return, label %for.body
145+
146+
return: ; preds = %for.body
147+
ret void
148+
}
149+
150+
attributes #1 = { "target-cpu"="core2" }

0 commit comments

Comments
 (0)