Commit 0d748b4

[LoopVectorize] Extract the last lane from a uniform store
Changes VPReplicateRecipe to extract the last lane from an unconditional, uniform store instruction. collectLoopUniforms will also add stores to the list of uniform instructions where Legal->isUniformMemOp is true.

setCostBasedWideningDecision now sets the widening decision for all uniform memory ops to Scalarize, where previously GatherScatter may have been chosen for scalable stores.

This fixes an assert ("Cannot yet scalarize uniform stores") in setCostBasedWideningDecision when we have a loop containing a uniform i1 store and a scalable VF, which we cannot create a scatter for.

Reviewed By: sdesmalen, david-arm, fhahn

Differential Revision: https://reviews.llvm.org/D112725
1 parent 092cee5 commit 0d748b4
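
As an illustration (not part of the commit), the pattern being targeted is a store whose address is loop-invariant. A minimal LLVM IR sketch with hypothetical names: every iteration overwrites the same location, so a vectorized loop only needs to store the value belonging to the last lane of the last unrolled part.

; Hypothetical example of a "uniform store": %dst does not depend on the
; induction variable, so Legal->isUniformMemOp is true for the store.
define void @uniform_store_example(i32* %dst, i32* %src, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %gep = getelementptr inbounds i32, i32* %src, i64 %iv
  %val = load i32, i32* %gep
  ; The address is the same on every iteration, so only the final stored
  ; value is observable after the loop: the store "demands the last lane".
  store i32 %val, i32* %dst
  %iv.next = add nuw i64 %iv, 1
  %cmp = icmp ult i64 %iv.next, %n
  br i1 %cmp, label %loop, label %exit

exit:
  ret void
}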

File tree

9 files changed (+189, -135 lines)


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 28 additions & 22 deletions
@@ -1770,6 +1770,7 @@ class LoopVectorizationCostModel {
   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
 
   /// Holds the instructions known to be uniform after vectorization.
+  /// Entries in Uniforms may demand either the first or last lane.
   /// The data is collected per VF.
   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

@@ -5409,9 +5410,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     assert(WideningDecision != CM_Unknown &&
            "Widening decision should be ready at this moment");
 
-    // A uniform memory op is itself uniform. We exclude uniform stores
-    // here as they demand the last lane, not the first one.
-    if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
+    // A uniform memory op is itself uniform.
+    if (Legal->isUniformMemOp(*I)) {
       assert(WideningDecision == CM_Scalarize);
       return true;
     }
@@ -5436,7 +5436,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   SetVector<Value *> HasUniformUse;
 
   // Scan the loop for instructions which are either a) known to have only
-  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
+  // lane 0 or the last lane demanded or b) are uses which demand only
+  // lane 0 of their operand.
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
@@ -5468,10 +5469,15 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
54685469
if (!Ptr)
54695470
continue;
54705471

5471-
// A uniform memory op is itself uniform. We exclude uniform stores
5472-
// here as they demand the last lane, not the first one.
5473-
if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5474-
addToWorklistIfAllowed(&I);
5472+
// A uniform memory op is itself uniform. Load instructions are added
5473+
// to the worklist as they demand the first lane. Since store instructions
5474+
// demand the last lane, we instead add these to Uniforms only.
5475+
if (Legal->isUniformMemOp(I)) {
5476+
if (isa<LoadInst>(I))
5477+
addToWorklistIfAllowed(&I);
5478+
else if (!isOutOfScope(&I) && !isScalarWithPredication(&I))
5479+
Uniforms[VF].insert(&I);
5480+
}
54755481

54765482
if (isUniformDecision(&I, VF)) {
54775483
assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
@@ -7490,17 +7496,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
         // relying on instcombine to remove them.
         // Load: Scalar load + broadcast
         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
-        InstructionCost Cost;
-        if (isa<StoreInst>(&I) && VF.isScalable() &&
-            isLegalGatherOrScatter(&I)) {
-          Cost = getGatherScatterCost(&I, VF);
-          setWideningDecision(&I, VF, CM_GatherScatter, Cost);
-        } else {
-          assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
-                 "Cannot yet scalarize uniform stores");
-          Cost = getUniformMemOpCost(&I, VF);
-          setWideningDecision(&I, VF, CM_Scalarize, Cost);
-        }
+        InstructionCost Cost = getUniformMemOpCost(&I, VF);
+        setWideningDecision(&I, VF, CM_Scalarize, Cost);
         continue;
       }

@@ -9858,6 +9855,16 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
     return;
   }
 
+  // If the instruction is a store to a uniform address, we only need to
+  // generate the last lane for the last UF part.
+  Instruction *I = getUnderlyingInstr();
+  if (State.VF.isVector() && IsUniform && isa<StoreInst>(I)) {
+    VPLane Lane = VPLane::getLastLaneForVF(State.VF);
+    State.ILV->scalarizeInstruction(
+        I, this, *this, VPIteration(State.UF - 1, Lane), IsPredicated, State);
+    return;
+  }
+
   // Generate scalar instances for all VF lanes of all UF parts, unless the
   // instruction is uniform in which case generate only the first lane for each
   // of the UF parts.
@@ -9866,9 +9873,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
          "Can't scalarize a scalable vector");
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
-      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
-                                      VPIteration(Part, Lane), IsPredicated,
-                                      State);
+      State.ILV->scalarizeInstruction(I, this, *this, VPIteration(Part, Lane),
+                                      IsPredicated, State);
 }
 
 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
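
For a scalable VF the last-lane index is not a compile-time constant, so VPLane::getLastLaneForVF leads to IR that computes vscale * (known minimum VF) - 1 at run time. A sketch of the shape of the resulting code, distilled from the CHECK lines of the updated sve-inv-store.ll test below (value names here are illustrative):

  %vscale = call i32 @llvm.vscale.i32()
  %runtime.vf = mul i32 %vscale, 4                  ; lanes in <vscale x 4 x i16>
  %last.lane = sub i32 %runtime.vf, 1
  %last = extractelement <vscale x 4 x i16> %wide.load, i32 %last.lane
  store i16 %last, i16* %dst, align 2               ; one scalar store per vector iteration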

llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll

Lines changed: 122 additions & 4 deletions
@@ -4,12 +4,35 @@ target triple = "aarch64-unknown-linux-gnu"
 
 define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N) #0 {
 ; CHECK-LABEL: @inv_store_i16(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK: %[[TMP1:.*]] = insertelement <vscale x 4 x i16*> poison, i16* %dst, i32 0
-; CHECK-NEXT: %[[SPLAT_PTRS:.*]] = shufflevector <vscale x 4 x i16*> %[[TMP1]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK: %[[VECLOAD:.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* %{{.*}}, align 2
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> %[[VECLOAD]], <vscale x 4 x i16*> %[[SPLAT_PTRS]], i32 2
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[SRC:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <vscale x 4 x i16>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP7]], align 2
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP8]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = sub i32 [[TMP9]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP10]]
+; CHECK-NEXT: store i16 [[TMP11]], i16* [[DST:%.*]], align 2
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+;
 entry:
   br label %for.body14

@@ -59,6 +82,98 @@ for.end: ; preds = %for.inc, %entry
   ret void
 }
 
+define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) #0 {
+; CHECK-LABEL: @uniform_store_i1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i64, i64* [[START:%.*]], i64 [[N_VEC]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64*> poison, i64* [[START]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64*> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64*> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 0, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP6]]
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i64, i64* [[START]], <vscale x 2 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i64, i64* [[START]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i64, i64* [[START]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[NEXT_GEP2]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[TMP11]] to <vscale x 2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, <vscale x 2 x i64*> [[NEXT_GEP]], i64 1
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <vscale x 2 x i64*> [[TMP13]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 2 x i1> [[TMP14]], i32 [[TMP17]]
+; CHECK-NEXT: store i1 [[TMP18]], i1* [[DST:%.*]], align 1
+; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %first.sroa = phi i64* [ %incdec.ptr, %for.body ], [ %start, %entry ]
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %iv.next = add i64 %iv, 1
+  %0 = load i64, i64* %first.sroa
+  %incdec.ptr = getelementptr inbounds i64, i64* %first.sroa, i64 1
+  %cmp.not = icmp eq i64* %incdec.ptr, %start
+  store i1 %cmp.not, i1* %dst
+  %cmp = icmp ult i64 %iv, %N
+  br i1 %cmp, label %for.body, label %end, !llvm.loop !6
+
+end:
+  ret void
+}
+
+; Ensure conditional i1 stores do not vectorize
+define void @cond_store_i1(i1* noalias %dst, i8* noalias %start, i32 %cond, i64 %N) #0 {
+; CHECK-LABEL: @cond_store_i1(
+; CHECK-NOT: vector.body
+;
+entry:
+  br label %for.body
+
+for.body:
+  %first.sroa = phi i8* [ %incdec.ptr, %if.end ], [ null, %entry ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %first.sroa, i64 1
+  %0 = load i8, i8* %incdec.ptr
+  %tobool.not = icmp eq i8 %0, 10
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:
+  %cmp.store = icmp eq i8* %start, %incdec.ptr
+  store i1 %cmp.store, i1* %dst
+  br label %if.end
+
+if.end:
+  %cmp.not = icmp eq i8* %incdec.ptr, %start
+  br i1 %cmp.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 attributes #0 = { "target-features"="+neon,+sve" vscale_range(0, 16) }
 
 !0 = distinct !{!0, !1, !2, !3, !4, !5}
@@ -68,3 +183,6 @@ attributes #0 = { "target-features"="+neon,+sve" vscale_range(0, 16) }
 !4 = !{!"llvm.loop.vectorize.enable", i1 true}
 !5 = !{!"llvm.loop.interleave.count", i32 1}
 
+!6 = distinct !{!6, !1, !7, !3, !4, !5}
+!7 = !{!"llvm.loop.vectorize.width", i32 2}

llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
-; RUN: opt -S -loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 < %s | FileCheck %s
-; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 -force-vector-width=4 < %s | FileCheck %s
+; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 -force-vector-width=4 < %s | FileCheck %s
 
 define i32 @main(i32 %arg, i8** nocapture readnone %arg1) #0 {
 ;CHECK: vector.body:

llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll

Lines changed: 7 additions & 21 deletions
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
 
 ; This is a bugpoint reduction of a test from PR43582:
 ; https://bugs.llvm.org/show_bug.cgi?id=43582
@@ -62,25 +62,11 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, i8* %p) #0 {
 ; CHECK-NEXT: [[TMP23:%.*]] = or <4 x i32> [[TMP19]], zeroinitializer
 ; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP22]], zeroinitializer
 ; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP24]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP25]], i32 3
 ; CHECK-NEXT: store i32 [[TMP26]], i32* undef, align 4, !tbaa [[TBAA4:![0-9]+]]
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP24]], i32 1
-; CHECK-NEXT: store i32 [[TMP27]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP24]], i32 2
-; CHECK-NEXT: store i32 [[TMP28]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP24]], i32 3
-; CHECK-NEXT: store i32 [[TMP29]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP25]], i32 0
-; CHECK-NEXT: store i32 [[TMP30]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP25]], i32 1
-; CHECK-NEXT: store i32 [[TMP31]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP25]], i32 2
-; CHECK-NEXT: store i32 [[TMP32]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP25]], i32 3
-; CHECK-NEXT: store i32 [[TMP33]], i32* undef, align 4, !tbaa [[TBAA4]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[SW_EPILOG:%.*]], label [[SCALAR_PH]]
@@ -91,11 +77,11 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, i8* %p) #0 {
 ; CHECK-NEXT: [[P_359:%.*]] = phi i8* [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X]] to i32
 ; CHECK-NEXT: [[SHL71:%.*]] = shl nuw i32 [[CONV70]], 24
-; CHECK-NEXT: [[TMP35:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP35]] to i32
+; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP28]] to i32
 ; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16
 ; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]]
-; CHECK-NEXT: [[TMP36:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
 ; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8
 ; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]]
 ; CHECK-NEXT: [[CONV81:%.*]] = zext i8 undef to i32

llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll

Lines changed: 3 additions & 9 deletions
@@ -84,17 +84,11 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4
 ; CHECK-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
 ; CHECK-NEXT: store i32 [[TMP24]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP23]], i32 1
-; CHECK-NEXT: store i32 [[TMP25]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2
-; CHECK-NEXT: store i32 [[TMP26]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
-; CHECK-NEXT: store i32 [[TMP27]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
