Skip to content

Commit 6673e56

Browse files
committed
[LoopVectorize] Add cost of generating tail-folding mask to the loop
At the moment, if we decide to enable tail-folding we do not include the cost of generating the mask per VF. This can mean we make some poor choices of VF, which is definitely true for SVE-enabled AArch64 targets where mask generation for fixed-width vectors is more expensive than for scalable vectors. New tests added: Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll and Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
1 parent c1bd688 commit 6673e56

File tree

6 files changed

+119
-50
lines changed

6 files changed

+119
-50
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1604,6 +1604,9 @@ class LoopVectorizationCostModel {
16041604
/// vectorization has actually taken place).
16051605
using VectorizationCostTy = std::pair<InstructionCost, bool>;
16061606

1607+
/// Calculate the cost of generating the mask when tail-folding for the VF.
1608+
InstructionCost getTailFoldMaskCost(ElementCount VF);
1609+
16071610
/// Returns the expected execution cost. The unit of the cost does
16081611
/// not matter because we use the 'cost' units to compare different
16091612
/// vector widths. The cost that is returned is *not* normalized by
@@ -5942,6 +5945,35 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
59425945
return Discount;
59435946
}
59445947

5948+
InstructionCost
5949+
LoopVectorizationCostModel::getTailFoldMaskCost(ElementCount VF) {
5950+
if (VF.isScalar())
5951+
return 0;
5952+
5953+
InstructionCost MaskCost;
5954+
Type *IndTy = Legal->getWidestInductionType();
5955+
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5956+
TailFoldingStyle Style = getTailFoldingStyle();
5957+
LLVMContext &Context = TheLoop->getHeader()->getContext();
5958+
VectorType *RetTy = VectorType::get(IntegerType::getInt1Ty(Context), VF);
5959+
if (useActiveLaneMask(Style)) {
5960+
IntrinsicCostAttributes Attrs(
5961+
Intrinsic::get_active_lane_mask, RetTy,
5962+
{PoisonValue::get(IndTy), PoisonValue::get(IndTy)});
5963+
MaskCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
5964+
} else {
5965+
// This is just a stepvector, added to a splat of the current IV, followed
5966+
// by a vector comparison with a splat of the trip count. Since the
5967+
// stepvector is loop invariant it will be hoisted out so we can ignore it.
5968+
// This just leaves us with an add and an icmp.
5969+
VectorType *VecTy = VectorType::get(IndTy, VF);
5970+
MaskCost = TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
5971+
MaskCost += TTI.getCmpSelInstrCost(Instruction::ICmp, VecTy, RetTy,
5972+
ICmpInst::ICMP_ULE, CostKind, nullptr);
5973+
}
5974+
return MaskCost;
5975+
}
5976+
59455977
LoopVectorizationCostModel::VectorizationCostTy
59465978
LoopVectorizationCostModel::expectedCost(
59475979
ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
@@ -5990,6 +6022,15 @@ LoopVectorizationCostModel::expectedCost(
59906022
Cost.second |= BlockCost.second;
59916023
}
59926024

6025+
// If we're using tail-folding then we should add the cost of generating the
6026+
// mask.
6027+
if (foldTailByMasking()) {
6028+
InstructionCost MaskCost = getTailFoldMaskCost(VF);
6029+
LLVM_DEBUG(dbgs() << "LV: Adding cost of generating tail-fold mask for VF "
6030+
<< VF << ": " << MaskCost << '\n');
6031+
Cost.first += MaskCost;
6032+
}
6033+
59936034
return Cost;
59946035
}
59956036

llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue <%s | FileCheck %s
1+
; REQUIRES: asserts
2+
; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
3+
; RUN: -debug-only=loop-vectorize <%s 2>%t | FileCheck %s
4+
; RUN: cat %t | FileCheck %s --check-prefix=COST
25

36
target triple = "aarch64-unknown-linux-gnu"
47

@@ -32,4 +35,29 @@ for.end: ; preds = %for.body
3235
ret i32 0
3336
}
3437

38+
39+
; COST: LV: Checking a loop in 'simple_memset'
40+
; COST: LV: Adding cost of generating tail-fold mask for VF 1: 0
41+
; COST: LV: Adding cost of generating tail-fold mask for VF 2: 4
42+
; COST: LV: Adding cost of generating tail-fold mask for VF 4: 8
43+
; COST: LV: Adding cost of generating tail-fold mask for VF vscale x 1: 4
44+
; COST: LV: Adding cost of generating tail-fold mask for VF vscale x 2: 1
45+
; COST: LV: Adding cost of generating tail-fold mask for VF vscale x 4: 1
46+
47+
define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
48+
entry:
49+
br label %while.body
50+
51+
while.body: ; preds = %while.body, %entry
52+
%index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
53+
%gep = getelementptr i32, ptr %ptr, i64 %index
54+
store i32 %val, ptr %gep
55+
%index.next = add nsw i64 %index, 1
56+
%cmp10 = icmp ult i64 %index.next, %n
57+
br i1 %cmp10, label %while.body, label %while.end.loopexit
58+
59+
while.end.loopexit: ; preds = %while.body
60+
ret void
61+
}
62+
3563
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }

llvm/test/Transforms/LoopVectorize/ARM/scalar-block-cost.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ define void @pred_loop(ptr %off, ptr %data, ptr %dst, i32 %n) #0 {
1515
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add1, ptr %arrayidx2, align 4
1616
; CHECK-COST-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %exitcond.not = icmp eq i32 %add, %n
1717
; CHECK-COST-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: br i1 %exitcond.not, label %exit.loopexit, label %for.body
18+
; CHECK-COST-NEXT: LV: Adding cost of generating tail-fold mask for VF 1: 0
1819
; CHECK-COST-NEXT: LV: Scalar loop costs: 5.
1920

2021
entry:
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
; REQUIRES: asserts
2+
; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
3+
; RUN: -mtriple riscv64-linux-gnu -mattr=+v,+f -S -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s
4+
5+
; CHECK: LV: Adding cost of generating tail-fold mask for VF 1: 0
6+
; CHECK: LV: Adding cost of generating tail-fold mask for VF 2: 2
7+
; CHECK: LV: Adding cost of generating tail-fold mask for VF 4: 4
8+
; CHECK: LV: Adding cost of generating tail-fold mask for VF 8: 8
9+
; CHECK: LV: Adding cost of generating tail-fold mask for VF vscale x 1: 2
10+
; CHECK: LV: Adding cost of generating tail-fold mask for VF vscale x 2: 4
11+
; CHECK: LV: Adding cost of generating tail-fold mask for VF vscale x 4: 8
12+
13+
define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
14+
entry:
15+
br label %while.body
16+
17+
while.body: ; preds = %while.body, %entry
18+
%index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
19+
%gep = getelementptr i32, ptr %ptr, i64 %index
20+
store i32 %val, ptr %gep
21+
%index.next = add nsw i64 %index, 1
22+
%cmp10 = icmp ult i64 %index.next, %n
23+
br i1 %cmp10, label %while.body, label %while.end.loopexit
24+
25+
while.end.loopexit: ; preds = %while.body
26+
ret void
27+
}

llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll

Lines changed: 14 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -497,49 +497,21 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
497497
;
498498
; TF-FIXEDLEN-LABEL: @conditional_uniform_load(
499499
; TF-FIXEDLEN-NEXT: entry:
500-
; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
501-
; TF-FIXEDLEN: vector.ph:
502-
; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0
503-
; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
504-
; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]]
505-
; TF-FIXEDLEN: vector.body:
506-
; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
507-
; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
508-
; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
509-
; TF-FIXEDLEN-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 1025)
510-
; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], <i64 10, i64 10, i64 10, i64 10>
511-
; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
512-
; TF-FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison)
513-
; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
514-
; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
515-
; TF-FIXEDLEN-NEXT: [[TMP6:%.*]] = or <4 x i1> [[TMP2]], [[TMP4]]
516-
; TF-FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer
517-
; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
518-
; TF-FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
519-
; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP7]], i32 8, <4 x i1> [[TMP6]])
520-
; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
521-
; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
522-
; TF-FIXEDLEN-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
523-
; TF-FIXEDLEN-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
524-
; TF-FIXEDLEN: middle.block:
525-
; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
526-
; TF-FIXEDLEN: scalar.ph:
527-
; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1028, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
528500
; TF-FIXEDLEN-NEXT: br label [[FOR_BODY:%.*]]
529501
; TF-FIXEDLEN: for.body:
530-
; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
502+
; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
531503
; TF-FIXEDLEN-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
532504
; TF-FIXEDLEN-NEXT: br i1 [[CMP]], label [[DO_LOAD:%.*]], label [[LATCH]]
533505
; TF-FIXEDLEN: do_load:
534-
; TF-FIXEDLEN-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
506+
; TF-FIXEDLEN-NEXT: [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
535507
; TF-FIXEDLEN-NEXT: br label [[LATCH]]
536508
; TF-FIXEDLEN: latch:
537509
; TF-FIXEDLEN-NEXT: [[PHI:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[V]], [[DO_LOAD]] ]
538-
; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
510+
; TF-FIXEDLEN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
539511
; TF-FIXEDLEN-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
540512
; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
541513
; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
542-
; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
514+
; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
543515
; TF-FIXEDLEN: for.end:
544516
; TF-FIXEDLEN-NEXT: ret void
545517
;
@@ -708,7 +680,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
708680
; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
709681
; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
710682
; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
711-
; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
683+
; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
712684
; TF-FIXEDLEN: middle.block:
713685
; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
714686
; TF-FIXEDLEN: scalar.ph:
@@ -721,7 +693,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
721693
; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
722694
; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
723695
; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
724-
; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
696+
; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
725697
; TF-FIXEDLEN: for.end:
726698
; TF-FIXEDLEN-NEXT: ret void
727699
;
@@ -883,7 +855,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
883855
; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
884856
; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
885857
; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
886-
; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
858+
; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
887859
; TF-FIXEDLEN: middle.block:
888860
; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
889861
; TF-FIXEDLEN: scalar.ph:
@@ -896,7 +868,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
896868
; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
897869
; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
898870
; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
899-
; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
871+
; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
900872
; TF-FIXEDLEN: for.end:
901873
; TF-FIXEDLEN-NEXT: ret void
902874
;
@@ -1114,7 +1086,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
11141086
; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
11151087
; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
11161088
; TF-FIXEDLEN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
1117-
; TF-FIXEDLEN-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
1089+
; TF-FIXEDLEN-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
11181090
; TF-FIXEDLEN: middle.block:
11191091
; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
11201092
; TF-FIXEDLEN: scalar.ph:
@@ -1127,7 +1099,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
11271099
; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
11281100
; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
11291101
; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
1130-
; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
1102+
; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
11311103
; TF-FIXEDLEN: for.end:
11321104
; TF-FIXEDLEN-NEXT: ret void
11331105
;
@@ -1353,7 +1325,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
13531325
; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
13541326
; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
13551327
; TF-FIXEDLEN-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
1356-
; TF-FIXEDLEN-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
1328+
; TF-FIXEDLEN-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
13571329
; TF-FIXEDLEN: middle.block:
13581330
; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
13591331
; TF-FIXEDLEN: scalar.ph:
@@ -1371,7 +1343,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
13711343
; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
13721344
; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
13731345
; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
1374-
; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
1346+
; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
13751347
; TF-FIXEDLEN: for.end:
13761348
; TF-FIXEDLEN-NEXT: ret void
13771349
;
@@ -1539,7 +1511,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
15391511
; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
15401512
; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
15411513
; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
1542-
; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
1514+
; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
15431515
; TF-FIXEDLEN: middle.block:
15441516
; TF-FIXEDLEN-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
15451517
; TF-FIXEDLEN: scalar.ph:
@@ -1552,7 +1524,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
15521524
; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
15531525
; TF-FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
15541526
; TF-FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
1555-
; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
1527+
; TF-FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
15561528
; TF-FIXEDLEN: for.end:
15571529
; TF-FIXEDLEN-NEXT: ret void
15581530
;

llvm/test/Transforms/LoopVectorize/X86/pr81872.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2-
; RUN: opt -S -passes=loop-vectorize < %s | FileCheck %s
2+
; RUN: opt -S -passes=loop-vectorize -force-vector-width=4 < %s | FileCheck %s
33
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
44
target triple = "x86_64-unknown-linux-gnu"
55

@@ -16,10 +16,10 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
1616
; CHECK-NEXT: bb5:
1717
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
1818
; CHECK: vector.ph:
19-
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
19+
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
2020
; CHECK: vector.body:
21-
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
22-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 99, i64 98, i64 97, i64 96>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
21+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_HEADER]] ]
22+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 99, i64 98, i64 97, i64 96>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP_HEADER]] ]
2323
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 99, [[INDEX]]
2424
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
2525
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
@@ -38,12 +38,12 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
3838
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
3939
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
4040
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
41-
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
41+
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
4242
; CHECK: middle.block:
4343
; CHECK-NEXT: br i1 true, label [[BB6:%.*]], label [[SCALAR_PH]]
4444
; CHECK: scalar.ph:
4545
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 87, [[MIDDLE_BLOCK]] ], [ 99, [[BB5:%.*]] ]
46-
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
46+
; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]]
4747
; CHECK: loop.header:
4848
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
4949
; CHECK-NEXT: [[AND:%.*]] = and i64 [[IV]], 1
@@ -57,7 +57,7 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
5757
; CHECK: loop.latch:
5858
; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
5959
; CHECK-NEXT: [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 90
60-
; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[LOOP_HEADER]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
60+
; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[LOOP_HEADER1]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
6161
; CHECK: bb6:
6262
; CHECK-NEXT: ret void
6363
;

0 commit comments

Comments
 (0)