Skip to content

Commit 166f376

Browse files
committed
Reduce cost only when not tail-folding
1 parent 15d21df commit 166f376

File tree

3 files changed

+44
-22
lines changed

3 files changed

+44
-22
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5581,7 +5581,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5581 5581   // away.
5582 5582   SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5583 5583   auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5584      - if (VF.isFixed() && TC == VF.getFixedValue())
     5584 + if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
5585 5585   addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
5586 5586   ValuesToIgnoreForVF);
5587 5587

@@ -7281,7 +7281,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
7281 7281   // TODO: Remove this code after stepping away from the legacy cost model and
7282 7282   // adding code to simplify VPlans before calculating their costs.
7283 7283   auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
7284      - if (VF.isFixed() && TC == VF.getFixedValue())
     7284 + if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
7285 7285   addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
7286 7286   CostCtx.SkipCostComputation);
7287 7287

llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll

Lines changed: 16 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -185,26 +185,34 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
185 185   ; CHECK-NEXT: entry:
186 186   ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
187 187   ; CHECK: vector.ph:
    188 + ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
    189 + ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
    190 + ; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
    191 + ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
    192 + ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
    193 + ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
    194 + ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
    195 + ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
188 196   ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
189 197   ; CHECK: vector.body:
190 198   ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
191 199   ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
192     - ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[TMP7]], i64 8)
    200 + ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 8)
193 201   ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
194 202   ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
195     - ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP9]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
196     - ; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
    203 + ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
    204 + ; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i64 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer)
197 205   ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
198 206   ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
199     - ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP12]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
200     - ; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_MASKED_LOAD1]]
201     - ; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP6]], ptr [[TMP12]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]])
202     - ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
    207 + ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
    208 + ; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
    209 + ; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
    210 + ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
203 211   ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
204 212   ; CHECK: middle.block:
205 213   ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
206 214   ; CHECK: scalar.ph:
207     - ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
    215 + ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
208 216   ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
209 217   ; CHECK: for.body:
210 218   ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]

llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll

Lines changed: 26 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -6,22 +6,28 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 6  6   ; CHECK-NEXT: entry:
 7  7   ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 8  8   ; CHECK: vector.ph:
    9 + ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
   10 + ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], 1
   11 + ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP1]]
   12 + ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP0]]
   13 + ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
   14 + ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 9 15   ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
10 16   ; CHECK: vector.body:
11 17   ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
12 18   ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
13    - ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP3]], i32 4)
   19 + ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4)
14 20   ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
15 21   ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
16    - ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
17    - ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
18    - ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
19    - ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
   22 + ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
   23 + ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 1, i64 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
   24 + ; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
   25 + ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP2]]
20 26   ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
21 27   ; CHECK: middle.block:
22 28   ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
23 29   ; CHECK: scalar.ph:
24    - ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
   30 + ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
25 31   ; CHECK-NEXT: br label [[LOOP:%.*]]
26 32   ; CHECK: loop:
27 33   ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -57,22 +63,30 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
57 63   ; CHECK-NEXT: entry:
58 64   ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
59 65   ; CHECK: vector.ph:
   66 + ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
   67 + ; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
   68 + ; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 1
   69 + ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP2]]
   70 + ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
   71 + ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
   72 + ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
   73 + ; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4
60 74   ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
61 75   ; CHECK: vector.body:
62 76   ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
63 77   ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
64    - ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP5]], i32 4)
   78 + ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4)
65 79   ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
66 80   ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
67    - ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
68    - ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
69    - ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP3]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
70    - ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
   81 + ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
   82 + ; CHECK-NEXT: [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
   83 + ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
   84 + ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
71 85   ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
72 86   ; CHECK: middle.block:
73 87   ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
74 88   ; CHECK: scalar.ph:
75    - ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
   89 + ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
76 90   ; CHECK-NEXT: br label [[LOOP:%.*]]
77 91   ; CHECK: loop:
78 92   ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]

0 commit comments

Comments
 (0)