Skip to content

Commit 6716e78

Browse files
author
Sjoerd Meijer
committed
[ARM][MVE] tail-predication: overflow checks for backedge taken count.
This picks up the work on the overflow checks for get.active.lane.mask, which ensure that it is safe to insert the VCTP intrinsic that enables tail-predication. For a 2d auto-correlation kernel and its inner loop j: M = Size - i; for (j = 0; j < M; j++) Sum += Input[j] * Input[j+i]; For this inner loop, the SCEV backedge taken count (BTC) expression is: {(-1 + (sext i16 %Size to i32)),+,-1}<nw><%for.body> and LoopUtil cannotBeMaxInLoop couldn't calculate a bound on this, thus "BTC cannot be max" could not be determined. So overflow behaviour had to be assumed in the loop tripcount expression that uses the BTC. As a result tail-predication had to be forced (with an option) for this case. This change solves that by using ScalarEvolution's helper getConstantMaxBackedgeTakenCount which is able to determine the range of BTC, thus can determine it is safe, so that we no longer need to force tail-predication as reflected in the changed test cases. Differential Revision: https://reviews.llvm.org/D85737
1 parent ac37afa commit 6716e78

File tree

3 files changed

+88
-270
lines changed

3 files changed

+88
-270
lines changed

llvm/lib/Target/ARM/MVETailPredication.cpp

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -362,20 +362,27 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
362362
bool ForceTailPredication =
363363
EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
364364
EnableTailPredication == TailPredication::ForceEnabled;
365+
365366
// 1) Test whether entry to the loop is protected by a conditional
366367
// BTC + 1 < 0. In other words, if the scalar trip count overflows,
367368
// becomes negative, we shouldn't enter the loop and creating
368369
// tripcount expression BTC + 1 is not safe. So, check that BTC
369370
// isn't max. This is evaluated in unsigned, because the semantics
370371
// of @get.active.lane.mask is a ULE comparison.
371-
372-
int VectorWidth = VecTy->getNumElements();
373372
auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
374373
auto *BTC = SE->getSCEV(BackedgeTakenCount);
374+
auto *MaxBTC = SE->getConstantMaxBackedgeTakenCount(L);
375+
376+
if (isa<SCEVCouldNotCompute>(MaxBTC)) {
377+
LLVM_DEBUG(dbgs() << "ARM TP: Can't compute SCEV BTC expression: ";
378+
BTC->dump());
379+
return false;
380+
}
375381

376-
if (!llvm::cannotBeMaxInLoop(BTC, L, *SE, false /*Signed*/) &&
382+
APInt MaxInt = APInt(BTC->getType()->getScalarSizeInBits(), ~0);
383+
if (cast<SCEVConstant>(MaxBTC)->getAPInt().eq(MaxInt) &&
377384
!ForceTailPredication) {
378-
LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be max: ";
385+
LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be int max: ";
379386
BTC->dump());
380387
return false;
381388
}
@@ -397,14 +404,15 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
397404
//
398405
auto *TC = SE->getSCEV(TripCount);
399406
unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
407+
int VectorWidth = VecTy->getNumElements();
400408
auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
401409
uint64_t MaxMinusVW = Diff.getZExtValue();
402410
uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
403411

404412
if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
405413
LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
406414
dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
407-
dbgs() << UpperboundTC << " <= " << MaxMinusVW << "== false\n";);
415+
dbgs() << UpperboundTC << " <= " << MaxMinusVW << " == false\n";);
408416
return false;
409417
}
410418

@@ -453,10 +461,10 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
453461
return false;
454462
}
455463

456-
// 3) Find out if IV is an induction phi. Note that We can't use Loop
464+
// 3) Find out if IV is an induction phi. Note that we can't use Loop
457465
// helpers here to get the induction variable, because the hardware loop is
458-
// no longer in loopsimplify form, and also the hwloop intrinsic use a
459-
// different counter. Using SCEV, we check that the induction is of the
466+
// no longer in loopsimplify form, and also the hwloop intrinsic uses a
467+
// different counter. Using SCEV, we check that the induction is of the
460468
// form i = i + 4, where the increment must be equal to the VectorWidth.
461469
auto *IV = ActiveLaneMask->getOperand(0);
462470
auto *IVExpr = SE->getSCEV(IV);

llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll

Lines changed: 8 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s
2-
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=force-enabled \
3-
; RUN: -mattr=+mve %s -S -o - | FileCheck %s --check-prefix=FORCE
42

53
; CHECK-LABEL: reduction_i32
64
; CHECK: phi i32 [ 0, %vector.ph ]
@@ -136,16 +134,15 @@ for.cond.cleanup:
136134
ret i16 %res.0
137135
}
138136

139-
; The vector loop is not guarded with an entry check (N == 0).
140-
; This means we can't calculate a precise range for the backedge count in
141-
; @llvm.get.active.lane.mask, and are assuming overflow can happen and thus
142-
; we can't insert the VCTP here.
137+
; The vector loop is not guarded with an entry check (N == 0). Check that
138+
; despite this we can still calculate a precise enough range for the
139+
; backedge count to safely insert a vctp here.
143140
;
144141
; CHECK-LABEL: @reduction_not_guarded
145142
;
146143
; CHECK: vector.body:
147-
; CHECK-NOT: @llvm.arm.mve.vctp
148-
; CHECK: @llvm.get.active.lane.mask.v8i1.i32
144+
; CHECK: @llvm.arm.mve.vctp
145+
; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32
149146
; CHECK: ret
150147
;
151148
define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
@@ -196,24 +193,10 @@ middle.block: ; preds = %vector.body
196193
ret i16 %tmp9
197194
}
198195

199-
; Without forcing tail-predication, we bail because overflow analysis says:
200-
;
201-
; overflow possible in: {(-1 + (sext i16 %Size to i32)),+,-1}<nw><%for.body>
202-
;
203196
; CHECK-LABEL: @Correlation
204-
;
205-
; CHECK: vector.body:
206-
; CHECK-NOT: @llvm.arm.mve.vctp
207-
; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
208-
;
209-
; FORCE-LABEL: @Correlation
210-
; FORCE: vector.ph: ; preds = %for.body
211-
; FORCE: %trip.count.minus.1 = add i32 %{{.*}}, -1
212-
; FORCE: call void @llvm.set.loop.iterations.i32(i32 %{{.*}})
213-
; FORCE: br label %vector.body
214-
; FORCE: vector.body: ; preds = %vector.body, %vector.ph
215-
; FORCE: %[[VCTP:.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 %{{.*}})
216-
; FORCE: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[VCTP]]{{.*}}
197+
; CHECK: vector.body:
198+
; CHECK: @llvm.arm.mve.vctp
199+
; CHECK-NOT: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask
217200
;
218201
define dso_local void @Correlation(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 {
219202
entry:

0 commit comments

Comments
 (0)