[ARM][MVE] tail-predication: overflow checks for backedge taken count.

Sjoerd Meijer · Sjoerd Meijer · commit 6716e7868ec3 · 2020-08-12T09:32:26.000+01:00
This picks up the work on the overflow checks for get.active.lane.mask, which ensure that it is safe to insert the VCTP intrinsic that enables tail-predication. For a 2d auto-correlation kernel and its inner loop j: M = Size - i; for (j = 0; j < M; j++) Sum += Input[j] * Input[j+i]; For this inner loop, the SCEV backedge taken count (BTC) expression is: {(-1 + (sext i16 %Size to i32)),+,-1}<nw><%for.body> and LoopUtil cannotBeMaxInLoop couldn't calculate a bound on this, thus "BTC cannot be max" could not be determined. So overflow behaviour had to be assumed in the loop tripcount expression that uses the BTC. As a result tail-predication had to be forced (with an option) for this case. This change solves that by using ScalarEvolution's helper getConstantMaxBackedgeTakenCount which is able to determine the range of BTC, thus can determine it is safe, so that we no longer need to force tail-predication as reflected in the changed test cases. Differential Revision: https://reviews.llvm.org/D85737
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -362,20 +362,27 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
   bool ForceTailPredication =
     EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
     EnableTailPredication == TailPredication::ForceEnabled;
+
   // 1) Test whether entry to the loop is protected by a conditional
   // BTC + 1 < 0. In other words, if the scalar trip count overflows,
   // becomes negative, we shouldn't enter the loop and creating
   // tripcount expression BTC + 1 is not safe. So, check that BTC
   // isn't max. This is evaluated in unsigned, because the semantics
   // of @get.active.lane.mask is a ULE comparison.
-
-  int VectorWidth = VecTy->getNumElements();
   auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
   auto *BTC = SE->getSCEV(BackedgeTakenCount);
+  auto *MaxBTC = SE->getConstantMaxBackedgeTakenCount(L);
+
+  if (isa<SCEVCouldNotCompute>(MaxBTC)) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Can't compute SCEV BTC expression: ";
+               BTC->dump());
+    return false;
+  }
 
-  if (!llvm::cannotBeMaxInLoop(BTC, L, *SE, false /*Signed*/) &&
+  APInt MaxInt = APInt(BTC->getType()->getScalarSizeInBits(), ~0);
+  if (cast<SCEVConstant>(MaxBTC)->getAPInt().eq(MaxInt) &&
       !ForceTailPredication) {
-    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be max: ";
+    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be int max: ";
                BTC->dump());
     return false;
   }
@@ -397,14 +404,15 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
   //
   auto *TC = SE->getSCEV(TripCount);
   unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
+  int VectorWidth = VecTy->getNumElements();
   auto Diff =  APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
   uint64_t MaxMinusVW = Diff.getZExtValue();
   uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
 
   if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
     LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
                dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
-               dbgs() << UpperboundTC << " <= " << MaxMinusVW << "== false\n";);
+               dbgs() << UpperboundTC << " <= " << MaxMinusVW << " == false\n";);
     return false;
   }
 
@@ -453,10 +461,10 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
     return false;
   }
 
-  // 3) Find out if IV is an induction phi. Note that We can't use Loop
+  // 3) Find out if IV is an induction phi. Note that we can't use Loop
   // helpers here to get the induction variable, because the hardware loop is
-  // no longer in loopsimplify form, and also the hwloop intrinsic use a
-  // different counter.  Using SCEV, we check that the induction is of the
+  // no longer in loopsimplify form, and also the hwloop intrinsic uses a
+  // different counter. Using SCEV, we check that the induction is of the
   // form i = i + 4, where the increment must be equal to the VectorWidth.
   auto *IV = ActiveLaneMask->getOperand(0);
   auto *IVExpr = SE->getSCEV(IV);
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
@@ -1,6 +1,4 @@
 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s
-; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=force-enabled \
-; RUN:    -mattr=+mve %s -S -o - | FileCheck %s --check-prefix=FORCE
 
 ; CHECK-LABEL: reduction_i32
 ; CHECK: phi i32 [ 0, %vector.ph ]
@@ -136,16 +134,15 @@ for.cond.cleanup:
   ret i16 %res.0
 }
 
-; The vector loop is not guarded with an entry check (N == 0).
-; This means we can't calculate a precise range for the backedge count in
-; @llvm.get.active.lane.mask, and are assuming overflow can happen and thus
-; we can't insert the VCTP here.
+; The vector loop is not guarded with an entry check (N == 0). Check that
+; despite this we can still calculate a precise enough range for the
+; backedge count to safely insert a vctp here.
 ;
 ; CHECK-LABEL: @reduction_not_guarded
 ;
 ; CHECK:     vector.body:
-; CHECK-NOT: @llvm.arm.mve.vctp
-; CHECK:     @llvm.get.active.lane.mask.v8i1.i32
+; CHECK:     @llvm.arm.mve.vctp
+; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32
 ; CHECK:     ret
 ;
 define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
@@ -196,24 +193,10 @@ middle.block:                                     ; preds = %vector.body
   ret i16 %tmp9
 }
 
-; Without forcing tail-predication, we bail because overflow analysis says:
-;
-;   overflow possible in: {(-1 + (sext i16 %Size to i32)),+,-1}<nw><%for.body>
-;
 ; CHECK-LABEL: @Correlation
-;
-; CHECK: vector.body:
-; CHECK-NOT: @llvm.arm.mve.vctp
-; CHECK:     %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
-;
-; FORCE-LABEL: @Correlation
-; FORCE: vector.ph:                                        ; preds = %for.body
-; FORCE:   %trip.count.minus.1 = add i32 %{{.*}}, -1
-; FORCE:   call void @llvm.set.loop.iterations.i32(i32 %{{.*}})
-; FORCE:   br label %vector.body
-; FORCE: vector.body:                                      ; preds = %vector.body, %vector.ph
-; FORCE:   %[[VCTP:.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 %{{.*}})
-; FORCE:   call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[VCTP]]{{.*}}
+; CHECK:       vector.body:
+; CHECK:       @llvm.arm.mve.vctp
+; CHECK-NOT:   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask
 ;
 define dso_local void @Correlation(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 {
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll