Revert "[LoopVectorize] Add support for reverse loops in isDereferenceableAndAlignedInLoop (#96752)" #123057
Conversation
…eableAndAlignedInLoop (llvm#96752)"

This reverts commit bfedf64.
@llvm/pr-subscribers-llvm-transforms

Author: David Sherwood (david-arm)

Changes

This reverts commit bfedf64.

Patch is 39.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123057.diff

5 Files Affected:
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 6fc6ca14d08895..31374a128856c7 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -853,25 +853,6 @@ bool sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, const DataLayout &DL,
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
ScalarEvolution &SE, bool CheckType = true);
-/// Calculate Start and End points of memory access.
-/// Let's assume A is the first access and B is a memory access on N-th loop
-/// iteration. Then B is calculated as:
-/// B = A + Step*N .
-/// Step value may be positive or negative.
-/// N is a calculated back-edge taken count:
-/// N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
-/// Start and End points are calculated in the following way:
-/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
-/// where SizeOfElt is the size of single memory access in bytes.
-///
-/// There is no conflict when the intervals are disjoint:
-/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
-std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
- const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
- ScalarEvolution *SE,
- DenseMap<std::pair<const SCEV *, Type *>,
- std::pair<const SCEV *, const SCEV *>> *PointerBounds);
-
class LoopAccessInfoManager {
/// The cache.
DenseMap<Loop *, std::unique_ptr<LoopAccessInfo>> LoopAccessInfoMap;
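For context, the interval computation described in the removed getStartAndEndForAccess comment can be illustrated with plain integers. This is a minimal standalone sketch assuming flat byte addresses and no overflow; the real implementation operates on SCEV expressions, and all names below are illustrative:

#include <algorithm>
#include <cstdint>

// Illustrative stand-in for the documented formulas:
//   B = A + Step*N;  Start = UMIN(A, B);  End = UMAX(A, B) + SizeOfElt
struct AccessBounds {
  uint64_t Start, End; // half-open byte interval [Start, End)
};

AccessBounds boundsForAccess(uint64_t A, int64_t Step, uint64_t N,
                             uint64_t SizeOfElt) {
  // B is the address touched on the N-th iteration; Step may be negative
  // for reverse loops, which is why Start/End take the min/max of A and B.
  uint64_t B = A + static_cast<uint64_t>(Step) * N;
  return {std::min(A, B), std::max(A, B) + SizeOfElt};
}

bool noConflict(const AccessBounds &P1, const AccessBounds &P2) {
  // Two accesses cannot conflict when their intervals are disjoint:
  //   NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
  return P2.Start >= P1.End || P1.Start >= P2.End;
}

For example, A = 100, Step = -4, N = 3, SizeOfElt = 4 yields the interval [88, 104), illustrating the reverse-loop case the reverted patch handled.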
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index cc6760292c2ff1..7bbd469bd035d3 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -13,7 +13,6 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumeBundleQueries.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -276,88 +275,84 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
bool llvm::isDereferenceableAndAlignedInLoop(
LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT,
AssumptionCache *AC, SmallVectorImpl<const SCEVPredicate *> *Predicates) {
- const Align Alignment = LI->getAlign();
auto &DL = LI->getDataLayout();
Value *Ptr = LI->getPointerOperand();
+
APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
DL.getTypeStoreSize(LI->getType()).getFixedValue());
+ const Align Alignment = LI->getAlign();
+
+ Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
// If given a uniform (i.e. non-varying) address, see if we can prove the
// access is safe within the loop w/o needing predication.
if (L->isLoopInvariant(Ptr))
- return isDereferenceableAndAlignedPointer(
- Ptr, Alignment, EltSize, DL, L->getHeader()->getFirstNonPHI(), AC, &DT);
-
- const SCEV *PtrScev = SE.getSCEV(Ptr);
- auto *AddRec = dyn_cast<SCEVAddRecExpr>(PtrScev);
+ return isDereferenceableAndAlignedPointer(Ptr, Alignment, EltSize, DL,
+ HeaderFirstNonPHI, AC, &DT);
- // Check to see if we have a repeating access pattern and it's possible
- // to prove all accesses are well aligned.
+ // Otherwise, check to see if we have a repeating access pattern where we can
+ // prove that all accesses are well aligned and dereferenceable.
+ auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Ptr));
if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine())
return false;
-
auto* Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
if (!Step)
return false;
- // For the moment, restrict ourselves to the case where the access size is a
- // multiple of the requested alignment and the base is aligned.
- // TODO: generalize if a case found which warrants
- if (EltSize.urem(Alignment.value()) != 0)
+ auto TC = SE.getSmallConstantMaxTripCount(L, Predicates);
+ if (!TC)
return false;
// TODO: Handle overlapping accesses.
- if (EltSize.ugt(Step->getAPInt().abs()))
- return false;
-
- const SCEV *MaxBECount =
- SE.getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates);
- if (isa<SCEVCouldNotCompute>(MaxBECount))
- return false;
-
- const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess(
- L, PtrScev, LI->getType(), MaxBECount, &SE, nullptr);
- if (isa<SCEVCouldNotCompute>(AccessStart) ||
- isa<SCEVCouldNotCompute>(AccessEnd))
+ // We should be computing AccessSize as (TC - 1) * Step + EltSize.
+ if (EltSize.sgt(Step->getAPInt()))
return false;
- // Try to get the access size.
- const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart);
- APInt MaxPtrDiff = SE.getUnsignedRangeMax(PtrDiff);
+ // Compute the total access size for access patterns with unit stride and
+ // patterns with gaps. For patterns with unit stride, Step and EltSize are the
+ // same.
+ // For patterns with gaps (i.e. non unit stride), we are
+ // accessing EltSize bytes at every Step.
+ APInt AccessSize = TC * Step->getAPInt();
+ assert(SE.isLoopInvariant(AddRec->getStart(), L) &&
+ "implied by addrec definition");
Value *Base = nullptr;
- APInt AccessSize;
- if (const SCEVUnknown *NewBase = dyn_cast<SCEVUnknown>(AccessStart)) {
- Base = NewBase->getValue();
- AccessSize = MaxPtrDiff;
- } else if (auto *MinAdd = dyn_cast<SCEVAddExpr>(AccessStart)) {
- if (MinAdd->getNumOperands() != 2)
- return false;
-
- const auto *Offset = dyn_cast<SCEVConstant>(MinAdd->getOperand(0));
- const auto *NewBase = dyn_cast<SCEVUnknown>(MinAdd->getOperand(1));
- if (!Offset || !NewBase)
- return false;
-
- // The following code below assumes the offset is unsigned, but GEP
- // offsets are treated as signed so we can end up with a signed value
- // here too. For example, suppose the initial PHI value is (i8 255),
- // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
- if (Offset->getAPInt().isNegative())
- return false;
+ if (auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart())) {
+ Base = StartS->getValue();
+ } else if (auto *StartS = dyn_cast<SCEVAddExpr>(AddRec->getStart())) {
+ // Handle (NewBase + offset) as start value.
+ const auto *Offset = dyn_cast<SCEVConstant>(StartS->getOperand(0));
+ const auto *NewBase = dyn_cast<SCEVUnknown>(StartS->getOperand(1));
+ if (StartS->getNumOperands() == 2 && Offset && NewBase) {
+ // The following code below assumes the offset is unsigned, but GEP
+ // offsets are treated as signed so we can end up with a signed value
+ // here too. For example, suppose the initial PHI value is (i8 255),
+ // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
+ if (Offset->getAPInt().isNegative())
+ return false;
- // For the moment, restrict ourselves to the case where the offset is a
- // multiple of the requested alignment and the base is aligned.
- // TODO: generalize if a case found which warrants
- if (Offset->getAPInt().urem(Alignment.value()) != 0)
- return false;
+ // For the moment, restrict ourselves to the case where the offset is a
+ // multiple of the requested alignment and the base is aligned.
+ // TODO: generalize if a case found which warrants
+ if (Offset->getAPInt().urem(Alignment.value()) != 0)
+ return false;
+ Base = NewBase->getValue();
+ bool Overflow = false;
+ AccessSize = AccessSize.uadd_ov(Offset->getAPInt(), Overflow);
+ if (Overflow)
+ return false;
+ }
+ }
- AccessSize = MaxPtrDiff + Offset->getAPInt();
- Base = NewBase->getValue();
- } else
+ if (!Base)
return false;
- Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
+ // For the moment, restrict ourselves to the case where the access size is a
+ // multiple of the requested alignment and the base is aligned.
+ // TODO: generalize if a case found which warrants
+ if (EltSize.urem(Alignment.value()) != 0)
+ return false;
return isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL,
HeaderFirstNonPHI, AC, &DT);
}
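To make the restored control flow in isDereferenceableAndAlignedInLoop easier to follow, here is a rough plain-integer model of its access-size computation. The helper and its names are hypothetical, APInt/SCEV are replaced by uint64_t, and TC * Step overflow is ignored for brevity. Note that TC * Step over-approximates the exact span (TC - 1) * Step + EltSize whenever Step >= EltSize, which the early bail-out guarantees:

#include <cstdint>
#include <optional>

// Hypothetical model: how many contiguous bytes from the base must be
// dereferenceable for TC strided accesses of EltSize bytes every Step bytes,
// starting Offset bytes past the base.
std::optional<uint64_t> accessSizeInBytes(uint64_t TC, uint64_t Step,
                                          uint64_t EltSize, uint64_t Offset,
                                          uint64_t Alignment) {
  if (TC == 0 || EltSize > Step)        // overlapping accesses are unhandled
    return std::nullopt;
  if (EltSize % Alignment != 0 ||       // restrict to cases where every
      Offset % Alignment != 0)          // access provably stays aligned
    return std::nullopt;
  uint64_t AccessSize = TC * Step;      // conservative span of all accesses
  if (AccessSize + Offset < AccessSize) // mirror the uadd_ov overflow check
    return std::nullopt;
  return AccessSize + Offset;
}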
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 11e0a221fc8878..2a68979add666d 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -190,20 +190,31 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
Members.push_back(Index);
}
-std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
- const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
- ScalarEvolution *SE,
+/// Calculate Start and End points of memory access.
+/// Let's assume A is the first access and B is a memory access on N-th loop
+/// iteration. Then B is calculated as:
+/// B = A + Step*N .
+/// Step value may be positive or negative.
+/// N is a calculated back-edge taken count:
+/// N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
+/// Start and End points are calculated in the following way:
+/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
+/// where SizeOfElt is the size of single memory access in bytes.
+///
+/// There is no conflict when the intervals are disjoint:
+/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
+static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
+ const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy,
+ PredicatedScalarEvolution &PSE,
DenseMap<std::pair<const SCEV *, Type *>,
- std::pair<const SCEV *, const SCEV *>> *PointerBounds) {
- std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
- if (PointerBounds) {
- auto [Iter, Ins] = PointerBounds->insert(
- {{PtrExpr, AccessTy},
- {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
- if (!Ins)
- return Iter->second;
- PtrBoundsPair = &Iter->second;
- }
+ std::pair<const SCEV *, const SCEV *>> &PointerBounds) {
+ ScalarEvolution *SE = PSE.getSE();
+
+ auto [Iter, Ins] = PointerBounds.insert(
+ {{PtrExpr, AccessTy},
+ {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
+ if (!Ins)
+ return Iter->second;
const SCEV *ScStart;
const SCEV *ScEnd;
@@ -211,8 +222,10 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
if (SE->isLoopInvariant(PtrExpr, Lp)) {
ScStart = ScEnd = PtrExpr;
} else if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr)) {
+ const SCEV *Ex = PSE.getSymbolicMaxBackedgeTakenCount();
+
ScStart = AR->getStart();
- ScEnd = AR->evaluateAtIteration(MaxBECount, *SE);
+ ScEnd = AR->evaluateAtIteration(Ex, *SE);
const SCEV *Step = AR->getStepRecurrence(*SE);
// For expressions with negative step, the upper bound is ScStart and the
@@ -231,7 +244,7 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};
assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
- assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
+ assert(SE->isLoopInvariant(ScEnd, Lp)&& "ScEnd needs to be invariant");
// Add the size of the pointed element to ScEnd.
auto &DL = Lp->getHeader()->getDataLayout();
@@ -239,10 +252,8 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
- std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
- if (PointerBounds)
- *PtrBoundsPair = Res;
- return Res;
+ Iter->second = {ScStart, ScEnd};
+ return Iter->second;
}
/// Calculate Start and End points of memory access using
@@ -252,9 +263,8 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
unsigned DepSetId, unsigned ASId,
PredicatedScalarEvolution &PSE,
bool NeedsFreeze) {
- const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
- Lp, PtrExpr, AccessTy, MaxBECount, PSE.getSE(), &DC.getPointerBounds());
+ Lp, PtrExpr, AccessTy, PSE, DC.getPointerBounds());
assert(!isa<SCEVCouldNotCompute>(ScStart) &&
!isa<SCEVCouldNotCompute>(ScEnd) &&
"must be able to compute both start and end expressions");
@@ -1928,11 +1938,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
// required for correctness.
if (SE.isLoopInvariant(Src, InnermostLoop) ||
SE.isLoopInvariant(Sink, InnermostLoop)) {
- const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
- const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess(
- InnermostLoop, Src, ATy, MaxBECount, PSE.getSE(), &PointerBounds);
- const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess(
- InnermostLoop, Sink, BTy, MaxBECount, PSE.getSE(), &PointerBounds);
+ const auto &[SrcStart_, SrcEnd_] =
+ getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE, PointerBounds);
+ const auto &[SinkStart_, SinkEnd_] =
+ getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE, PointerBounds);
if (!isa<SCEVCouldNotCompute>(SrcStart_) &&
!isa<SCEVCouldNotCompute>(SrcEnd_) &&
!isa<SCEVCouldNotCompute>(SinkStart_) &&
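The LoopAccessAnalysis.cpp hunk also restores an insert-then-fill memoization idiom for the PointerBounds cache: reserve a slot keyed on (pointer, type) with a could-not-compute sentinel, return early on a cache hit, and overwrite the slot once the bounds are computed. A minimal sketch of the same idiom with standard containers (illustrative names, not LLVM types):

#include <map>
#include <string>
#include <utility>

using Bounds = std::pair<long, long>;
const Bounds CouldNotCompute{-1, -1};

Bounds getBounds(const std::string &Key,
                 std::map<std::string, Bounds> &Cache) {
  // insert() reserves the slot and reports whether the key was new.
  auto [Iter, Inserted] = Cache.insert({Key, CouldNotCompute});
  if (!Inserted)
    return Iter->second;      // cache hit, possibly the sentinel
  Bounds Result = {0, 64};    // stand-in for the expensive computation
  Iter->second = Result;      // fill the reserved slot
  return Iter->second;
}

One benefit of reserving the slot before computing is that a recursive query for the same key terminates with the sentinel instead of recomputing.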
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index 3e50ee42866b9b..1433e48690bc60 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -2920,8 +2920,8 @@ loop_exit:
ret i32 %accum.next
}
-define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
-; CHECK-LABEL: @test_non_unit_stride_off_by_four_bytes(
+define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
+; CHECK-LABEL: @neg_test_non_unit_stride_off_by_four_bytes(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [103 x i32], align 4
; CHECK-NEXT: call void @init(ptr [[ALLOCA]])
@@ -2929,11 +2929,11 @@ define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP112:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP113:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP114:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP115:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP144:%.*]], [[PRED_LOAD_CONTINUE33]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP145:%.*]], [[PRED_LOAD_CONTINUE33]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP146:%.*]], [[PRED_LOAD_CONTINUE33]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP147:%.*]], [[PRED_LOAD_CONTINUE33]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
@@ -2999,74 +2999,170 @@ define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP64]], align 4
-; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4
-; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4
-; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4
-; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0
-; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1
-; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2
-; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3
-; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4
-; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4
-; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4
-; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4
-; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0
-; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1
-; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2
-; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3
-; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4
-; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4
-; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4
-; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4
-; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0
-; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1
-; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2
-; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3
-; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4
-; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4
-; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4
-; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP79]], align 4
-; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0
-; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1
-; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2
-; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP110]], i32 [[TMP107]], i32 3
-; CHECK-NEXT: ...
[truncated]
@llvm/pr-subscribers-llvm-analysis

Author: David Sherwood (david-arm)

Changes

This reverts commit bfedf64. (The bot posted the identical patch summary and truncated diff as in the llvm-transforms comment above.)
It would be good to include a reason for the revert in the commit message in future.
You can test this locally with the following command:

git-clang-format --diff d1314d0152f242c618caafce264fccbc47273d84 6c7f61f885832eecdd6bd95cc10ed188f958ca9d --extensions cpp,h -- llvm/include/llvm/Analysis/LoopAccessAnalysis.h llvm/lib/Analysis/Loads.cpp llvm/lib/Analysis/LoopAccessAnalysis.cpp

View the diff from clang-format here:

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 2a68979add..6cc3549a24 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -244,7 +244,7 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};
assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
- assert(SE->isLoopInvariant(ScEnd, Lp)&& "ScEnd needs to be invariant");
+ assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
// Add the size of the pointed element to ScEnd.
auto &DL = Lp->getHeader()->getDataLayout();
Yep, good point. I guess I was just panicking as I'd broken the build and wanted to get a fix up ASAP, but you're right, in future I'll make sure to do that. 👍
This reverts commit bfedf64.