[LAA] Be more careful when evaluating AddRecs at symbolic max BTC. #128061

@@ -188,9 +188,90 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
   Members.push_back(Index);
 }

/// Returns \p A + \p B, if it is guaranteed not to unsigned wrap. Otherwise
/// return nullptr. \p A and \p B must have the same type.
static const SCEV *addSCEVOverflow(const SCEV *A, const SCEV *B,
                                   ScalarEvolution &SE) {
  if (!SE.willNotOverflow(Instruction::Add, false, A, B))
    return nullptr;
  return SE.getAddExpr(A, B);
}

/// Returns \p A * \p B, if it is guaranteed not to unsigned wrap. Otherwise
/// return nullptr. \p A and \p B must have the same type.
static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B,
                                   ScalarEvolution &SE) {
  if (!SE.willNotOverflow(Instruction::Mul, false, A, B))

Review comment: Perhaps worth adding something in the comments for the type requirements?
Reply: Added a comment, thanks. They must have the same type for A * B to be valid.

    return nullptr;
  return SE.getMulExpr(A, B);
}
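
A minimal sketch of how the two helpers compose (mirroring their use in
evaluatePtrAddRecAtMaxBTCWillNotWrap below; the wrapper function itself is
hypothetical, not part of the patch): any step that might unsigned-wrap
collapses the whole computation to nullptr, so the caller needs only one
check per result.

  // Compute MaxBTC * Step + EltSize, or nullptr if any step may unsigned wrap.
  static const SCEV *computeEndOffset(const SCEV *MaxBTC, const SCEV *Step,
                                      const SCEV *EltSize,
                                      ScalarEvolution &SE) {
    const SCEV *Bytes = mulSCEVOverflow(MaxBTC, Step, SE);
    if (!Bytes)
      return nullptr; // MaxBTC * Step may wrap.
    return addSCEVOverflow(Bytes, EltSize, SE); // nullptr if the add may wrap.
  }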

/// Return true if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
/// \p MaxBTC is guaranteed inbounds of the accessed object.
static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
                                                 const SCEV *MaxBTC,
                                                 const SCEV *EltSize,
                                                 ScalarEvolution &SE,
                                                 const DataLayout &DL) {
  auto *PointerBase = SE.getPointerBase(AR->getStart());

Review comment: This assumes AR is a pointer AddRec …
Reply: Yep, it must be a pointer AddRec, and that's guaranteed from the current caller. Updated to …

  auto *StartPtr = dyn_cast<SCEVUnknown>(PointerBase);
  if (!StartPtr)
    return false;
  bool CheckForNonNull, CheckForFreed;
  uint64_t DerefBytes = StartPtr->getValue()->getPointerDereferenceableBytes(
      DL, CheckForNonNull, CheckForFreed);

  if (CheckForNonNull || CheckForFreed)
    return false;

  const SCEV *Step = AR->getStepRecurrence(SE);
  bool IsKnownNonNegative = SE.isKnownNonNegative(Step);
  if (!IsKnownNonNegative && !SE.isKnownNegative(Step))
    return false;

  Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType());
  Step = SE.getNoopOrSignExtend(Step, WiderTy);
  MaxBTC = SE.getNoopOrZeroExtend(MaxBTC, WiderTy);

  // For the computations below, make sure they don't unsigned wrap.
  if (!SE.isKnownPredicate(CmpInst::ICMP_UGE, AR->getStart(), StartPtr))

Review comment: Is there a hard theoretical limitation that prevents AR being something like (%p - 16)? Or does it just make the later calculations simpler? If it's the latter (sounds like it), then can you add a TODO that we can improve this in future?
Reply: This currently makes sure that …

    return false;
  const SCEV *StartOffset = SE.getNoopOrZeroExtend(
      SE.getMinusSCEV(AR->getStart(), StartPtr), WiderTy);

  const SCEV *OffsetAtLastIter =
      mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, false), SE);
||
if (!OffsetAtLastIter) | ||
return false; | ||
|
||
const SCEV *OffsetEndBytes = addSCEVOverflow( | ||
OffsetAtLastIter, SE.getNoopOrZeroExtend(EltSize, WiderTy), SE); | ||

Review comment: I guess this isn't really the last accessed byte, which would really be … I realise it makes the subsequent calculations simpler, but it does artificially increase the chance of overflow, because in reality the pointer for the last accessed byte is …
Reply: Updated the names, thanks. We are comparing the result …

  if (!OffsetEndBytes)
    return false;

  if (IsKnownNonNegative) {
    // For positive steps, check if
    //   (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes,
    // while making sure none of the computations unsigned wrap themselves.
    const SCEV *EndBytes = addSCEVOverflow(StartOffset, OffsetEndBytes, SE);
    if (!EndBytes)
      return false;
    return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes,
                               SE.getConstant(WiderTy, DerefBytes));
  }

  // For negative steps check if
  //   * StartOffset >= (MaxBTC * Step + EltSize)
  //   * StartOffset <= DerefBytes.
  assert(SE.isKnownNegative(Step) && "must be known negative");
  return SE.isKnownPredicate(CmpInst::ICMP_SGE, StartOffset, OffsetEndBytes) &&
         SE.isKnownPredicate(CmpInst::ICMP_ULE, StartOffset,
                             SE.getConstant(WiderTy, DerefBytes));
}
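
To make the positive-step check concrete, here is a worked instance using the
numbers from the dereferenceable(4000) tests below (i32 accesses, so Step =
EltSize = 4 and StartOffset = 0; the split into 1000 vs. 1001 iterations
follows the discussion further down):

  MaxBTC = 999  (at most 1000 iterations):
    EndBytes = StartOffset + MaxBTC * Step + EltSize
             = 0 + 999 * 4 + 4 = 4000 <= DerefBytes (4000)
    => evaluating AR at MaxBTC stays inbounds; the precise bounds are kept.

  MaxBTC = 1000 (at most 1001 iterations):
    EndBytes = 0 + 1000 * 4 + 4 = 4004 > DerefBytes (4000)
    => the check fails, and the caller pessimizes ScEnd to unsigned max.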

std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
-    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
-    ScalarEvolution *SE,
    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC,
    const SCEV *MaxBTC, ScalarEvolution *SE,
    DenseMap<std::pair<const SCEV *, Type *>,
             std::pair<const SCEV *, const SCEV *>> *PointerBounds) {
  std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;

@@ -206,11 +287,37 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
  const SCEV *ScStart;
  const SCEV *ScEnd;

  auto &DL = Lp->getHeader()->getDataLayout();
  Type *IdxTy = DL.getIndexType(PtrExpr->getType());
  const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
  if (SE->isLoopInvariant(PtrExpr, Lp)) {
    ScStart = ScEnd = PtrExpr;
  } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr)) {
    ScStart = AR->getStart();
-    ScEnd = AR->evaluateAtIteration(MaxBECount, *SE);
    if (!isa<SCEVCouldNotCompute>(BTC))

Review comment: Not sure why we are passing the exact BTC, and handling the case where it is a could-not-compute. Why not just pass the symbolic max as before, and have the logic below?
Reply: If we can compute the backedge-taken count, we are guaranteed to execute exactly that number of iterations. The symbolic max backedge-taken count is an upper bound, and the loop may exit at any earlier iteration (e.g. because it has an uncountable exit). As per the comment, a computable BTC means we can rely on the fact that the pointers cannot wrap in any iteration. If we instead only have the symbolic max BTC, we may execute fewer iterations than the max, and then only those iterations are guaranteed not to wrap in general, so evaluating at the symbolic max may wrap. One case to consider is when the symbolic max BTC is a SCEVUnknown: we will form a SCEVMulExpr for which we cannot determine whether it wraps (vs. the case where the symbolic BTC is a constant).
Review comment: Thanks for the explanation. My confusion is the following: if we have a computable BTC, isn't Exact = SymbolicMax? If we don't have a computable BTC, Exact = SCEVCouldNotCompute, and SymbolicMax could be a SCEVConstant, a general SCEV expression, a SCEVUnknown, or SCEVCouldNotCompute in the worst case. If my reasoning is correct, there is no additional information in Exact over SymbolicMax, and we shouldn't have to pass Exact. In the test cases you have added, isn't SymbolicMax a SCEVConstant = INT_MAX? What does evaluating an AddRec at the INT_MAX iteration wrap to? Not -(EltSize + 1), or evaluating the AddRec at INT_MIN? Perhaps worth adding some SCEV tests for this evaluation, as a separate patch that we can verify? When SymbolicMax is a SCEVUnknown, it means that the iteration is bounded by some function-argument IR value, right? In this case, Exact will also be the same SCEVUnknown, and if we pass INT_MAX when calling the function, the evaluation will wrap, and this is UB anyway? What happens when SymbolicMax is a SCEVCouldNotCompute? I think this will result in a crash with the current code.
Review comment: Okay, just thinking out loud here: for simplicity, let AR = …
Reply: Thanks for digging into this! One clarification is that we evaluate at BTC = UNSIGNED_MAX. So … When we have strides larger than 1, the last accessed address will be something like …

      // Evaluating AR at an exact BTC is safe: LAA separately checks that
      // accesses cannot wrap in the loop. If evaluating AR at BTC wraps, then
      // the loop either triggers UB when executing a memory access with a
      // poison pointer or the wrapping/poisoned pointer is not used.
      ScEnd = AR->evaluateAtIteration(BTC, *SE);
    else {
      // Evaluating AR at MaxBTC may wrap and create an expression that is less
      // than the start of the AddRec due to wrapping (for example consider
      // MaxBTC = -2). If that's the case, set ScEnd to -(EltSize + 1). ScEnd
      // will get incremented by EltSize before returning, so this effectively
      // sets ScEnd to the maximum unsigned value for the type. Note that LAA
      // separately checks that accesses cannot wrap, so unsigned max
      // represents an upper bound.
      if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE,
                                               DL)) {
        ScEnd = AR->evaluateAtIteration(MaxBTC, *SE);
      } else {
        ScEnd = SE->getAddExpr(
            SE->getNegativeSCEV(EltSizeSCEV),
            SE->getSCEV(ConstantExpr::getIntToPtr(
                ConstantInt::get(EltSizeSCEV->getType(), -1), AR->getType())));
      }
    }

Review comment: I guess I still don't understand why an exact BTC = -2 in the if case above isn't a problem too, although I realise you've tried to explain this to me once already and I do believe that it's an issue. :) It still sounds like there is a deficiency elsewhere in LAA that we're trying to work around here, but I can live with that for now!
Reply: If there is a computable BTC, we must take the backedge exactly BTC times; if the pointer expression would wrap, then the original loop is guaranteed to have UB. If we only have a symbolic max BTC, we take the backedge up to BTC times, but the original loop could exit before we wrap and trigger UB. If we would generate start and end pointers based on the symbolic max before the loop, those may wrap and cause incorrect runtime-check results. With the computable BTC, the start/end pointers can only wrap if the loop has UB.
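
A worked instance of the wrapping comment above (the AddRec and bit width are
illustrative, not from the patch):

  // Let AR = {%A,+,4} with a 64-bit index type, and let MaxBTC = -2, i.e.
  // 2^64 - 2 as an unsigned iteration count. Then:
  //
  //   AR->evaluateAtIteration(MaxBTC) = %A + 4 * (2^64 - 2)   (mod 2^64)
  //                                   = %A - 8
  //
  // The "end" lands below the start, so a range check built from it would be
  // meaningless. The fallback -(EltSize + 1), i.e. inttoptr(-1) - EltSize,
  // becomes unsigned max once the later "ScEnd += EltSize" runs, matching the
  // "High: inttoptr (i64 -1 to ptr)" values in the tests below.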
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // For expressions with negative step, the upper bound is ScStart and the

@@ -232,9 +339,6 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");

  // Add the size of the pointed element to ScEnd.
-  auto &DL = Lp->getHeader()->getDataLayout();
-  Type *IdxTy = DL.getIndexType(PtrExpr->getType());
-  const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
  ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);

  std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};

@@ -250,9 +354,11 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                    unsigned DepSetId, unsigned ASId,
                                    PredicatedScalarEvolution &PSE,
                                    bool NeedsFreeze) {
-  const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
-  const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
-      Lp, PtrExpr, AccessTy, MaxBECount, PSE.getSE(), &DC.getPointerBounds());
  const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
  const SCEV *BTC = PSE.getBackedgeTakenCount();
  const auto &[ScStart, ScEnd] =
      getStartAndEndForAccess(Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC,
                              PSE.getSE(), &DC.getPointerBounds());

Review comment (on the lines above): Are we changing this because the exact BTC gives better results in some cases?
Reply: It's changed to differentiate the cases where we can and cannot compute the BTC exactly (there may not be a computable BTC for loops with early exits). A sketch of that distinction follows the hunk below.

  assert(!isa<SCEVCouldNotCompute>(ScStart) &&
         !isa<SCEVCouldNotCompute>(ScEnd) &&
         "must be able to compute both start and end expressions");
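
As referenced in the reply above, a minimal sketch of the countable vs.
uncountable distinction (the function and its values are hypothetical): the
latch bounds the loop, but the data-dependent exit prevents ScalarEvolution
from computing an exact backedge-taken count.

  // Latch exit: countable, so the symbolic max BTC is the constant 499.
  // Early exit: depends on loaded data, so the exact BTC for the loop as a
  // whole is SCEVCouldNotCompute.
  int findZero(const int *A) {
    for (int I = 0; I < 500; ++I)
      if (A[I] == 0)
        return I;
    return -1;
  }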

@@ -1907,11 +2013,14 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
  // required for correctness.
  if (SE.isLoopInvariant(Src, InnermostLoop) ||
      SE.isLoopInvariant(Sink, InnermostLoop)) {
-    const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
-    const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess(
-        InnermostLoop, Src, ATy, MaxBECount, PSE.getSE(), &PointerBounds);
-    const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess(
-        InnermostLoop, Sink, BTy, MaxBECount, PSE.getSE(), &PointerBounds);
    const SCEV *BTC = PSE.getBackedgeTakenCount();
    const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
    const auto &[SrcStart_, SrcEnd_] =
        getStartAndEndForAccess(InnermostLoop, Src, ATy, BTC, SymbolicMaxBTC,
                                PSE.getSE(), &PointerBounds);
    const auto &[SinkStart_, SinkEnd_] =
        getStartAndEndForAccess(InnermostLoop, Sink, BTy, BTC, SymbolicMaxBTC,
                                PSE.getSE(), &PointerBounds);
    if (!isa<SCEVCouldNotCompute>(SrcStart_) &&
        !isa<SCEVCouldNotCompute>(SrcEnd_) &&
        !isa<SCEVCouldNotCompute>(SinkStart_) &&
@@ -72,10 +72,10 @@ define void @all_exits_dominate_latch_countable_exits_at_most_501_iterations_kno
; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group GRP0:
-; CHECK-NEXT: (Low: %B High: (2004 + %B))
; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr))
; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header>
; CHECK-NEXT: Group GRP1:
-; CHECK-NEXT: (Low: %A High: (2004 + %A))
; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr))
; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.

@@ -131,10 +131,10 @@ define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_not
; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group GRP0:
-; CHECK-NEXT: (Low: %B High: (2000 + %B))
; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr))
; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header>
; CHECK-NEXT: Group GRP1:
-; CHECK-NEXT: (Low: %A High: (2000 + %A))
; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr))
; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.

Review comment: I'm probably missing something here, but this seems too conservative, right? The loop absolutely cannot execute more than 500 times even if we take an early exit. Even before the High of …
Reply: Yep, we have a bound on the number of iterations and we are guaranteed not to exceed it. One problematic input could be where only the first 10 elements of A and B are dereferenceable, and the early exit leaves the loop at iteration 9. If we expand the bounds with …
Review comment: Ignoring the fact that the GEPs in the loop explicitly say there is no wrapping (due to inbounds), it sounds like this patch is saying that for early exit loops … I think it's worth adding a TODO that we can refine this in future. I think anyone looking at this test might be confused about the discrepancy between normal and early exit loops that have the same mathematical maximum upper bound for memory accesses. Also, after this patch, if we have an early exit loop that requires runtime memory checks, then there is a very high chance of failing those checks due to the increase in upper bound to UINT64_MAX, so it's questionable whether there is even any point vectorising loops with runtime memory checks?
Reply: Technically the GEPs say that the result is poison if it is not inbounds. Dereferencing the poison GEP would trigger UB. Say 1000 + %A wraps. When there's no early exit, 2000 + %A would still wrap, but it is guaranteed to trigger UB in the original loop, because the loop must access the memory range %A .. 2000 + %A. If there is an early exit, the original loop may always exit before accessing 1000 + %A, so it would not trigger UB, but we would expand to 2000 + %A, which would have wrapped, and the runtime check would be incorrect. Hope that makes sense. (A sketch of this scenario follows below.)
Review comment: I think I understand. It sounds like you're saying that without the early exit we don't really care if the runtime checks are nonsense or not, because the entire loop is UB anyway? Whereas for early exit loops the loop may or may not be UB, and so we do care about getting the right runtime checks.
Reply: Yep
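
The sketch referenced in the reply above (values illustrative; assume the
address computation 1000 + A would wrap the address space):

  // Without the early exit, the loop must reach A[250] (byte offset 1000), so
  // if that address wraps, the original program has UB anyway and a bogus
  // expanded bound of 2000 + A is harmless. With the early exit, the program
  // may be well-defined (it can leave at I == 9, never forming a wrapping
  // address), yet a runtime check built from 2000 + A would use a wrapped
  // pointer and could wrongly report that two accesses do not overlap.
  int sumUntilSentinel(const int *A, const int *B) {
    int S = 0;
    for (int I = 0; I < 500; ++I) { // symbolic max BTC: 499
      if (A[I] == 0)                // early exit, possibly as soon as I == 9
        break;
      S += A[I] + B[I];
    }
    return S;
  }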
@@ -247,10 +247,10 @@ define i32 @all_exits_dominate_latch_countable_exits_at_most_1001_iterations_kno
; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group GRP0:
-; CHECK-NEXT: (Low: %B High: (4004 + %B))
; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr))
; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header>
; CHECK-NEXT: Group GRP1:
-; CHECK-NEXT: (Low: %A High: (4004 + %A))
; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr))
; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.

Review comment: This needs a TODO, because we shouldn't be doing this for dereferenceable pointers, right? Surely if it's guaranteed to be dereferenceable, that implies it should not wrap? For example, I'm struggling to see how a C++ object passed by reference to a function could be allocated across a wrapped address space and be legal. I would expect any attempt to actually use the object to trigger undefined behaviour in the C++ specification. I realise there's more to life than C++ - this is just one example, of course. Also, if I've understood correctly, it will significantly impact @huntergr-arm's work to enable vectorisation of early exit loops with loads and stores from dereferenceable memory. Again, I'm happy to accept the patch as is, just saying that we should improve this in future if we ever want to make progress with early exit vectorisation.
Reply: That test was accessing one past the dereferenceable range (see the original bound of 4004 + %B), so I think using … There's now a variant that actually executes at most 1000 iterations (for which we don't pessimize the bounds), and this test has been renamed to 1001 iterations.

@@ -305,10 +305,10 @@ define i32 @all_exits_dominate_latch_countable_exits_at_most_1000_iterations_not
; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group GRP0:
-; CHECK-NEXT: (Low: %B High: (4000 + %B))
; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr))
; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header>
; CHECK-NEXT: Group GRP1:
-; CHECK-NEXT: (Low: %A High: (4000 + %A))
; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr))
; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.

@@ -350,6 +350,7 @@ e.2:
  ret i32 2
}


define i32 @not_all_exits_dominate_latch(ptr %A, ptr %B) {
; CHECK-LABEL: 'not_all_exits_dominate_latch'
; CHECK-NEXT: loop.header:

@@ -407,10 +408,10 @@ define i32 @b3_does_not_dominate_latch_known_deref(ptr dereferenceable(4000) %A,
; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group GRP0:
-; CHECK-NEXT: (Low: %B High: (4004 + %B))
; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr))
; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header>
; CHECK-NEXT: Group GRP1:
-; CHECK-NEXT: (Low: %A High: (4004 + %A))
; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr))
; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.

@@ -462,10 +463,10 @@ define i32 @b3_does_not_dominate_latch_not_known_deref(ptr %A, ptr %B) {
; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
; CHECK-NEXT: Grouped accesses:
; CHECK-NEXT: Group GRP0:
-; CHECK-NEXT: (Low: %B High: (4004 + %B))
; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr))
; CHECK-NEXT: Member: {%B,+,4}<nuw><%loop.header>
; CHECK-NEXT: Group GRP1:
-; CHECK-NEXT: (Low: %A High: (4004 + %A))
; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr))
; CHECK-NEXT: Member: {%A,+,4}<nuw><%loop.header>
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.

Review comment (on addSCEVOverflow / mulSCEVOverflow): I think addSCEVNoOverflow / mulSCEVNoOverflow would be a better name for these.