Skip to content

Commit f5dd70c

Browse files
authored
[LSR] Require non-zero step when considering wrap around for term folding (#77809)
The term folding logic needs to prove that the induction variable does not cycle through the same set of values so that testing for the value of the IV on the exiting iteration is guaranteed to trigger only on that iteration. The prior code checked the no-self-wrap property on the IV, but this is insufficient as a zero step is trivially no-self-wrap per SCEV's definition but does repeat the same series of values. In the current form, this has the effect of basically disabling LSR's term-folding for all non-constant strides. This is still a net improvement: we've disabled term-folding entirely, so being able to enable it for constant strides is a step forward. As future work, there are two SCEV weaknesses worth investigating. First, sext (or i32 %a, 1) to i64 does not return true for isKnownNonZero. This is because we check only the unsigned range in that query. We could either do query pushdown, or check the signed range as well. I tried the second locally and it has very broad impact - i.e. we have a bunch of missing optimizations here. Second, zext (or i32 %a, 1) to i64 as the increment to the IV in expensive_expand_short_tc causes the addrec to no longer be provably no-self-wrap. I didn't investigate this so it might be necessary, but the loop structure is such that I find this result surprising.
1 parent ee45710 commit f5dd70c

File tree

2 files changed

+32
-34
lines changed

2 files changed

+32
-34
lines changed

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6816,7 +6816,8 @@ canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
68166816
// iteration. The simplest case to consider is a candidate IV which is
68176817
// narrower than the trip count (and thus original IV), but this can
68186818
// also happen due to non-unit strides on the candidate IVs.
6819-
if (!AddRec->hasNoSelfWrap())
6819+
if (!AddRec->hasNoSelfWrap() ||
6820+
!SE.isKnownNonZero(AddRec->getStepRecurrence(SE)))
68206821
continue;
68216822

68226823
const SCEVAddRecExpr *PostInc = AddRec->getPostIncExpr(SE);

llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll

Lines changed: 30 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -474,28 +474,27 @@ for.end: ; preds = %for.body
474474
ret void
475475
}
476476

477+
; TODO: This case should be legal, but we run into a problem with SCEV's
478+
; ability to prove non-zero for sext expressions.
477479
define void @expensive_expand_short_tc(ptr %a, i32 %offset, i32 %n) {
478480
; CHECK-LABEL: @expensive_expand_short_tc(
479481
; CHECK-NEXT: entry:
482+
; CHECK-NEXT: [[OFFSET_NONZERO:%.*]] = or i32 [[OFFSET:%.*]], 1
480483
; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 84
481-
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1
482-
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
483-
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
484-
; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[OFFSET:%.*]] to i64
485-
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
486-
; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], 84
487-
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
488484
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
489485
; CHECK: for.body:
490486
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[UGLYGEP]], [[ENTRY:%.*]] ]
487+
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
491488
; CHECK-NEXT: store i32 1, ptr [[LSR_IV1]], align 4
492-
; CHECK-NEXT: [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET]]
493-
; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[SCEVGEP]]
494-
; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0:![0-9]+]]
489+
; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], 1
490+
; CHECK-NEXT: [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET_NONZERO]]
491+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], [[N:%.*]]
492+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0:![0-9]+]]
495493
; CHECK: for.end:
496494
; CHECK-NEXT: ret void
497495
;
498496
entry:
497+
%offset.nonzero = or i32 %offset, 1
499498
%uglygep = getelementptr i8, ptr %a, i64 84
500499
br label %for.body
501500

@@ -504,36 +503,35 @@ for.body: ; preds = %for.body, %entry
504503
%lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 0, %entry ]
505504
store i32 1, ptr %lsr.iv1, align 4
506505
%lsr.iv.next = add nsw i32 %lsr.iv, 1
507-
%uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset
506+
%uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset.nonzero
508507
%exitcond.not = icmp eq i32 %lsr.iv.next, %n
509508
br i1 %exitcond.not, label %for.end, label %for.body, !prof !{!"branch_weights", i32 1, i32 3}
510509

511510
for.end: ; preds = %for.body
512511
ret void
513512
}
514513

514+
; TODO: This case should be legal, but we run into a problem with SCEV's
515+
; ability to prove non-zero for sext expressions.
515516
define void @expensive_expand_long_tc(ptr %a, i32 %offset, i32 %n) {
516517
; CHECK-LABEL: @expensive_expand_long_tc(
517518
; CHECK-NEXT: entry:
519+
; CHECK-NEXT: [[OFFSET_NONZERO:%.*]] = or i32 [[OFFSET:%.*]], 1
518520
; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 84
519-
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1
520-
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
521-
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
522-
; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[OFFSET:%.*]] to i64
523-
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
524-
; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], 84
525-
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
526521
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
527522
; CHECK: for.body:
528523
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[UGLYGEP]], [[ENTRY:%.*]] ]
524+
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
529525
; CHECK-NEXT: store i32 1, ptr [[LSR_IV1]], align 4
530-
; CHECK-NEXT: [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET]]
531-
; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[SCEVGEP]]
532-
; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF1:![0-9]+]]
526+
; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], 1
527+
; CHECK-NEXT: [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET_NONZERO]]
528+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], [[N:%.*]]
529+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF1:![0-9]+]]
533530
; CHECK: for.end:
534531
; CHECK-NEXT: ret void
535532
;
536533
entry:
534+
%offset.nonzero = or i32 %offset, 1
537535
%uglygep = getelementptr i8, ptr %a, i64 84
538536
br label %for.body
539537

@@ -542,36 +540,35 @@ for.body: ; preds = %for.body, %entry
542540
%lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 0, %entry ]
543541
store i32 1, ptr %lsr.iv1, align 4
544542
%lsr.iv.next = add nsw i32 %lsr.iv, 1
545-
%uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset
543+
%uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset.nonzero
546544
%exitcond.not = icmp eq i32 %lsr.iv.next, %n
547545
br i1 %exitcond.not, label %for.end, label %for.body, !prof !{!"branch_weights", i32 1, i32 300}
548546

549547
for.end: ; preds = %for.body
550548
ret void
551549
}
552550

551+
; TODO: This case should be legal, but we run into a problem with SCEV's
552+
; ability to prove non-zero for sext expressions.
553553
define void @expensive_expand_unknown_tc(ptr %a, i32 %offset, i32 %n) {
554554
; CHECK-LABEL: @expensive_expand_unknown_tc(
555555
; CHECK-NEXT: entry:
556+
; CHECK-NEXT: [[OFFSET_NONZERO:%.*]] = or i32 [[OFFSET:%.*]], 1
556557
; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 84
557-
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1
558-
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
559-
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
560-
; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[OFFSET:%.*]] to i64
561-
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
562-
; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], 84
563-
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]]
564558
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
565559
; CHECK: for.body:
566560
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[UGLYGEP]], [[ENTRY:%.*]] ]
561+
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
567562
; CHECK-NEXT: store i32 1, ptr [[LSR_IV1]], align 4
568-
; CHECK-NEXT: [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET]]
569-
; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[SCEVGEP]]
570-
; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
563+
; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], 1
564+
; CHECK-NEXT: [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i32 [[OFFSET_NONZERO]]
565+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], [[N:%.*]]
566+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
571567
; CHECK: for.end:
572568
; CHECK-NEXT: ret void
573569
;
574570
entry:
571+
%offset.nonzero = or i32 %offset, 1
575572
%uglygep = getelementptr i8, ptr %a, i64 84
576573
br label %for.body
577574

@@ -580,7 +577,7 @@ for.body: ; preds = %for.body, %entry
580577
%lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 0, %entry ]
581578
store i32 1, ptr %lsr.iv1, align 4
582579
%lsr.iv.next = add nsw i32 %lsr.iv, 1
583-
%uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset
580+
%uglygep2 = getelementptr i8, ptr %lsr.iv1, i32 %offset.nonzero
584581
%exitcond.not = icmp eq i32 %lsr.iv.next, %n
585582
br i1 %exitcond.not, label %for.end, label %for.body
586583

0 commit comments

Comments
 (0)