Skip to content

Commit 1e072ae

Browse files
authored
[CGP] [CodeGenPrepare] Folding urem with loop invariant value plus offset (#104724)
This extends the existing fold: ``` for(i = Start; i < End; ++i) Rem = (i nuw+- IncrLoopInvariant) u% RemAmtLoopInvariant; ``` -> ``` Rem = (Start nuw+- IncrLoopInvariant) % RemAmtLoopInvariant; for(i = Start; i < End; ++i, ++rem) Rem = rem == RemAmtLoopInvariant ? 0 : Rem; ``` so that it also works with a non-zero `IncrLoopInvariant`. This is a common pattern in cases such as: ``` for(i = 0; i < N; ++i) if (((i + 1) % X) == 0) do_something_occasionally_but_not_first_iter(); ``` Alive2 w/ i4/unrolled 6x (needs to be run locally due to timeout): https://alive2.llvm.org/ce/z/6tgyN3 Exhaustive proof over all uint8_t combinations in C++: https://godbolt.org/z/WYa561388
1 parent 0ab44fd commit 1e072ae

File tree

2 files changed

+58
-16
lines changed

2 files changed

+58
-16
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 49 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1981,17 +1981,36 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
19811981
return true;
19821982
}
19831983

1984-
static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
1985-
const LoopInfo *LI,
1986-
Value *&RemAmtOut,
1987-
PHINode *&LoopIncrPNOut) {
1984+
static bool isRemOfLoopIncrementWithLoopInvariant(
1985+
Instruction *Rem, const LoopInfo *LI, Value *&RemAmtOut, Value *&AddInstOut,
1986+
Value *&AddOffsetOut, PHINode *&LoopIncrPNOut) {
19881987
Value *Incr, *RemAmt;
19891988
// NB: If RemAmt is a power of 2 it *should* have been transformed by now.
19901989
if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt))))
19911990
return false;
19921991

1992+
Value *AddInst, *AddOffset;
19931993
// Find out loop increment PHI.
19941994
auto *PN = dyn_cast<PHINode>(Incr);
1995+
if (PN != nullptr) {
1996+
AddInst = nullptr;
1997+
AddOffset = nullptr;
1998+
} else {
1999+
// Search through a NUW add on top of the loop increment.
2000+
Value *V0, *V1;
2001+
if (!match(Incr, m_NUWAdd(m_Value(V0), m_Value(V1))))
2002+
return false;
2003+
2004+
AddInst = Incr;
2005+
PN = dyn_cast<PHINode>(V0);
2006+
if (PN != nullptr) {
2007+
AddOffset = V1;
2008+
} else {
2009+
PN = dyn_cast<PHINode>(V1);
2010+
AddOffset = V0;
2011+
}
2012+
}
2013+
19952014
if (!PN)
19962015
return false;
19972016

@@ -2031,6 +2050,8 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
20312050
// Set output variables.
20322051
RemAmtOut = RemAmt;
20332052
LoopIncrPNOut = PN;
2053+
AddInstOut = AddInst;
2054+
AddOffsetOut = AddOffset;
20342055

20352056
return true;
20362057
}
@@ -2045,15 +2066,14 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
20452066
// Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
20462067
// for(i = Start; i < End; ++i, ++rem)
20472068
// Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
2048-
//
2049-
// Currently only implemented for `IncrLoopInvariant` being zero.
20502069
static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
20512070
const LoopInfo *LI,
20522071
SmallSet<BasicBlock *, 32> &FreshBBs,
20532072
bool IsHuge) {
2054-
Value *RemAmt;
2073+
Value *AddOffset, *RemAmt, *AddInst;
20552074
PHINode *LoopIncrPN;
2056-
if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, LoopIncrPN))
2075+
if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, AddInst,
2076+
AddOffset, LoopIncrPN))
20572077
return false;
20582078

20592079
// Only non-constant remainder as the extra IV is probably not profitable
@@ -2071,6 +2091,23 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
20712091

20722092
Loop *L = LI->getLoopFor(LoopIncrPN->getParent());
20732093
Value *Start = LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader());
2094+
// If we have an add, create the initial value for the remainder.
2095+
// The logic here is:
2096+
// (urem (add nuw Start, IncrLoopInvariant), RemAmtLoopInvariant)
2097+
//
2098+
// Only proceed if the expression simplifies (otherwise we can't fully
2099+
// optimize out the urem).
2100+
if (AddInst) {
2101+
assert(AddOffset && "We found an add but missing values");
2102+
// Without dom-condition/assumption cache we aren't likely to get much out
2103+
// of a context instruction.
2104+
Start = simplifyAddInst(Start, AddOffset,
2105+
match(AddInst, m_NSWAdd(m_Value(), m_Value())),
2106+
/*IsNUW=*/true, *DL);
2107+
if (!Start)
2108+
return false;
2109+
}
2110+
20742111
// If we can't fully optimize out the `rem`, skip this transform.
20752112
Start = simplifyURemInst(Start, RemAmt, *DL);
20762113
if (!Start)
@@ -2098,9 +2135,12 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
20982135
FreshBBs.insert(LoopIncrPN->getParent());
20992136
FreshBBs.insert(L->getLoopLatch());
21002137
FreshBBs.insert(Rem->getParent());
2101-
2138+
if (AddInst)
2139+
FreshBBs.insert(cast<Instruction>(AddInst)->getParent());
21022140
replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge);
21032141
Rem->eraseFromParent();
2142+
if (AddInst && AddInst->use_empty())
2143+
cast<Instruction>(AddInst)->eraseFromParent();
21042144
return true;
21052145
}
21062146

llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -319,20 +319,20 @@ for.body.tail:
319319
define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind {
320320
; CHECK-LABEL: define void @simple_urem_to_sel_vec(
321321
; CHECK-SAME: <2 x i64> [[REM_AMT:%.*]]) #[[ATTR0]] {
322-
; CHECK-NEXT: [[FOR_COND_CLEANUP:.*]]:
322+
; CHECK-NEXT: [[ENTRY:.*]]:
323323
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
324-
; CHECK: [[ENTRY:.*]]:
324+
; CHECK: [[FOR_COND_CLEANUP:.*]]:
325325
; CHECK-NEXT: ret void
326326
; CHECK: [[FOR_BODY]]:
327-
; CHECK-NEXT: [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[FOR_COND_CLEANUP]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
328-
; CHECK-NEXT: [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[FOR_COND_CLEANUP]] ]
327+
; CHECK-NEXT: [[REM:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
328+
; CHECK-NEXT: [[I_04:%.*]] = phi <2 x i64> [ [[INC:%.*]], %[[FOR_BODY]] ], [ zeroinitializer, %[[ENTRY]] ]
329329
; CHECK-NEXT: tail call void @use.2xi64(<2 x i64> [[REM]])
330330
; CHECK-NEXT: [[TMP1:%.*]] = add nuw <2 x i64> [[REM]], <i64 1, i64 1>
331331
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], [[REM_AMT]]
332332
; CHECK-NEXT: [[TMP3]] = select <2 x i1> [[TMP2]], <2 x i64> zeroinitializer, <2 x i64> [[TMP1]]
333333
; CHECK-NEXT: [[INC]] = add nuw <2 x i64> [[I_04]], <i64 1, i64 1>
334334
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = call i1 @get.i1()
335-
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[ENTRY]], label %[[FOR_BODY]]
335+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
336336
;
337337
entry:
338338
br label %for.body
@@ -892,10 +892,12 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_
892892
; CHECK: [[FOR_COND_CLEANUP]]:
893893
; CHECK-NEXT: ret void
894894
; CHECK: [[FOR_BODY]]:
895+
; CHECK-NEXT: [[REM:%.*]] = phi i32 [ 7, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
895896
; CHECK-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
896-
; CHECK-NEXT: [[I_WITH_OFF:%.*]] = add nuw i32 [[I_04]], 5
897-
; CHECK-NEXT: [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
898897
; CHECK-NEXT: tail call void @use.i32(i32 [[REM]])
898+
; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[REM]], 1
899+
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
900+
; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
899901
; CHECK-NEXT: [[INC]] = add nuw i32 [[I_04]], 1
900902
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
901903
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]

0 commit comments

Comments
 (0)