Skip to content

Commit 191df4d

Browse files
committed
[CGP][CodeGenPrepare] Folding urem with loop invariant value plus offset
This extends the existing fold:
```
for(i = Start; i < End; ++i)
   Rem = (i nuw+- IncrLoopInvariant) u% RemAmtLoopInvariant;
```
->
```
Rem = (Start nuw+- IncrLoopInvariant) % RemAmtLoopInvariant;
for(i = Start; i < End; ++i, ++rem)
   Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
```
to work with a non-zero `IncrLoopInvariant`.

This is a common usage in cases such as:
```
for(i = 0; i < N; ++i)
    if (((i + 1) % X) == 0)
        do_something_occasionally_but_not_first_iter();
```

Alive2 w/ i4/unrolled 6x (needs to be run locally due to timeout):
https://alive2.llvm.org/ce/z/6tgyN3

Exhaustive proof over all uint8_t combinations in C++:
https://godbolt.org/z/WYa561388
1 parent ae15931 commit 191df4d

File tree

2 files changed

+87
-14
lines changed

2 files changed

+87
-14
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 75 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1976,17 +1976,43 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
19761976
return true;
19771977
}
19781978

1979-
static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
1980-
const LoopInfo *LI,
1981-
Value *&RemAmtOut,
1982-
PHINode *&LoopIncrPNOut) {
1979+
static bool isRemOfLoopIncrementWithLoopInvariant(
1980+
Instruction *Rem, const LoopInfo *LI, Value *&RemAmtOut, bool &AddOrSubOut,
1981+
Value *&AddOrSubInstOut, Value *&AddOrSubOffsetOut,
1982+
PHINode *&LoopIncrPNOut) {
19831983
Value *Incr, *RemAmt;
19841984
// NB: If RemAmt is a power of 2 it *should* have been transformed by now.
19851985
if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt))))
19861986
return false;
19871987

1988+
bool AddOrSub = false;
1989+
Value *AddOrSubOffset;
19881990
// Find out loop increment PHI.
19891991
auto *PN = dyn_cast<PHINode>(Incr);
1992+
if (PN != nullptr) {
1993+
AddOrSub = false;
1994+
AddOrSubOffset = nullptr;
1995+
} else {
1996+
// Search through a NUW add/sub on top of the loop increment.
1997+
Value *V0, *V1;
1998+
bool Add = match(Incr, m_NUWAddLike(m_Value(V0), m_Value(V1)));
1999+
bool Sub = match(Incr, m_NUWSub(m_Value(V0), m_Value(V1)));
2000+
if (!Add && !Sub)
2001+
return false;
2002+
2003+
AddOrSub = true;
2004+
2005+
AddOrSubInstOut = Incr;
2006+
2007+
PN = dyn_cast<PHINode>(V0);
2008+
if (PN != nullptr) {
2009+
AddOrSubOffset = V1;
2010+
} else if (Add) {
2011+
PN = dyn_cast<PHINode>(V1);
2012+
AddOrSubOffset = V0;
2013+
}
2014+
}
2015+
19902016
if (!PN)
19912017
return false;
19922018

@@ -2026,6 +2052,8 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
20262052
// Set output variables.
20272053
RemAmtOut = RemAmt;
20282054
LoopIncrPNOut = PN;
2055+
AddOrSubOut = AddOrSub;
2056+
AddOrSubOffsetOut = AddOrSubOffset;
20292057

20302058
return true;
20312059
}
@@ -2040,15 +2068,15 @@ static bool isRemOfLoopIncrementWithLoopInvariant(Instruction *Rem,
20402068
// Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
20412069
// for(i = Start; i < End; ++i, ++rem)
20422070
// Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
2043-
//
2044-
// Currently only implemented for `IncrLoopInvariant` being zero.
20452071
static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
20462072
const LoopInfo *LI,
20472073
SmallSet<BasicBlock *, 32> &FreshBBs,
20482074
bool IsHuge) {
2049-
Value *RemAmt;
2075+
bool AddOrSub;
2076+
Value *AddOrSubOffset, *RemAmt, *AddOrSubInst;
20502077
PHINode *LoopIncrPN;
2051-
if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmt, LoopIncrPN))
2078+
if (!isRemOfLoopIncrementWithLoopInvariant(
2079+
Rem, LI, RemAmt, AddOrSub, AddOrSubInst, AddOrSubOffset, LoopIncrPN))
20522080
return false;
20532081

20542082
// Only non-constant remainder as the extra IV is probably not profitable
@@ -2066,6 +2094,43 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
20662094

20672095
Loop *L = LI->getLoopFor(LoopIncrPN->getParent());
20682096
Value *Start = LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader());
2097+
// If we have add/sub create initial value for remainder.
2098+
// The logic here is:
2099+
// (urem (add/sub nuw Start, IncrLoopInvariant), RemAmtLoopInvariant)
2100+
//
2101+
// Only proceed if the expression simplifies (otherwise we can't fully
2102+
// optimize out the urem).
2103+
if (AddOrSub) {
2104+
assert(AddOrSubOffset && AddOrSubInst &&
2105+
"We found an add/sub but missing values");
2106+
// Without dom-condition/assumption cache we aren't likely to get much out
2107+
// of a context instruction.
2108+
const SimplifyQuery Q(*DL);
2109+
Instruction::BinaryOps Opc =
2110+
cast<BinaryOperator>(AddOrSubInst)->getOpcode();
2111+
switch (Opc) {
2112+
case Instruction::Add:
2113+
Start =
2114+
simplifyAddInst(Start, AddOrSubOffset,
2115+
match(AddOrSubInst, m_NSWAdd(m_Value(), m_Value())),
2116+
/*IsNUW=*/true, Q);
2117+
break;
2118+
case Instruction::Sub:
2119+
Start =
2120+
simplifySubInst(Start, AddOrSubOffset,
2121+
match(AddOrSubInst, m_NSWSub(m_Value(), m_Value())),
2122+
/*IsNUW=*/true, Q);
2123+
break;
2124+
case Instruction::Or:
2125+
Start = simplifyOrInst(Start, AddOrSubOffset, Q);
2126+
break;
2127+
default:
2128+
llvm_unreachable("Unknown offset instruction");
2129+
}
2130+
if (!Start)
2131+
return false;
2132+
}
2133+
20692134
// If we can't fully optimize out the `rem`, skip this transform.
20702135
Start = simplifyURemInst(Start, RemAmt, *DL);
20712136
if (!Start)
@@ -2096,6 +2161,8 @@ static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
20962161

20972162
replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge);
20982163
Rem->eraseFromParent();
2164+
if (AddOrSubInst && AddOrSubInst->use_empty())
2165+
cast<Instruction>(AddOrSubInst)->eraseFromParent();
20992166
return true;
21002167
}
21012168

llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -892,10 +892,12 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_
892892
; CHECK: [[FOR_COND_CLEANUP]]:
893893
; CHECK-NEXT: ret void
894894
; CHECK: [[FOR_BODY]]:
895+
; CHECK-NEXT: [[REM:%.*]] = phi i32 [ 7, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
895896
; CHECK-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
896-
; CHECK-NEXT: [[I_WITH_OFF:%.*]] = add nuw i32 [[I_04]], 5
897-
; CHECK-NEXT: [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
898897
; CHECK-NEXT: tail call void @use.i32(i32 [[REM]])
898+
; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[REM]], 1
899+
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
900+
; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
899901
; CHECK-NEXT: [[INC]] = add nuw i32 [[I_04]], 1
900902
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
901903
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
@@ -930,10 +932,12 @@ define void @simple_urem_to_sel_non_zero_start_through_dis_or(i32 %N, i32 %rem_a
930932
; CHECK: [[FOR_COND_CLEANUP]]:
931933
; CHECK-NEXT: ret void
932934
; CHECK: [[FOR_BODY]]:
935+
; CHECK-NEXT: [[REM:%.*]] = phi i32 [ 10, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
933936
; CHECK-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 2, %[[FOR_BODY_PREHEADER]] ]
934-
; CHECK-NEXT: [[I_WITH_OFF:%.*]] = or disjoint i32 [[I_04]], 8
935-
; CHECK-NEXT: [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
936937
; CHECK-NEXT: tail call void @use.i32(i32 [[REM]])
938+
; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[REM]], 1
939+
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
940+
; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
937941
; CHECK-NEXT: [[INC]] = add nuw i32 [[I_04]], 1
938942
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
939943
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
@@ -1041,10 +1045,12 @@ define void @simple_urem_to_sel_non_zero_start_through_sub(i32 %N, i32 %rem_amt,
10411045
; CHECK: [[FOR_COND_CLEANUP]]:
10421046
; CHECK-NEXT: ret void
10431047
; CHECK: [[FOR_BODY]]:
1048+
; CHECK-NEXT: [[REM:%.*]] = phi i32 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[TMP3:%.*]], %[[FOR_BODY]] ]
10441049
; CHECK-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ]
1045-
; CHECK-NEXT: [[I_WITH_OFF:%.*]] = sub nuw i32 [[I_04]], [[START]]
1046-
; CHECK-NEXT: [[REM:%.*]] = urem i32 [[I_WITH_OFF]], [[REM_AMT]]
10471050
; CHECK-NEXT: tail call void @use.i32(i32 [[REM]])
1051+
; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[REM]], 1
1052+
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[REM_AMT]]
1053+
; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
10481054
; CHECK-NEXT: [[INC]] = add nuw i32 [[I_04]], 1
10491055
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
10501056
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]

0 commit comments

Comments
 (0)