-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[CodeGenPrepare] Folding urem
with loop invariant value as remainder
#96625
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-x86 Author: None (goldsteinn) Changes
Patch is 34.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96625.diff 2 Files Affected:
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 900c33b580f15..d2cc3b4cb326c 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -471,6 +471,7 @@ class CodeGenPrepare {
bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, Value *Arg1,
CmpInst *Cmp, Intrinsic::ID IID);
bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool optimizeRem(Instruction *Rem);
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
void verifyBFIUpdates(Function &F);
@@ -1974,6 +1975,157 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
return true;
}
+static bool isRemOfLoopIncrementWithLIV(Value *Rem, const LoopInfo *LI,
+ Value *&RemAmtOut,
+ std::optional<bool> &AddOrSubOut,
+ Value *&AddOrSubOffsetOut,
+ PHINode *&LoopIncrPNOut) {
+ Value *Incr, *RemAmt;
+ if (!isa<Instruction>(Rem))
+ return false;
+ // NB: If RemAmt is a power of 2 it *should* have been transformed by now.
+ if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt))))
+ return false;
+
+ // Only trivially analyzable loops.
+ Loop *L = LI->getLoopFor(cast<Instruction>(Rem)->getParent());
+ if (L == nullptr || L->getLoopPreheader() == nullptr ||
+ L->getLoopLatch() == nullptr)
+ return false;
+
+ std::optional<bool> AddOrSub;
+ Value *AddOrSubOffset;
+ // Find out loop increment PHI.
+ PHINode *PN = dyn_cast<PHINode>(Incr);
+ if (PN != nullptr) {
+ AddOrSub = std::nullopt;
+ AddOrSubOffset = nullptr;
+ } else {
+ // Search through a NUW add/sub.
+ Value *V0, *V1;
+ if (match(Incr, m_NUWAddLike(m_Value(V0), m_Value(V1))))
+ AddOrSub = true;
+ else if (match(Incr, m_NUWSub(m_Value(V0), m_Value(V1))))
+ AddOrSub = false;
+ else
+ return false;
+
+ PN = dyn_cast<PHINode>(V0);
+ if (PN != nullptr) {
+ AddOrSubOffset = V1;
+ } else if (*AddOrSub) {
+ PN = dyn_cast<PHINode>(V1);
+ AddOrSubOffset = V0;
+ }
+ }
+
+ if (PN == nullptr)
+ return false;
+
+ // This isn't strictly necessary, what we really need is one increment and any
+ // amount of initial values all being the same.
+ if (PN->getNumIncomingValues() != 2)
+ return false;
+
+ // Only works if the remainder amount is a loop invaraint
+ if (!L->isLoopInvariant(RemAmt))
+ return false;
+
+ // Is the PHI a loop increment?
+ auto LoopIncrInfo = getIVIncrement(PN, LI);
+ if (!LoopIncrInfo.has_value())
+ return false;
+
+ // We need remainder_amount % increment_amount to be zero. Increment of one
+ // satisfies that without any special logic and is overwhelmingly the common
+ // case.
+ if (!match(LoopIncrInfo->second, m_One()))
+ return false;
+
+ // Need the increment to not overflow.
+ if (!match(LoopIncrInfo->first, m_NUWAdd(m_Value(), m_Value())))
+ return false;
+
+ // Set output variables.
+ RemAmtOut = RemAmt;
+ LoopIncrPNOut = PN;
+ AddOrSubOut = AddOrSub;
+ AddOrSubOffsetOut = AddOrSubOffset;
+
+ return true;
+}
+
+// Try to transform:
+//
+// for(i = Start; i < End; ++i)
+// Rem = (i nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant;
+//
+// ->
+//
+// Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
+// for(i = Start; i < End; ++i, ++rem)
+// Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
+//
+// Currently only implemented for `Start` and `IncrLoopInvariant` being zero.
+static bool foldURemOfLoopIncrement(Instruction *Rem, const LoopInfo *LI,
+ SmallSet<BasicBlock *, 32> &FreshBBs,
+ bool IsHuge) {
+ std::optional<bool> AddOrSub;
+ Value *AddOrSubOffset, *RemAmt;
+ PHINode *LoopIncrPN;
+ if (!isRemOfLoopIncrementWithLIV(Rem, LI, RemAmt, AddOrSub, AddOrSubOffset,
+ LoopIncrPN))
+ return false;
+
+ // Only non-constant remainder as the extra IV is is probably not profitable
+ // in that case. Further, since remainder amount is non-constant, only handle
+ // case where `IncrLoopInvariant` and `Start` are 0 to entirely eliminate the
+ // rem (as opposed to just hoisting it outside of the loop).
+ //
+ // Potential TODO: Should we have a check for how "nested" this remainder
+ // operation is? The new code runs every iteration so if the remainder is
+ // guarded behind unlikely conditions this might not be worth it.
+ if (AddOrSub.has_value() || match(RemAmt, m_ImmConstant()))
+ return false;
+ Loop *L = LI->getLoopFor(Rem->getParent());
+ if (!match(LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader()),
+ m_Zero()))
+ return false;
+
+ // Create new remainder with induction variable.
+ Type *Ty = Rem->getType();
+ IRBuilder<> Builder(Rem->getContext());
+
+ Builder.SetInsertPoint(LoopIncrPN);
+ PHINode *NewRem = Builder.CreatePHI(Ty, 2);
+
+ Builder.SetInsertPoint(cast<Instruction>(
+ LoopIncrPN->getIncomingValueForBlock(L->getLoopLatch())));
+ // `(add (urem x, y), 1)` is always nuw.
+ Value *RemAdd = Builder.CreateNUWAdd(NewRem, ConstantInt::get(Ty, 1));
+ Value *RemCmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, RemAdd, RemAmt);
+ Value *RemSel =
+ Builder.CreateSelect(RemCmp, Constant::getNullValue(Ty), RemAdd);
+
+ NewRem->addIncoming(Constant::getNullValue(Ty), L->getLoopPreheader());
+ NewRem->addIncoming(RemSel, L->getLoopLatch());
+
+ // Insert all touched BBs.
+ FreshBBs.insert(LoopIncrPN->getParent());
+ FreshBBs.insert(L->getLoopLatch());
+ FreshBBs.insert(Rem->getParent());
+
+ replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge);
+ Rem->eraseFromParent();
+ return true;
+}
+
+bool CodeGenPrepare::optimizeRem(Instruction *Rem) {
+ if (foldURemOfLoopIncrement(Rem, LI, FreshBBs, IsHugeFunc))
+ return true;
+ return false;
+}
+
bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (sinkCmpExpression(Cmp, *TLI))
return true;
@@ -8360,6 +8512,11 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
if (optimizeCmp(Cmp, ModifiedDT))
return true;
+ if (match(I, m_URem(m_Value(), m_Value())) ||
+ match(I, m_SRem(m_Value(), m_Value())))
+ if (optimizeRem(I))
+ return true;
+
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
bool Modified = optimizeLoadExt(LI);
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
new file mode 100644
index 0000000000000..9b093ec259201
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -0,0 +1,864 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare void @use.i32(i32)
+declare void @use.2xi64(<2 x i64>)
+declare void @do_stuff0()
+declare void @do_stuff1()
+declare i1 @get.i1()
+declare i32 @get.i32()
+
+define void @simple_urem_to_sel(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB0_4
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: xorl %r15d, %r15d
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: xorl %r12d, %r12d
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %r14d, %edi
+; CHECK-NEXT: callq use.i32@PLT
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %ebx, %r14d
+; CHECK-NEXT: cmovel %r15d, %r14d
+; CHECK-NEXT: incl %r12d
+; CHECK-NEXT: cmpl %r12d, %ebp
+; CHECK-NEXT: jne .LBB0_2
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB0_4: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_nested2(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_nested2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB1_8
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: xorl %r15d, %r15d
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: xorl %r12d, %r12d
+; CHECK-NEXT: jmp .LBB1_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB1_5: # %for.body1
+; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: movl %r14d, %edi
+; CHECK-NEXT: callq use.i32@PLT
+; CHECK-NEXT: .LBB1_6: # %for.body.tail
+; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %ebx, %r14d
+; CHECK-NEXT: cmovel %r15d, %r14d
+; CHECK-NEXT: incl %r12d
+; CHECK-NEXT: cmpl %r12d, %ebp
+; CHECK-NEXT: je .LBB1_7
+; CHECK-NEXT: .LBB1_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: callq get.i1@PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB1_6
+; CHECK-NEXT: # %bb.3: # %for.body0
+; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: callq get.i1@PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: jne .LBB1_5
+; CHECK-NEXT: # %bb.4: # %for.body2
+; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: callq get.i1@PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: jne .LBB1_5
+; CHECK-NEXT: jmp .LBB1_6
+; CHECK-NEXT: .LBB1_7:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB1_8: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body.tail ], [ 0, %entry ]
+ %cond0 = call i1 @get.i1()
+ br i1 %cond0, label %for.body0, label %for.body.tail
+for.body0:
+ %cond1 = call i1 @get.i1()
+ br i1 %cond1, label %for.body1, label %for.body2
+for.body2:
+ %cond2 = call i1 @get.i1()
+ br i1 %cond2, label %for.body1, label %for.body.tail
+for.body1:
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ br label %for.body.tail
+for.body.tail:
+ %inc = add nuw i32 %i.04, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr3(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_incr3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB2_9
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: jmp .LBB2_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB2_6: # %for.body1
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: movl %ebp, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32@PLT
+; CHECK-NEXT: .LBB2_7: # %for.body.tail
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: callq get.i1@PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: jne .LBB2_8
+; CHECK-NEXT: .LBB2_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: callq get.i1@PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB2_5
+; CHECK-NEXT: # %bb.3: # %for.body0
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: callq get.i1@PLT
+; CHECK-NEXT: movl %eax, %r14d
+; CHECK-NEXT: callq get.i32@PLT
+; CHECK-NEXT: testb $1, %r14b
+; CHECK-NEXT: je .LBB2_7
+; CHECK-NEXT: # %bb.4: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: movl %eax, %ebp
+; CHECK-NEXT: incl %ebp
+; CHECK-NEXT: jmp .LBB2_6
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB2_5: # %for.body2
+; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: callq get.i1@PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: jne .LBB2_6
+; CHECK-NEXT: jmp .LBB2_7
+; CHECK-NEXT: .LBB2_8:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB2_9: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %cond0 = call i1 @get.i1()
+ br i1 %cond0, label %for.body0, label %for.body2
+for.body0:
+ %cond1 = call i1 @get.i1()
+ %val = call i32 @get.i32()
+ %inc = add nuw i32 %val, 1
+ br i1 %cond1, label %for.body1, label %for.body.tail
+for.body2:
+ %cond2 = call i1 @get.i1()
+ br i1 %cond2, label %for.body1, label %for.body.tail
+for.body1:
+ %i.04 = phi i32 [ %inc, %for.body0], [ 0, %for.body2 ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ br label %for.body.tail
+for.body.tail:
+ %exitcond.not = call i1 @get.i1()
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_vec:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB3_1: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: callq use.2xi64@PLT
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
+; CHECK-NEXT: psubq %xmm1, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm2, %xmm3
+; CHECK-NEXT: pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; CHECK-NEXT: pand %xmm0, %xmm2
+; CHECK-NEXT: pandn %xmm3, %xmm2
+; CHECK-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psubq %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq get.i1@PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: je .LBB3_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: retq
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi <2 x i64> [ %inc, %for.body ], [ zeroinitializer, %entry ]
+ %rem = urem <2 x i64> %i.04, %rem_amt
+ tail call void @use.2xi64(<2 x i64> %rem)
+ %inc = add nuw <2 x i64> %i.04, <i64 1, i64 1>
+ %exitcond.not = call i1 @get.i1()
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_incr:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB4_6
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: jmp .LBB4_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB4_4: # %for.body.tail
+; CHECK-NEXT: # in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: movl %r14d, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: callq use.i32@PLT
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %ebp, %r14d
+; CHECK-NEXT: je .LBB4_5
+; CHECK-NEXT: .LBB4_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: callq get.i1@PLT
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB4_4
+; CHECK-NEXT: # %bb.3: # %for.body0
+; CHECK-NEXT: # in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: callq get.i32@PLT
+; CHECK-NEXT: movl %eax, %r14d
+; CHECK-NEXT: jmp .LBB4_4
+; CHECK-NEXT: .LBB4_5:
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB4_6: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp eq i32 %N, 0
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.03 = phi i32 [ %inc, %for.body.tail ], [ 0, %entry ]
+ %cond0 = call i1 @get.i1()
+ br i1 %cond0, label %for.body0, label %for.body.tail
+for.body0:
+ %some_val = call i32 @get.i32()
+ br label %for.body.tail
+
+for.body.tail:
+ %i.04 = phi i32 [ %i.03, %for.body ], [ %some_val, %for.body0 ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_second_acc(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_second_acc:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $2, %edi
+; CHECK-NEXT: jb .LBB5_4
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: movl $1, %r15d
+; CHECK-NEXT: xorl %r12d, %r12d
+; CHECK-NEXT: xorl %r14d, %r14d
+; CHECK-NEXT: xorl %r13d, %r13d
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB5_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %r14d, %edi
+; CHECK-NEXT: callq use.i32@PLT
+; CHECK-NEXT: incl %r14d
+; CHECK-NEXT: cmpl %ebx, %r14d
+; CHECK-NEXT: cmovel %r12d, %r14d
+; CHECK-NEXT: incl %r13d
+; CHECK-NEXT: addl $2, %r15d
+; CHECK-NEXT: cmpl %ebp, %r15d
+; CHECK-NEXT: jbe .LBB5_2
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .LBB5_4: # %for.cond.cleanup
+; CHECK-NEXT: retq
+entry:
+ %cmp3.not = icmp ult i32 %N, 2
+ br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %i.05 = phi i32 [ %inc2, %for.body ], [ 1, %entry ]
+ %rem = urem i32 %i.04, %rem_amt
+ tail call void @use.i32(i32 %rem)
+ %inc = add nuw i32 %i.04, 1
+ %inc2 = add nuw i32 %i.05, 2
+ %exitcond.not = icmp ugt i32 %inc2, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_srem(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_srem:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB6_4
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT...
[truncated]
|
urem
with loop invariant value as remainder
The goal is to extend this in the near future. I was hoping to get feedback on the approach. Mostly is CGP a reasonable place to put this? AFAICT LLVM doesn't have much infrastructure for generic strength reduction. The other places I considered were: LoopStrengthReduction, however, seems to only be geared towards strength reduction for the sake of address calculation, and IndVar seems geared towards reduction in the number of IVs, not the other way around. My rationale for putting it here are
If this patch looks good (or at least is headed in the right direction) my plan is to add support for non-zero |
Can you try again with "--src-unroll=xxx --tgt-unroll=xxx"? |
Ah, didn't know about that: https://alive2.llvm.org/ce/z/BsyXdJ (anything past i3/unroll of 4 was taking too long). Edit: Was able to verify 6x w/ i4 locally. Think given the arbitrary start/incr/end point that should cover it (along w/ the C++ exhaustive tests). |
Affected files:
|
616d93c
to
c31106b
Compare
NB: llvm-test-suite passes. |
ping |
@@ -0,0 +1,884 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | |||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be better to convert this into an opt test.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s | |
; RUN: opt -codegenprepare -S < %s | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need the triple (for CGP), but making it an opt test.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, not sure why, but most of the tests fail w/ opt. Not sure why that is....
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you using -passes=codegenprepare
? IIRC CGP hasn't been ported to new pass manager.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, the issue is that just opt -codegenprepare
doesn't actually run the loop analysis passes which this optimization requires. I can get it working if I do something like add -licm
. Do you know the arguments to run the loop analysis pass as a pre-req?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LI = &AM.getResult<LoopAnalysis>(F); |
IIRC it always requires LI. It is weird.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
llc runs LoopSimplify
before CodeGenPrepare
, which introduces pre-headers for loops in InsertPreheaderForLoop
.
See also
llvm-project/llvm/test/CodeGen/RISCV/O3-pipeline.ll
Lines 42 to 73 in f0eb558
; CHECK-NEXT: Canonicalize natural loops | |
; CHECK-NEXT: Scalar Evolution Analysis | |
; CHECK-NEXT: Loop Pass Manager | |
; CHECK-NEXT: Canonicalize Freeze Instructions in Loops | |
; CHECK-NEXT: Induction Variable Users | |
; CHECK-NEXT: Loop Strength Reduction | |
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) | |
; CHECK-NEXT: Function Alias Analysis Results | |
; CHECK-NEXT: Merge contiguous icmps into a memcmp | |
; CHECK-NEXT: Natural Loop Information | |
; CHECK-NEXT: Lazy Branch Probability Analysis | |
; CHECK-NEXT: Lazy Block Frequency Analysis | |
; CHECK-NEXT: Expand memcmp() to load/stores | |
; CHECK-NEXT: Lower Garbage Collection Instructions | |
; CHECK-NEXT: Shadow Stack GC Lowering | |
; CHECK-NEXT: Lower constant intrinsics | |
; CHECK-NEXT: Remove unreachable blocks from the CFG | |
; CHECK-NEXT: Natural Loop Information | |
; CHECK-NEXT: Post-Dominator Tree Construction | |
; CHECK-NEXT: Branch Probability Analysis | |
; CHECK-NEXT: Block Frequency Analysis | |
; CHECK-NEXT: Constant Hoisting | |
; CHECK-NEXT: Replace intrinsics with calls to vector library | |
; CHECK-NEXT: Partially inline calls to library functions | |
; CHECK-NEXT: Expand vector predication intrinsics | |
; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) | |
; CHECK-NEXT: Scalarize Masked Memory Intrinsics | |
; CHECK-NEXT: Expand reduction intrinsics | |
; CHECK-NEXT: Natural Loop Information | |
; CHECK-NEXT: TLS Variable Hoist | |
; CHECK-NEXT: Type Promotion | |
; CHECK-NEXT: CodeGen Prepare |
Canonicalize natural loops
stands for LoopSimplify
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, ill try loopsimplify + codegenprepare passes for the opt
tests.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
c31106b
to
446e104
Compare
ping |
446e104
to
3092c9e
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we drop the CodeGen/X86
tests?
I kept them because the pipeline isn't quite the same as what we get w/ just |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Please wait for additional approval from other reviewers :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some initial comments, the AddSub stuff is very confusing to me.
llvm/lib/CodeGen/CodeGenPrepare.cpp
Outdated
// Potential TODO(2): Should we have a check for how "nested" this remainder | ||
// operation is? The new code runs every iteration so if the remainder is | ||
// guarded behind unlikely conditions this might not be worth it. | ||
if (AddOrSub.has_value() || match(RemAmt, m_ImmConstant())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm confused. Why does the whole code for determining AddOrSub exist if we just bail out here anyway?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So my initial rationale was isRemOfLoopIncrementWithLoopInvariant
is essentially a matcher and we can hypothetically handle the add/sub pattern but it seemed overkill here.
But I guess there is a fair argument its untested so ill extend the impl to handle if we can simplify the pre-header urem.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove all the unnecessary AddSub handling instead. Add it in a followup if you can prove usefulness.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added test cases for it + enabled it.
A common (motivating) case is:
for(i = 0; i < N; ++i) {
if((i + 1) % RemAmt == 0) do_occasional_thing_but_not_on_first_iterations();
// normal loop body
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Saw your comment late, ill drop if you insist, but think it is a valuable part.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you insist?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, done
f766385
to
e88b6f1
Compare
``` for(i = Start; i < End; ++i) Rem = (i nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant; ``` -> ``` Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant; for(i = Start; i < End; ++i, ++rem) Rem = rem == RemAmtLoopInvariant ? 0 : Rem; ``` In its current state, only if `IncrLoopInvariant` and `Start` both being zero. Alive2 seemed unable to prove this (see: https://alive2.llvm.org/ce/z/ATGDp3 which is clearly wrong but still checks out...) so wrote an exhaustive test here: https://godbolt.org/z/WYa561388
e88b6f1
to
9b88f56
Compare
ping |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - @nikic anything else?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks reasonable to me. I did not inspect test coverage in detail.
|
||
// Find out loop increment PHI. | ||
auto *PN = dyn_cast<PHINode>(Incr); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Drop this newline.
|
||
// Only trivially analyzable loops. | ||
Loop *L = LI->getLoopFor(Rem->getParent()); | ||
if (!L || !L->getLoopPreheader() || !L->getLoopLatch()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIRC getLoopPreheader is expensive, so maybe move this after the phi check.
|
||
// getIVIncrement finds the loop at PN->getParent(). This might be a different | ||
// loop from the loop with Rem->getParent(). | ||
if (L->getHeader() != PN->getParent()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think technically the right thing to do would be to have L
be the loop of the phi node, not the rem, and only check that L also contains rem (which is not guaranteed as we don't require LCSSA in CGP). The loop of the phi node is really what you care about, the urem itself can be in a nested loop without issue (I think?)
But I agree it's best to ignore this case for now.
@goldsteinn Looks like c64ce8b might break aarch64 stage2. Guessing miscompilation. |
Yeah I see, I am trying to repro locally. I'll revert for now as not sure of the exact issue. |
Reverted: 731ae69 |
Im pretty sure I see the bug. The issue is that while |
No bug was different:
If the loop was multiple BBs we would be inserting rem in wrong locations... |
Actually still not 100% sure, still havent built a repro. |
Okay fairly certain I found the bug. Reproduced (ish) with a cross build and think the issue is the
|
urem
with loop invariant value; NFCurem
with loop invariant value as remainder->
In its current state, only if
IncrLoopInvariant
is zero.Alive2 w/ i4/unrolled 6x (needs to be ran locally due to timeout): https://alive2.llvm.org/ce/z/6tgyN3
Exhaust proof over all
uint8_t
combinations in C++: https://godbolt.org/z/WYa561388