
[CodeGenPrepare] Folding urem with loop invariant value as remainder #96625


Closed
wants to merge 7 commits

Conversation

goldsteinn
Contributor

@goldsteinn goldsteinn commented Jun 25, 2024

  • [CodeGenPrepare][X86] Add tests for folding urem with loop invariant value; NFC
  • [CodeGenPrepare] Folding urem with loop invariant value as remainder
for(i = Start; i < End; ++i)
   Rem = (i nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant;

->

Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
for(i = Start; i < End; ++i, ++rem)
   Rem = rem == RemAmtLoopInvariant ? 0 : Rem;

In its current state, this fires only if `IncrLoopInvariant` and `Start` are both zero.

Alive2 w/ i4/unrolled 6x (needs to be run locally due to timeout): https://alive2.llvm.org/ce/z/6tgyN3
Exhaustive proof over all uint8_t combinations in C++: https://godbolt.org/z/WYa561388

@llvmbot
Member

llvmbot commented Jun 25, 2024

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-backend-x86

Author: None (goldsteinn)

Changes
  • [CodeGenPrepare][X86] Add tests for folding urem with loop invariant value; NFC
  • [CodeGenPrepare] Folding urem with loop invariant value

Patch is 34.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96625.diff

2 Files Affected:

  • (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+157)
  • (added) llvm/test/CodeGen/X86/fold-loop-of-urem.ll (+864)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 900c33b580f15..d2cc3b4cb326c 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -471,6 +471,7 @@ class CodeGenPrepare {
   bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, Value *Arg1,
                                    CmpInst *Cmp, Intrinsic::ID IID);
   bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT);
+  bool optimizeRem(Instruction *Rem);
   bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   void verifyBFIUpdates(Function &F);
@@ -1974,6 +1975,157 @@ static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
   return true;
 }
 
+static bool isRemOfLoopIncrementWithLIV(Value *Rem, const LoopInfo *LI,
+                                        Value *&RemAmtOut,
+                                        std::optional<bool> &AddOrSubOut,
+                                        Value *&AddOrSubOffsetOut,
+                                        PHINode *&LoopIncrPNOut) {
+  Value *Incr, *RemAmt;
+  if (!isa<Instruction>(Rem))
+    return false;
+  // NB: If RemAmt is a power of 2 it *should* have been transformed by now.
+  if (!match(Rem, m_URem(m_Value(Incr), m_Value(RemAmt))))
+    return false;
+
+  // Only trivially analyzable loops.
+  Loop *L = LI->getLoopFor(cast<Instruction>(Rem)->getParent());
+  if (L == nullptr || L->getLoopPreheader() == nullptr ||
+      L->getLoopLatch() == nullptr)
+    return false;
+
+  std::optional<bool> AddOrSub;
+  Value *AddOrSubOffset;
+  // Find out loop increment PHI.
+  PHINode *PN = dyn_cast<PHINode>(Incr);
+  if (PN != nullptr) {
+    AddOrSub = std::nullopt;
+    AddOrSubOffset = nullptr;
+  } else {
+    // Search through a NUW add/sub.
+    Value *V0, *V1;
+    if (match(Incr, m_NUWAddLike(m_Value(V0), m_Value(V1))))
+      AddOrSub = true;
+    else if (match(Incr, m_NUWSub(m_Value(V0), m_Value(V1))))
+      AddOrSub = false;
+    else
+      return false;
+
+    PN = dyn_cast<PHINode>(V0);
+    if (PN != nullptr) {
+      AddOrSubOffset = V1;
+    } else if (*AddOrSub) {
+      PN = dyn_cast<PHINode>(V1);
+      AddOrSubOffset = V0;
+    }
+  }
+
+  if (PN == nullptr)
+    return false;
+
+  // This isn't strictly necessary, what we really need is one increment and any
+  // amount of initial values all being the same.
+  if (PN->getNumIncomingValues() != 2)
+    return false;
+
+  // Only works if the remainder amount is a loop invariant.
+  if (!L->isLoopInvariant(RemAmt))
+    return false;
+
+  // Is the PHI a loop increment?
+  auto LoopIncrInfo = getIVIncrement(PN, LI);
+  if (!LoopIncrInfo.has_value())
+    return false;
+
+  // We need remainder_amount % increment_amount to be zero. Increment of one
+  // satisfies that without any special logic and is overwhelmingly the common
+  // case.
+  if (!match(LoopIncrInfo->second, m_One()))
+    return false;
+
+  // Need the increment to not overflow.
+  if (!match(LoopIncrInfo->first, m_NUWAdd(m_Value(), m_Value())))
+    return false;
+
+  // Set output variables.
+  RemAmtOut = RemAmt;
+  LoopIncrPNOut = PN;
+  AddOrSubOut = AddOrSub;
+  AddOrSubOffsetOut = AddOrSubOffset;
+
+  return true;
+}
+
+// Try to transform:
+//
+// for(i = Start; i < End; ++i)
+//    Rem = (i nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant;
+//
+// ->
+//
+// Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
+// for(i = Start; i < End; ++i, ++rem)
+//    Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
+//
+// Currently only implemented for `Start` and `IncrLoopInvariant` being zero.
+static bool foldURemOfLoopIncrement(Instruction *Rem, const LoopInfo *LI,
+                                   SmallSet<BasicBlock *, 32> &FreshBBs,
+                                   bool IsHuge) {
+  std::optional<bool> AddOrSub;
+  Value *AddOrSubOffset, *RemAmt;
+  PHINode *LoopIncrPN;
+  if (!isRemOfLoopIncrementWithLIV(Rem, LI, RemAmt, AddOrSub, AddOrSubOffset,
+                                   LoopIncrPN))
+    return false;
+
+  // Only non-constant remainder as the extra IV is probably not profitable
+  // in that case. Further, since remainder amount is non-constant, only handle
+  // case where `IncrLoopInvariant` and `Start` are 0 to entirely eliminate the
+  // rem (as opposed to just hoisting it outside of the loop).
+  //
+  // Potential TODO: Should we have a check for how "nested" this remainder
+  // operation is? The new code runs every iteration so if the remainder is
+  // guarded behind unlikely conditions this might not be worth it.
+  if (AddOrSub.has_value() || match(RemAmt, m_ImmConstant()))
+    return false;
+  Loop *L = LI->getLoopFor(Rem->getParent());
+  if (!match(LoopIncrPN->getIncomingValueForBlock(L->getLoopPreheader()),
+             m_Zero()))
+    return false;
+
+  // Create new remainder with induction variable.
+  Type *Ty = Rem->getType();
+  IRBuilder<> Builder(Rem->getContext());
+
+  Builder.SetInsertPoint(LoopIncrPN);
+  PHINode *NewRem = Builder.CreatePHI(Ty, 2);
+
+  Builder.SetInsertPoint(cast<Instruction>(
+      LoopIncrPN->getIncomingValueForBlock(L->getLoopLatch())));
+  // `(add (urem x, y), 1)` is always nuw.
+  Value *RemAdd = Builder.CreateNUWAdd(NewRem, ConstantInt::get(Ty, 1));
+  Value *RemCmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, RemAdd, RemAmt);
+  Value *RemSel =
+      Builder.CreateSelect(RemCmp, Constant::getNullValue(Ty), RemAdd);
+
+  NewRem->addIncoming(Constant::getNullValue(Ty), L->getLoopPreheader());
+  NewRem->addIncoming(RemSel, L->getLoopLatch());
+
+  // Insert all touched BBs.
+  FreshBBs.insert(LoopIncrPN->getParent());
+  FreshBBs.insert(L->getLoopLatch());
+  FreshBBs.insert(Rem->getParent());
+
+  replaceAllUsesWith(Rem, NewRem, FreshBBs, IsHuge);
+  Rem->eraseFromParent();
+  return true;
+}
+
+bool CodeGenPrepare::optimizeRem(Instruction *Rem) {
+  if (foldURemOfLoopIncrement(Rem, LI, FreshBBs, IsHugeFunc))
+    return true;
+  return false;
+}
+
 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
   if (sinkCmpExpression(Cmp, *TLI))
     return true;
@@ -8360,6 +8512,11 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (optimizeCmp(Cmp, ModifiedDT))
       return true;
 
+  if (match(I, m_URem(m_Value(), m_Value())) ||
+      match(I, m_SRem(m_Value(), m_Value())))
+    if (optimizeRem(I))
+      return true;
+
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     bool Modified = optimizeLoadExt(LI);
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
new file mode 100644
index 0000000000000..9b093ec259201
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -0,0 +1,864 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare void @use.i32(i32)
+declare void @use.2xi64(<2 x i64>)
+declare void @do_stuff0()
+declare void @do_stuff1()
+declare i1 @get.i1()
+declare i32 @get.i32()
+
+define void @simple_urem_to_sel(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB0_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r15d, %r15d
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r12d, %r12d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %ebx, %r14d
+; CHECK-NEXT:    cmovel %r15d, %r14d
+; CHECK-NEXT:    incl %r12d
+; CHECK-NEXT:    cmpl %r12d, %ebp
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB0_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_nested2(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_nested2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB1_8
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r15d, %r15d
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r12d, %r12d
+; CHECK-NEXT:    jmp .LBB1_2
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB1_5: # %for.body1
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:  .LBB1_6: # %for.body.tail
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %ebx, %r14d
+; CHECK-NEXT:    cmovel %r15d, %r14d
+; CHECK-NEXT:    incl %r12d
+; CHECK-NEXT:    cmpl %r12d, %ebp
+; CHECK-NEXT:    je .LBB1_7
+; CHECK-NEXT:  .LBB1_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB1_6
+; CHECK-NEXT:  # %bb.3: # %for.body0
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB1_5
+; CHECK-NEXT:  # %bb.4: # %for.body2
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB1_5
+; CHECK-NEXT:    jmp .LBB1_6
+; CHECK-NEXT:  .LBB1_7:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB1_8: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body.tail ], [ 0, %entry ]
+  %cond0 = call i1 @get.i1()
+  br i1 %cond0, label %for.body0, label %for.body.tail
+for.body0:
+  %cond1 = call i1 @get.i1()
+  br i1 %cond1, label %for.body1, label %for.body2
+for.body2:
+  %cond2 = call i1 @get.i1()
+  br i1 %cond2, label %for.body1, label %for.body.tail
+for.body1:
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  br label %for.body.tail
+for.body.tail:
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr3(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_incr3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB2_9
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    jmp .LBB2_2
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB2_6: # %for.body1
+; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    movl %ebp, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:  .LBB2_7: # %for.body.tail
+; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB2_8
+; CHECK-NEXT:  .LBB2_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB2_5
+; CHECK-NEXT:  # %bb.3: # %for.body0
+; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    movl %eax, %r14d
+; CHECK-NEXT:    callq get.i32@PLT
+; CHECK-NEXT:    testb $1, %r14b
+; CHECK-NEXT:    je .LBB2_7
+; CHECK-NEXT:  # %bb.4: # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    movl %eax, %ebp
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    jmp .LBB2_6
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB2_5: # %for.body2
+; CHECK-NEXT:    # in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    jne .LBB2_6
+; CHECK-NEXT:    jmp .LBB2_7
+; CHECK-NEXT:  .LBB2_8:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB2_9: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %cond0 = call i1 @get.i1()
+  br i1 %cond0, label %for.body0, label %for.body2
+for.body0:
+  %cond1 = call i1 @get.i1()
+  %val = call i32 @get.i32()
+  %inc = add nuw i32 %val, 1
+  br i1 %cond1, label %for.body1, label %for.body.tail
+for.body2:
+  %cond2 = call i1 @get.i1()
+  br i1 %cond2, label %for.body1, label %for.body.tail
+for.body1:
+  %i.04 = phi i32 [ %inc, %for.body0], [ 0, %for.body2 ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  br label %for.body.tail
+for.body.tail:
+  %exitcond.not = call i1 @get.i1()
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_vec(<2 x i64> %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_vec:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $56, %rsp
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB3_1: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    callq use.2xi64@PLT
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    movdqa (%rsp), %xmm2 # 16-byte Reload
+; CHECK-NEXT:    psubq %xmm1, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm2, %xmm3
+; CHECK-NEXT:    pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; CHECK-NEXT:    pand %xmm0, %xmm2
+; CHECK-NEXT:    pandn %xmm3, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    psubq %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    je .LBB3_1
+; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT:    addq $56, %rsp
+; CHECK-NEXT:    retq
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi <2 x i64> [ %inc, %for.body ], [ zeroinitializer, %entry ]
+  %rem = urem <2 x i64> %i.04, %rem_amt
+  tail call void @use.2xi64(<2 x i64> %rem)
+  %inc = add nuw <2 x i64> %i.04, <i64 1, i64 1>
+  %exitcond.not = call i1 @get.i1()
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_bad_incr(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_bad_incr:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB4_6
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    jmp .LBB4_2
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB4_4: # %for.body.tail
+; CHECK-NEXT:    # in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    movl %r14d, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    divl %ebx
+; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %ebp, %r14d
+; CHECK-NEXT:    je .LBB4_5
+; CHECK-NEXT:  .LBB4_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    callq get.i1@PLT
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB4_4
+; CHECK-NEXT:  # %bb.3: # %for.body0
+; CHECK-NEXT:    # in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    callq get.i32@PLT
+; CHECK-NEXT:    movl %eax, %r14d
+; CHECK-NEXT:    jmp .LBB4_4
+; CHECK-NEXT:  .LBB4_5:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB4_6: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp eq i32 %N, 0
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.03 = phi i32 [ %inc, %for.body.tail ], [ 0, %entry ]
+  %cond0 = call i1 @get.i1()
+  br i1 %cond0, label %for.body0, label %for.body.tail
+for.body0:
+  %some_val = call i32 @get.i32()
+  br label %for.body.tail
+
+for.body.tail:
+  %i.04 = phi i32 [ %i.03, %for.body ], [ %some_val, %for.body0 ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_to_sel_second_acc(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_to_sel_second_acc:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $2, %edi
+; CHECK-NEXT:    jb .LBB5_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    movl $1, %r15d
+; CHECK-NEXT:    xorl %r12d, %r12d
+; CHECK-NEXT:    xorl %r14d, %r14d
+; CHECK-NEXT:    xorl %r13d, %r13d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB5_2: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq use.i32@PLT
+; CHECK-NEXT:    incl %r14d
+; CHECK-NEXT:    cmpl %ebx, %r14d
+; CHECK-NEXT:    cmovel %r12d, %r14d
+; CHECK-NEXT:    incl %r13d
+; CHECK-NEXT:    addl $2, %r15d
+; CHECK-NEXT:    cmpl %ebp, %r15d
+; CHECK-NEXT:    jbe .LBB5_2
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:  .LBB5_4: # %for.cond.cleanup
+; CHECK-NEXT:    retq
+entry:
+  %cmp3.not = icmp ult i32 %N, 2
+  br i1 %cmp3.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %i.05 = phi i32 [ %inc2, %for.body ], [ 1, %entry ]
+  %rem = urem i32 %i.04, %rem_amt
+  tail call void @use.i32(i32 %rem)
+  %inc = add nuw i32 %i.04, 1
+  %inc2 = add nuw i32 %i.05, 2
+  %exitcond.not = icmp ugt i32 %inc2, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @simple_urem_fail_srem(i32 %N, i32 %rem_amt) nounwind {
+; CHECK-LABEL: simple_urem_fail_srem:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB6_4
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT...
[truncated]

@goldsteinn goldsteinn changed the title goldsteinn/cgp urem of liv [CodeGenPrepare] Folding urem with loop invariant value as remainder Jun 25, 2024
@goldsteinn
Contributor Author

The goal is to extend this in the near future. I was hoping to get feedback on the approach. Mostly: is CGP a reasonable place to put this?

AFAICT LLVM doesn't have much infrastructure for generic strength reduction. The other places I considered were LoopStrengthReduction and IndVars.

LoopStrengthReduction, however, seems to be geared only towards strength reduction for the sake of address calculation, and IndVars seems geared towards reducing the number of IVs, not the other way around.

My rationale for putting it here is:

  1. It's late in the pipeline, so its arguably less canonical form won't interfere with other folds.
  2. At this point we can be pretty certain the remainder isn't a power of 2 (or at least we won't be able to prove that it is).
  3. This seemed like the only place in LLVM I could find where we both have access to loop constructs / IR and are willing to essentially further complicate the code (other than vectorization stuff).

If this patch looks good (or at least is headed in the right direction), my plan is to add support for non-zero Start/IncrLoopInvariant and add special support for its usage in an icmp / and (br (icmp...)), as both of those cases show up a lot in code and we can piggyback on the existing icmp/br to optimize the transform result.

@goldsteinn goldsteinn requested review from nikic, dtcxzyw and RKSimon June 25, 2024 12:18
@dtcxzyw
Member

dtcxzyw commented Jun 25, 2024

Alive2 seemed unable to prove this (see:
https://alive2.llvm.org/ce/z/ATGDp3 which is clearly wrong but still
checks out...)

Can you try again with "--src-unroll=xxx --tgt-unroll=xxx"?

@goldsteinn
Contributor Author

goldsteinn commented Jun 25, 2024

Alive2 seemed unable to prove this (see:
https://alive2.llvm.org/ce/z/ATGDp3 which is clearly wrong but still
checks out...)

Can you try again with "--src-unroll=xxx --tgt-unroll=xxx"?

Ah, didn't know about that: https://alive2.llvm.org/ce/z/BsyXdJ (anything past i3/unroll of 4 was taking too long).

Edit: Was able to verify 6x w/ i4 locally. Think given the arbitrary start/incr/end point that should cover it (along w/ the C++ exhaustive tests).

@dtcxzyw
Member

dtcxzyw commented Jun 25, 2024

Affected files:

abseil-cpp/optimized/duration_test.cc.ll
abseil-cpp/optimized/examples_test.cc.ll
abseil-cpp/optimized/explicit_seed_seq_test.cc.ll
abseil-cpp/optimized/nonsecure_base_test.cc.ll
abseil-cpp/optimized/pcg_engine_test.cc.ll
abseil-cpp/optimized/randen_engine_test.cc.ll
abseil-cpp/optimized/salted_seed_seq_test.cc.ll
abseil-cpp/optimized/seed_sequences_test.cc.ll
actix-rs/optimized/4i8sqy4dbcgvpe7w.ll
assimp/optimized/clipper.cpp.ll
brotli/optimized/block_splitter.c.ll
cpp-httplib/optimized/httplib.cc.ll
darktable/optimized/ArwDecoder.cpp.ll
darktable/optimized/introspection_colorbalancergb.c.ll
darktable/optimized/introspection_demosaic.c.ll
folly/optimized/Random.cpp.ll
graphviz/optimized/gvrender_core_mp.c.ll
gromacs/optimized/gmx_clustsize.cpp.ll
gromacs/optimized/gmx_xpm2ps.cpp.ll
harfbuzz/optimized/hb-subset.cc.ll
hyperscan/optimized/rose_build_bytecode.cpp.ll
image-rs/optimized/1clnprdgqfw2q9lq.ll
libevent/optimized/evutil_rand.c.ll
meshlab/optimized/ml_scene_gl_shared_data_context.cpp.ll
meshlab/optimized/trackmode.cpp.ll
minetest/optimized/texturesource.cpp.ll
ocio/optimized/MatrixOpData.cpp.ll
oiio/optimized/tiffinput.cpp.ll
php/optimized/string.ll
wasmtime-rs/optimized/4qgt4edt0wnnlcua.ll
wireshark/optimized/packet-pkcs12.c.ll

@goldsteinn goldsteinn force-pushed the goldsteinn/cgp-urem-of-liv branch from 616d93c to c31106b Compare June 26, 2024 03:20
@goldsteinn
Contributor Author

NB: llvm-test-suite passes.

@goldsteinn
Contributor Author

ping

@@ -0,0 +1,884 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
Member

It would be better to convert this into an opt test.

Suggested change
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; RUN: opt -codegenprepare -S < %s | FileCheck %s

Contributor Author

Need the triple (for CGP), but making it an opt test.

Contributor Author

Actually, most of the tests fail w/ opt, and I'm not sure why that is...

Member

Are you using -passes=codegenprepare? IIRC CGP hasn't been ported to new pass manager.

Contributor Author

Okay, the issue is that just opt -codegenprepare doesn't actually run the loop analysis passes which this optimization requires. I can get it working if I do something like add -licm. Do you know the arguments to run the loop analysis pass as a pre-req?

Member

LI = &AM.getResult<LoopAnalysis>(F);

IIRC it always requires LI. It is weird.

Member

llc runs LoopSimplify before CodeGenPrepare, which introduces pre-headers for loops in InsertPreheaderForLoop.

See also

; CHECK-NEXT: Canonicalize natural loops
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Loop Pass Manager
; CHECK-NEXT: Canonicalize Freeze Instructions in Loops
; CHECK-NEXT: Induction Variable Users
; CHECK-NEXT: Loop Strength Reduction
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Merge contiguous icmps into a memcmp
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Expand memcmp() to load/stores
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
; CHECK-NEXT: Lower constant intrinsics
; CHECK-NEXT: Remove unreachable blocks from the CFG
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Post-Dominator Tree Construction
; CHECK-NEXT: Branch Probability Analysis
; CHECK-NEXT: Block Frequency Analysis
; CHECK-NEXT: Constant Hoisting
; CHECK-NEXT: Replace intrinsics with calls to vector library
; CHECK-NEXT: Partially inline calls to library functions
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: TLS Variable Hoist
; CHECK-NEXT: Type Promotion
; CHECK-NEXT: CodeGen Prepare

Canonicalize natural loops stands for LoopSimplify.

Contributor Author

Okay, I'll try loop-simplify + codegenprepare passes for the opt tests.

Contributor Author

done

@goldsteinn goldsteinn force-pushed the goldsteinn/cgp-urem-of-liv branch from c31106b to 446e104 Compare July 3, 2024 16:03
@goldsteinn
Contributor Author

ping

@goldsteinn goldsteinn force-pushed the goldsteinn/cgp-urem-of-liv branch from 446e104 to 3092c9e Compare July 17, 2024 14:21
Member

@dtcxzyw dtcxzyw left a comment

Should we drop the CodeGen/X86 tests?

@goldsteinn
Contributor Author

Should we drop the CodeGen/X86 tests?

I kept them because the pipeline isn't quite the same as what we get w/ just -loop-simplify -codegenprepare.

Member

@dtcxzyw dtcxzyw left a comment

LGTM. Please wait for additional approval from other reviewers :)

@goldsteinn
Contributor Author

ping @nikic @RKSimon

Contributor

@nikic nikic left a comment

Some initial comments; the AddSub stuff is very confusing to me.

// Potential TODO(2): Should we have a check for how "nested" this remainder
// operation is? The new code runs every iteration so if the remainder is
// guarded behind unlikely conditions this might not be worth it.
if (AddOrSub.has_value() || match(RemAmt, m_ImmConstant()))
Contributor

I'm confused. Why does the whole code for determining AddOrSub exist if we just bail out here anyway?

Contributor Author

My initial rationale was that isRemOfLoopIncrementWithLoopInvariant is essentially a matcher, and we could hypothetically handle the add/sub pattern, but it seemed overkill here.

But I guess there is a fair argument that it's untested, so I'll extend the implementation to handle the case where we can simplify the pre-header urem.

Contributor

Please remove all the unnecessary AddSub handling instead. Add it in a followup if you can prove usefulness.

Contributor Author

I added test cases for it + enabled it.

A common (motivating) case is:

for(i = 0; i < N; ++i) {
  if((i + 1) % RemAmt == 0) do_occasional_thing_but_not_on_first_iterations();
  // normal loop body
}
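For illustration, a minimal C sketch of what the fold does to this pattern (hypothetical names, not the generated IR; `hits` stands in for the occasional work): the per-iteration urem is replaced by an incrementing counter with a compare-and-reset.

```c
#include <assert.h>

/* Baseline: computes (i + 1) % rem_amt on every iteration. */
static unsigned hits_with_urem(unsigned n, unsigned rem_amt) {
  unsigned hits = 0;
  for (unsigned i = 0; i < n; ++i)
    if ((i + 1) % rem_amt == 0)
      ++hits; /* stand-in for do_occasional_thing_but_not_on_first_iterations */
  return hits;
}

/* After the fold: seed the counter once in the preheader, then only
   increment it and reset it to 0 when it reaches rem_amt. */
static unsigned hits_with_counter(unsigned n, unsigned rem_amt) {
  unsigned hits = 0;
  unsigned rem = 1 % rem_amt; /* (Start nuw+ IncrLoopInvariant) u% RemAmt */
  for (unsigned i = 0; i < n; ++i) {
    if (rem == 0)
      ++hits;
    if (++rem == rem_amt) /* counter update replaces the urem */
      rem = 0;
  }
  return hits;
}
```

The counter version trades a division per iteration for an add and a compare, which is the whole point of the transform.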

Contributor Author

Saw your comment late; I'll drop it if you insist, but I think it is a valuable part.

Contributor Author

Do you insist?

Contributor

yes

Contributor Author

Okay, done

@goldsteinn goldsteinn force-pushed the goldsteinn/cgp-urem-of-liv branch from f766385 to e88b6f1 Compare July 25, 2024 18:17
```
for(i = Start; i < End; ++i)
   Rem = (i nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant;
```
 ->
```
Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
for(i = Start; i < End; ++i, ++Rem)
   Rem = Rem == RemAmtLoopInvariant ? 0 : Rem;
```

In its current state, this fires only if `IncrLoopInvariant` and `Start` are
both zero.

Alive2 seemed unable to prove this (see:
https://alive2.llvm.org/ce/z/ATGDp3, which is clearly wrong but still
checks out...), so I wrote an exhaustive test here:
https://godbolt.org/z/WYa561388
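In the same spirit as the linked exhaustive test, here is a small self-contained C check (a sketch with assumed names, not the actual test from the link) that the counter-based rewrite reproduces `i % rem_amt` on every iteration for `Start == 0` and `IncrLoopInvariant == 0`, over all `uint8_t` remainder amounts and trip counts:

```c
#include <assert.h>
#include <stdint.h>

/* Returns 1 iff the counter-based rewrite matches i % rem_amt on every
   iteration of a loop from 0 to end, assuming Start == 0 and Incr == 0. */
static int rewrite_matches(uint8_t rem_amt, uint8_t end) {
  uint8_t rem = 0; /* preheader: (0 nuw+ 0) u% rem_amt == 0 for rem_amt != 0 */
  for (uint8_t i = 0; i < end; ++i) {
    if (rem != (uint8_t)(i % rem_amt))
      return 0;
    if (++rem == rem_amt) /* compare-and-reset replaces the urem */
      rem = 0;
  }
  return 1;
}
```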
@goldsteinn goldsteinn force-pushed the goldsteinn/cgp-urem-of-liv branch from e88b6f1 to 9b88f56 Compare July 30, 2024 09:03
@goldsteinn
Contributor Author

ping

Collaborator

@RKSimon RKSimon left a comment

LGTM - @nikic anything else?

Contributor

@nikic nikic left a comment

Looks reasonable to me. I did not inspect test coverage in detail.


// Find out loop increment PHI.
auto *PN = dyn_cast<PHINode>(Incr);

Contributor

nit: Drop this newline.


// Only trivially analyzable loops.
Loop *L = LI->getLoopFor(Rem->getParent());
if (!L || !L->getLoopPreheader() || !L->getLoopLatch())
Contributor

@nikic nikic Aug 16, 2024

IIRC getLoopPreheader is expensive, so maybe move this after the phi check.


// getIVIncrement finds the loop at PN->getParent(). This might be a different
// loop from the loop with Rem->getParent().
if (L->getHeader() != PN->getParent())
Contributor

@nikic nikic Aug 16, 2024

I think technically the right thing to do would be to have L be the loop of the phi node, not the rem, and only check that L also contains the rem (which is not guaranteed, as we don't require LCSSA in CGP). The loop of the phi node is really what you care about; the urem itself can be in a nested loop without issue (I think?)

But I agree it's best to ignore this case for now.

@chapuni
Contributor

chapuni commented Aug 19, 2024

@goldsteinn Looks like c64ce8b might break aarch64 stage2. Guessing miscompilation.

@goldsteinn
Contributor Author

@goldsteinn Looks like c64ce8b might break aarch64 stage2. Guessing miscompilation.

Yeah I see, I am trying to repro locally. I'll revert for now as not sure of the exact issue.

@goldsteinn
Contributor Author

@goldsteinn Looks like c64ce8b might break aarch64 stage2. Guessing miscompilation.

Yeah I see, I am trying to repro locally. I'll revert for now as not sure of the exact issue.

Reverted: 731ae69

@goldsteinn
Contributor Author

I'm pretty sure I see the bug.

The issue is that while Rem might be loop-invariant in its own loop, it might not be loop-invariant in the PHI's loop. The fix is what nikic recommended: just get the loop from the PHI instead of the Rem.

@goldsteinn goldsteinn reopened this Aug 19, 2024
@goldsteinn
Contributor Author

No, the bug was different:

-  Builder.SetInsertPoint(LoopIncrPN);
+  Builder.SetInsertPoint(Rem);

If the loop had multiple BBs, we would be inserting the rem in the wrong location...

@goldsteinn
Contributor Author

Actually, still not 100% sure; still haven't built a repro.

@goldsteinn
Contributor Author

Okay, fairly certain I found the bug. Reproduced (ish) with a cross build; I think the issue is that the Start > RemAmt case wasn't being handled correctly. The fix is:

  Start = simplifyURemInst(Start, RemAmt, *DL);
  if (!Start)
    return false;
