[X86,SimplifyCFG] Support hoisting load/store with conditional faulting (Part II) #108812


Merged
merged 12 commits into llvm:main from phoebewang:Diamond on Nov 25, 2024

Conversation

phoebewang
Contributor

@phoebewang phoebewang commented Sep 16, 2024

This is a follow-up of #96878 to support hoisting loads/stores from BBs that have the same predecessor, if loads/stores are the only instructions and the branch is unpredictable, e.g.:

void test (int a, int *c, int *d) {
  if (a)
   *c = a;
  else
   *d = a;
}

@llvmbot
Member

llvmbot commented Sep 16, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Phoebe Wang (phoebewang)

Changes

This is a follow up of #96878 to support hoisting load/store for diamond CFG.

void test (int a, int *c, int *d) {
  if (a)
   *c = a;
  else
   *d = a;
}

Full diff: https://github.com/llvm/llvm-project/pull/108812.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Utils/SimplifyCFG.cpp (+82-25)
  • (modified) llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll (+36-10)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index f9db996cdc3583..420c59789813c1 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -283,7 +283,7 @@ class SimplifyCFGOpt {
   bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
                                              IRBuilder<> &Builder);
 
-  bool hoistCommonCodeFromSuccessors(BasicBlock *BB, bool EqTermsOnly);
+  bool hoistCommonCodeFromSuccessors(Instruction *TI, bool EqTermsOnly);
   bool hoistSuccIdenticalTerminatorToSwitchOrIf(
       Instruction *TI, Instruction *I1,
       SmallVectorImpl<Instruction *> &OtherSuccTIs);
@@ -1615,12 +1615,31 @@ static bool areIdenticalUpToCommutativity(const Instruction *I1,
   return false;
 }
 
+static bool isSafeCheapLoadStore(const Instruction *I,
+                                 const TargetTransformInfo &TTI) {
+  // Not handle volatile or atomic.
+  if (auto *L = dyn_cast<LoadInst>(I)) {
+    if (!L->isSimple())
+      return false;
+  } else if (auto *S = dyn_cast<StoreInst>(I)) {
+    if (!S->isSimple())
+      return false;
+  } else
+    return false;
+
+  // llvm.masked.load/store use i32 for alignment while load/store use i64.
+  // That's why we have the alignment limitation.
+  // FIXME: Update the prototype of the intrinsics?
+  return TTI.hasConditionalLoadStoreForType(getLoadStoreType(I)) &&
+         getLoadStoreAlignment(I) < Value::MaximumAlignment;
+}
+
 /// Hoist any common code in the successor blocks up into the block. This
 /// function guarantees that BB dominates all successors. If EqTermsOnly is
 /// given, only perform hoisting in case both blocks only contain a terminator.
 /// In that case, only the original BI will be replaced and selects for PHIs are
 /// added.
-bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
+bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
                                                    bool EqTermsOnly) {
   // This does very trivial matching, with limited scanning, to find identical
   // instructions in the two blocks. In particular, we don't want to get into
@@ -1628,6 +1647,7 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
   // such, we currently just scan for obviously identical instructions in an
   // identical order, possibly separated by the same number of non-identical
   // instructions.
+  BasicBlock *BB = TI->getParent();
   unsigned int SuccSize = succ_size(BB);
   if (SuccSize < 2)
     return false;
@@ -1639,7 +1659,63 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
     if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
       return false;
 
-  auto *TI = BB->getTerminator();
+  auto *BI = dyn_cast<BranchInst>(TI);
+  if (BI && HoistLoadsStoresWithCondFaulting &&
+      Options.HoistLoadsStoresWithCondFaulting) {
+    SmallVector<Instruction *, 2> SpeculatedConditionalLoadsStores;
+    for (auto *Succ : successors(BB)) {
+      for (Instruction &I : drop_end(*Succ)) {
+        if (!isSafeCheapLoadStore(&I, TTI) ||
+            SpeculatedConditionalLoadsStores.size() ==
+                HoistLoadsStoresWithCondFaultingThreshold)
+          return false;
+        SpeculatedConditionalLoadsStores.push_back(&I);
+      }
+    }
+
+    // TODO: Move below code to a function to share with #96878.
+    if (SpeculatedConditionalLoadsStores.empty())
+      return false;
+
+    auto &Context = BI->getParent()->getContext();
+    auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
+    auto *Cond = BI->getOperand(0);
+    IRBuilder<> Builder(BI);
+    Value *Mask1 = Builder.CreateBitCast(Cond, VCondTy);
+    Value *Mask0 = Builder.CreateBitCast(
+        Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
+    for (auto *I : SpeculatedConditionalLoadsStores) {
+      Value *Mask = I->getParent() == BI->getSuccessor(0) ? Mask1 : Mask0;
+      assert(!getLoadStoreType(I)->isVectorTy() && "not implemented");
+      auto *Op0 = I->getOperand(0);
+      Instruction *MaskedLoadStore = nullptr;
+      if (auto *LI = dyn_cast<LoadInst>(I)) {
+        // Handle Load.
+        auto *Ty = I->getType();
+        MaskedLoadStore = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1),
+                                                   Op0, LI->getAlign(), Mask);
+        I->replaceAllUsesWith(Builder.CreateBitCast(MaskedLoadStore, Ty));
+      } else {
+        // Handle Store.
+        auto *StoredVal =
+            Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
+        MaskedLoadStore = Builder.CreateMaskedStore(
+            StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
+      }
+      I->dropUBImplyingAttrsAndUnknownMetadata(
+          {LLVMContext::MD_range, LLVMContext::MD_annotation});
+      // FIXME: DIAssignID is not supported for masked store yet.
+      // (Verifier::visitDIAssignIDMetadata)
+      at::deleteAssignmentMarkers(I);
+      I->eraseMetadataIf([](unsigned MDKind, MDNode *Node) {
+        return Node->getMetadataID() == Metadata::DIAssignIDKind;
+      });
+      MaskedLoadStore->copyMetadata(*I);
+      I->eraseFromParent();
+    }
+
+    return true;
+  }
 
   // The second of pair is a SkipFlags bitmask.
   using SuccIterPair = std::pair<BasicBlock::iterator, unsigned>;
@@ -2998,25 +3074,6 @@ static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert,
   return BIEndProb < Likely;
 }
 
-static bool isSafeCheapLoadStore(const Instruction *I,
-                                 const TargetTransformInfo &TTI) {
-  // Not handle volatile or atomic.
-  if (auto *L = dyn_cast<LoadInst>(I)) {
-    if (!L->isSimple())
-      return false;
-  } else if (auto *S = dyn_cast<StoreInst>(I)) {
-    if (!S->isSimple())
-      return false;
-  } else
-    return false;
-
-  // llvm.masked.load/store use i32 for alignment while load/store use i64.
-  // That's why we have the alignment limitation.
-  // FIXME: Update the prototype of the intrinsics?
-  return TTI.hasConditionalLoadStoreForType(getLoadStoreType(I)) &&
-         getLoadStoreAlignment(I) < Value::MaximumAlignment;
-}
-
 /// Speculate a conditional basic block flattening the CFG.
 ///
 /// Note that this is a very risky transform currently. Speculating
@@ -7436,7 +7493,7 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
     return requestResimplify();
 
   if (HoistCommon &&
-      hoistCommonCodeFromSuccessors(SI->getParent(), !Options.HoistCommonInsts))
+      hoistCommonCodeFromSuccessors(SI, !Options.HoistCommonInsts))
     return requestResimplify();
 
   return false;
@@ -7794,8 +7851,8 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   // can hoist it up to the branching block.
   if (BI->getSuccessor(0)->getSinglePredecessor()) {
     if (BI->getSuccessor(1)->getSinglePredecessor()) {
-      if (HoistCommon && hoistCommonCodeFromSuccessors(
-                             BI->getParent(), !Options.HoistCommonInsts))
+      if (HoistCommon &&
+          hoistCommonCodeFromSuccessors(BI, !Options.HoistCommonInsts))
         return requestResimplify();
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
index 047ca717da8009..87ff4ba2af9a41 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
@@ -278,21 +278,19 @@ if.false:                                    ; preds = %if.true, %entry
 }
 
 ;; Both of successor 0 and successor 1 have a single predecessor.
-;; TODO: Support transform for this case.
 define void @single_predecessor(ptr %p, ptr %q, i32 %a) {
 ; CHECK-LABEL: @single_predecessor(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; CHECK:       common.ret:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[TOBOOL]], true
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP2]])
 ; CHECK-NEXT:    ret void
-; CHECK:       if.end:
-; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
-; CHECK-NEXT:    br label [[COMMON_RET:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q]], align 4
-; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
-; CHECK-NEXT:    br label [[COMMON_RET]]
 ;
 entry:
   %tobool = icmp ne i32 %a, 0
@@ -674,6 +672,34 @@ if.false:
   ret void
 }
 
+define void @diamondCFG(i32 %a, ptr %c, ptr %d) {
+; CHECK-LABEL: @diamondCFG(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[TOBOOL_NOT]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[TOBOOL_NOT]], true
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr [[D:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[A]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[C:%.*]], i32 4, <1 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tobool.not = icmp eq i32 %a, 0
+  br i1 %tobool.not, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  store i32 %a, ptr %c, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  store i32 0, ptr %d, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
 declare i32 @read_memory_only() readonly nounwind willreturn speculatable
 
 !llvm.dbg.cu = !{!0}

phoebewang added a commit to phoebewang/llvm-project that referenced this pull request Sep 20, 2024
@@ -728,6 +726,34 @@ if.true:
ret i32 %res
}

define void @diamondCFG(i32 %a, ptr %c, ptr %d) {
Contributor

Probably remove this? This function tests the same thing as single_predecessor

Contributor Author

Done.

; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]])
Contributor

We can use TMP3 here. Two bitcasts look redundant.

Contributor Author

Done.

*Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond,
VCondTy);
} else {
Mask0 = Builder.CreateBitCast(
Contributor

Considering we name the BB as TrueBB and FalseBB in the comment, maybe
Mask0 -> MaskFalse
Mask1 -> MaskTrue

Contributor Author

Done.

for (auto *I : SpeculatedConditionalLoadsStores) {
IRBuilder<> Builder(I);
IRBuilder<> Builder(Invert ? I : BI);
if (!Invert)
Contributor

if (!Mask) looks better?

Contributor Author

Done.

Contributor Author

Oh, we cannot. Mask is not nullptr in the second iteration.

Value *Mask = Builder.CreateBitCast(
Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond,
VCondTy);
IRBuilder<> Builder(Invert ? SpeculatedConditionalLoadsStores.back() : BI);
Contributor

It seems we don't need to select the insertion point by condition. Always use BI? Same for line 1686

Contributor Author

No, we cannot. Because in the triangle case SpeculatedConditionalLoadsStores is pushed in reverse order, we must set the insertion point one instruction at a time; otherwise, we will get something like

define void @basic(i1 %cond, ptr %b, ptr %p, ptr %q) #0 {
entry:
  %0 = bitcast i1 %cond to <1 x i1>
  %1 = bitcast i64 %5 to <1 x i64>
  call void @llvm.masked.store.v1i64.p0(<1 x i64> %1, ptr %q, i32 8, <1 x i1> %0)
  %2 = bitcast i32 %7 to <1 x i32>
  call void @llvm.masked.store.v1i32.p0(<1 x i32> %2, ptr %p, i32 4, <1 x i1> %0)
  %3 = bitcast i16 %9 to <1 x i16>
  call void @llvm.masked.store.v1i16.p0(<1 x i16> %3, ptr %b, i32 2, <1 x i1> %0)
  %4 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr %b, i32 8, <1 x i1> %0, <1 x i64> poison)
  %5 = bitcast <1 x i64> %4 to i64
  %6 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr %q, i32 4, <1 x i1> %0, <1 x i32> poison)
  %7 = bitcast <1 x i32> %6 to i32
  %8 = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr %p, i32 2, <1 x i1> %0, <1 x i16> poison)
  %9 = bitcast <1 x i16> %8 to i16
  ret void
}

Collaborator

Given it's an optional<bool>, would it be easier to grok if you used Invert.has_value()?

Contributor Author

Done.

}
}

if (!SpeculatedConditionalLoadsStores.empty())
Contributor

This looks suspicious since the following checks are skipped. We should add tests to check what happens if there are non-load/store instructions in the successors.

Contributor Author

@phoebewang commented Sep 23, 2024

The assumption here is that prior passes have moved common instructions out of the branches. It works in a pipeline, e.g.:

$ cat single_predecessor.ll
define i32 @single_predecessor(ptr %p, ptr %q, i32 %x, i32 %a, i32 %b) {
entry:
  %tobool = icmp ne i32 %x, 0
  br i1 %tobool, label %if.end, label %if.then
if.end:
  store i32 1, ptr %q
  %c = add i32 %a, %b ; <== common instruction
  ret i32 %c
if.then:
  %0 = load i32, ptr %q
  store i32 %0, ptr %p
  %d = add i32 %a, %b ; <== common instruction
  ret i32 %d
}

$ opt -passes=simplifycfg,'instcombine<no-verify-fixpoint>','simplifycfg<hoist-loads-stores-with-cond-faulting>' -mtriple=x86_64 -mattr=+cf single_predecessor.ll -S -o -
; ModuleID = 'single_predecessor.ll'
source_filename = "single_predecessor.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64"

define i32 @single_predecessor(ptr %p, ptr %q, i32 %x, i32 %a, i32 %b) #0 {
entry:
  %tobool.not = icmp eq i32 %x, 0
  %0 = xor i1 %tobool.not, true
  %1 = bitcast i1 %0 to <1 x i1>
  %2 = bitcast i1 %tobool.not to <1 x i1>
  %3 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr %q, i32 4, <1 x i1> %2, <1 x i32> poison)
  %4 = bitcast <1 x i32> %3 to i32
  call void @llvm.masked.store.v1i32.p0(<1 x i32> %3, ptr %p, i32 4, <1 x i1> %2)
  call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr %q, i32 4, <1 x i1> %1)
  %common.ret.op = add i32 %a, %b
  ret i32 %common.ret.op
}

Contributor

Considering the current implementation separates the common-instruction hoist from the load/store hoist, we should probably move the code to:

  if (BI->getSuccessor(0)->getSinglePredecessor()) {
    if (BI->getSuccessor(1)->getSinglePredecessor()) {
      if (HoistCommon && hoistCommonCodeFromSuccessors(
                             BI->getParent(), !Options.HoistCommonInsts))
        return requestResimplify();

     if  (HoistLoadsStoresWithCondFaulting &&
          Options.HoistLoadsStoresWithCondFaulting && hoistConditionalLoadsStores(...)) 
          return requestResimplify();

Contributor Author

I don't think it's necessary, given that the last simplifycfg run is supposed to clean up single-entry-single-exit or empty blocks.
https://github.com/llvm/llvm-project/blob/main/llvm/lib/Passes/PassBuilderPipelines.cpp#L1533-L1534

If we have such patterns, they should have been optimized by previous simplifycfg runs:

$ clang -S apx.c -mapxf -mllvm -print-pipeline-passes -O1 | sed -e 's/,/\n/g' | sed -e 's/;no-[^;>]*//g' | grep ^simplifycfg
simplifycfg<bonus-inst-threshold=1;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>)
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;hoist-loads-stores-with-cond-faulting;speculate-blocks;simplify-cond-branch;speculate-unpredictables>)

Contributor

I didn't mean cleaning up the empty BB ourselves. I meant the logic of hoisting load/store does not match the name and comment of hoistCommonCodeFromSuccessors. Probably we should move it out.

Contributor Author

Makes sense, done.


if (!SpeculatedConditionalLoadsStores.empty())
hoistConditionalLoadsStores(BI, SpeculatedConditionalLoadsStores,
std::nullopt);
Contributor

Should we return here?

Contributor Author

Yes, good catch!

@phoebewang
Contributor Author

@KanRobert, with the latest change, the icount and cond_br of all cpu2017 benchmarks are shown below:

Patch     icount   cond_br
Without   5.747    0.769
With      5.71     0.797

Do the values seem reasonable? There are no failures in either case.


github-actions bot commented Sep 25, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@phoebewang
Contributor Author

E2E tests:

$ cat apx.c
void test1 (int a, int *c, int *d) {
  if (a)
   *c = a;
  else
   *d = a;
}
int test2 (int a, int *c, int *d) {
  if (a) {
   *c = a;
   return 2;
  } else {
   *d = a;
   return 3;
  }
}
$ clang -O2 -mapxf apx.c -S -o -
        .text
        .file   "apx.c"
        .globl  test1                           # -- Begin function test1
        .p2align        4, 0x90
        .type   test1,@function
test1:                                  # @test1
        .cfi_startproc
# %bb.0:                                # %entry
        xorl    %eax, %eax
        testl   %edi, %edi
        cfcmovel        %eax, (%rdx)
        cfcmovnel       %edi, (%rsi)
        retq
.Lfunc_end0:
        .size   test1, .Lfunc_end0-test1
        .cfi_endproc
                                        # -- End function
        .globl  test2                           # -- Begin function test2
        .p2align        4, 0x90
        .type   test2,@function
test2:                                  # @test2
        .cfi_startproc
# %bb.0:                                # %entry
        xorl    %ecx, %ecx
        testl   %edi, %edi
        setzue  %al
        cfcmovel        %ecx, (%rdx)
        cfcmovnel       %edi, (%rsi)
        orl     $2, %eax
        retq
.Lfunc_end1:
        .size   test2, .Lfunc_end1-test2
        .cfi_endproc
                                        # -- End function

Comment on lines +7846 to +7847
if (I.getNumSuccessors() > 1)
return false;
Contributor

Add test case for this?

Contributor Author

Done.

Comment on lines +7850 to +7851
SpeculatedConditionalLoadsStores.size() ==
HoistLoadsStoresWithCondFaultingThreshold) {
Contributor

I think we should consider branch probability for this, e.g. isProfitableToSpeculate. If A has two successors B and C, it's not profitable to execute more instructions to eliminate the branch if the branch is well-predicted and the load/store comes from the unlikely successor.
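For reference, here is a minimal C++ sketch of the kind of probability gating being suggested (the helper name and the 99/100 threshold are illustrative assumptions only; the patch itself ends up reusing isProfitableToSpeculate, as the later code context shows):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/BranchProbability.h"
using namespace llvm;

// Illustrative only: skip the transform when profile data says the branch is
// well-predicted, since speculating the unlikely side's loads/stores would
// only add work on the hot path.
static bool worthHoistingBothSuccessors(const BranchInst *BI) {
  uint64_t TWeight = 0, FWeight = 0;
  // No (or zero) profile data: treat the branch as unpredictable and allow it.
  if (!extractBranchWeights(*BI, TWeight, FWeight) || TWeight + FWeight == 0)
    return true;
  BranchProbability TrueProb =
      BranchProbability::getBranchProbability(TWeight, TWeight + FWeight);
  BranchProbability Likely = BranchProbability::getBranchProbability(99, 100);
  // Hoist only if neither side is overwhelmingly likely.
  return TrueProb < Likely && TrueProb.getCompl() < Likely;
}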

Contributor Author

Good idea, done!

@@ -1725,6 +1744,7 @@ static void hoistConditionalLoadsStores(
MaskedLoadStore->copyMetadata(*I);
I->eraseFromParent();
}
return true;
Contributor

I think this function always returns true; should we change it to void?

Contributor Author

Done.

@KanRobert
Contributor

> This is a follow up of #96878 to support hoisting load/store for diamond CFG. … (description quoted above)

Need to update the description; it does not require a diamond CFG.

@phoebewang
Contributor Author

> Need to update the description; it does not require a diamond CFG.

Done.

@phoebewang
Contributor Author

icount and cond_br of all cpu2017 benchmarks with the latest change:

Patch     icount   cond_br
Without   5.747    0.769
With      5.707    0.8

@RKSimon
Collaborator

RKSimon commented Sep 27, 2024

For cases like this why aren't we folding to:

void src(int a, int *c, int *d) {
  if (a)
   *c = a;
  else
   *d = a;
}

void tgt(int a, int *c, int *d) {
  int *p = a ? c : d;
  *p = a;
}

or the even more general:

void src(int s, int a, int b, int *c, int *d) {
  if (s)
   *c = a;
  else
   *d = b;
}

void tgt(int s, int a, int b, int *c, int *d) {
  int *p = s ? c : d;
  int v = s ? a : b;
  *p = v;
}

@phoebewang
Contributor Author

> For cases like this why aren't we folding to: … (src/tgt examples above)

That's a good question. It doesn't rely on a hardware feature, so it could be a general branch-to-select transformation. I'm also wondering why we didn't do it before, especially the first one. Is there any concern about possible side effects of the memory operand?

@phoebewang
Contributor Author

It seems no compiler chooses to do so: https://godbolt.org/z/7ahEcd6n8


if (BI && HoistLoadsStoresWithCondFaulting &&
Options.HoistLoadsStoresWithCondFaulting &&
isProfitableToSpeculate(BI, std::nullopt, TTI)) {
Contributor

Hmm, from your code, it seems the hoist can only happen when TWeight = FWeight = 0?

Contributor Author

Yeah, we need a meaningful ratio here, but we haven't enabled PGO, so let's leave it until we do PGO tuning.

@KanRobert
Contributor

> For cases like this why aren't we folding to: … (src/tgt examples above)

> That's a good question. It doesn't rely on a hardware feature, so it could be a general branch-to-select transformation. … Is there any concern about possible side effects of the memory operand?

From this perspective, should we implement this optimization without any HW feature and do the further CFCMOV optimization in the backend?

@RKSimon
Collaborator

RKSimon commented Sep 27, 2024

It seems to be easier to create the masked/conditional store pattern from the store+select pattern rather than the other way around.

@phoebewang
Contributor Author

phoebewang commented Sep 28, 2024

> From this perspective, should we implement this optimization without any HW feature and do the further CFCMOV optimization in the backend?

Actually, SimplifyCFG already does this optimization; we just need to increase NumPHIInsts from 1 to 2:

$ git diff
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 69c4475a494c..488cc18f07e3 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2360,7 +2360,7 @@ static bool sinkCommonCodeFromPredecessors(BasicBlock *BB,
         }
       }
       LLVM_DEBUG(dbgs() << "SINK: #phi insts: " << NumPHIInsts << "\n");
-      return NumPHIInsts <= 1;
+      return NumPHIInsts <= 2;
     };

     // We've determined that we are going to sink last ScanIdx instructions,

$ cat tmp.c
void foo(int a, int *c, int *d) {
  if (a)
   *c = a;
  else
   *d = a;
}

void bar(int s, int a, int b, int *c, int *d) {
  if (s)
   *c = a;
  else
   *d = b;
}

$ clang -S tmp.c -O2 -o - | grep -E ':|^\s[a-z]'
foo:                                    # @foo
# %bb.0:                                # %entry
        testl   %edi, %edi
        cmoveq  %rdx, %rsi
        movl    %edi, (%rsi)
        retq
.Lfunc_end0:
bar:                                    # @bar
# %bb.0:                                # %entry
        testl   %edi, %edi
        cmoveq  %r8, %rcx
        cmovel  %edx, %esi
        movl    %esi, (%rcx)
        retq

@KanRobert
Contributor

> Actually, SimplifyCFG already does this optimization; we just need to increase NumPHIInsts from 1 to 2 … (patch and output shown above)

Good knowledge! Then it seems this optimization for this case w/ CFCMOV does not have any value...
We just need to add a tuning knob for NumPHIInsts <= ProfitableToSinkInstructionThreshold, from my perspective.
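For reference, a sketch of what such a knob could look like (the flag name, default value, and wiring are assumptions made here for illustration; they are not taken from #110420):

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Hypothetical cl::opt knob: allow sinking when it would require up to N PHIs.
static cl::opt<unsigned> ProfitableToSinkInstructionThreshold(
    "sink-common-insts-max-phis", cl::Hidden, cl::init(1),
    cl::desc("Max number of PHI nodes a sunk group of instructions may "
             "require before sinking is considered unprofitable"));

// ...and inside sinkCommonCodeFromPredecessors()'s profitability check:
//   return NumPHIInsts <= ProfitableToSinkInstructionThreshold;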

@phoebewang
Contributor Author

Good knowledge! Then it seems this optimization for this case w/ CFCMOV does not have any value...

No, they are different optimizations; they just intersect in this specific example. The existing one does nothing for the single_predecessor test.

We just need to add a tuning knob for NumPHIInsts <= ProfitableToSinkInstructionThreshold, from my perspective.

Yes, we can do that. I'll create one then :)

@phoebewang
Contributor Author

We just need to add a tuning knob for NumPHIInsts <= ProfitableToSinkInstructionThreshold, from my perspective.

#110420

@phoebewang
Contributor Author

Friendly ping

@phoebewang
Contributor Author

Friendly ping

@@ -759,6 +757,43 @@ if.true:
ret i32 %res
}

define i32 @multi_successors(i1 %c1, i32 %c2, ptr %p) {
Contributor

Is this a negative test? If so, the function name should start with "not", and it needs a comment, e.g. like "not_maximum_alignment" in this file.

Contributor Author

Done.

return !SpeculatedConditionalLoadsStores.empty();
};

if (CanSpeculateConditionalLoadsStores()) {
Contributor

It seems the lambda is used once, maybe

bool CanSpeculateConditionalLoadsStores = <your lambda>(); 

looks better?

Contributor Author

The advantage of using a lambda is that we can break out of the inner loop directly with return. We would have to use a goto or more flags if we changed to non-lambda code.
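A small generic sketch of that trade-off, with plain C++ stand-ins rather than the actual SimplifyCFG types:

#include <vector>

// Immediately-invoked lambda: a single `return` exits the nested scan.
bool scanWithLambda(const std::vector<std::vector<int>> &Succs,
                    std::vector<int> &Out) {
  return [&]() {
    for (const auto &Succ : Succs)
      for (int I : Succ) {
        if (I < 0)      // stand-in for "not a safe cheap load/store"
          return false; // bails out of both loops at once
        Out.push_back(I);
      }
    return !Out.empty();
  }();
}

// The non-lambda form needs an extra flag (or a goto) for the same early exit.
bool scanWithFlag(const std::vector<std::vector<int>> &Succs,
                  std::vector<int> &Out) {
  bool Ok = true;
  for (const auto &Succ : Succs) {
    for (int I : Succ) {
      if (I < 0) {
        Ok = false;
        break;
      }
      Out.push_back(I);
    }
    if (!Ok)
      break;
  }
  return Ok && !Out.empty();
}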

@@ -1664,18 +1664,35 @@ static bool areIdenticalUpToCommutativity(const Instruction *I1,
static void hoistConditionalLoadsStores(
BranchInst *BI,
SmallVectorImpl<Instruction *> &SpeculatedConditionalLoadsStores,
bool Invert) {
std::optional<bool> Invert) {
Contributor

Add a comment like

\param Invert  ...

?

It's a little hard to know when it's nullopt w/o searching for the caller.
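One possible wording for such a comment, sketched against the signature quoted above (the final in-tree comment may differ):

/// Hoist the given loads/stores out of \p BI's successor(s) and rewrite them
/// as llvm.masked.load/llvm.masked.store guarded by \p BI's condition.
///
/// \param Invert  When only one successor is speculated (triangle), whether
///                the condition must be inverted to form the mask.
///                std::nullopt means both successors are hoisted (diamond),
///                so each instruction picks the mask matching the successor
///                it came from.
static void hoistConditionalLoadsStores(
    BranchInst *BI,
    SmallVectorImpl<Instruction *> &SpeculatedConditionalLoadsStores,
    std::optional<bool> Invert);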

Contributor Author

Done.

@phoebewang phoebewang merged commit 2568e52 into llvm:main Nov 25, 2024
8 checks passed
@phoebewang phoebewang deleted the Diamond branch November 25, 2024 07:19