Merge commit 'd448848ca46d' from apple/stable/20210107 into swift/main

git apple-llvm automerger · git apple-llvm automerger · commit c64e53b8742e · 2021-04-13T22:40:39.000+01:00
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -247,7 +247,8 @@ class SimplifyCFGOpt {
   bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
                                              IRBuilder<> &Builder);
 
-  bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI);
+  bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI,
+                             bool EqTermsOnly);
   bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
                               const TargetTransformInfo &TTI);
   bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
@@ -1348,9 +1349,12 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I);
 
 /// Given a conditional branch that goes to BB1 and BB2, hoist any common code
 /// in the two blocks up into the branch block. The caller of this function
-/// guarantees that BI's block dominates BB1 and BB2.
+/// guarantees that BI's block dominates BB1 and BB2. If EqTermsOnly is given,
+/// only perform hoisting in case both blocks only contain a terminator. In that
+/// case, only the original BI will be replaced and selects for PHIs are added.
 bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI,
-                                           const TargetTransformInfo &TTI) {
+                                           const TargetTransformInfo &TTI,
+                                           bool EqTermsOnly) {
   // This does very trivial matching, with limited scanning, to find identical
   // instructions in the two blocks.  In particular, we don't want to get into
   // O(M*N) situations here where M and N are the sizes of BB1 and BB2.  As
@@ -1387,6 +1391,16 @@ bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI,
       ++NumHoistCommonCode;
   });
 
+  // Check if only hoisting terminators is allowed. This does not add new
+  // instructions to the hoist location.
+  if (EqTermsOnly) {
+    if (!I1->isIdenticalToWhenDefined(I2))
+      return false;
+    if (!I1->isTerminator())
+      return false;
+    goto HoistTerminator;
+  }
+
   do {
     // If we are hoisting the terminator instruction, don't move one (making a
     // broken BB), instead clone it, and remove BI.
@@ -6498,9 +6512,9 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   // can hoist it up to the branching block.
   if (BI->getSuccessor(0)->getSinglePredecessor()) {
     if (BI->getSuccessor(1)->getSinglePredecessor()) {
-      if (HoistCommon && Options.HoistCommonInsts)
-        if (HoistThenElseCodeToIf(BI, TTI))
-          return requestResimplify();
+      if (HoistCommon &&
+          HoistThenElseCodeToIf(BI, TTI, !Options.HoistCommonInsts))
+        return requestResimplify();
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
       // execute Successor #0 if it branches to Successor #1.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-required-for-vectorization.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+
+target triple = "arm64-apple-darwin"
+
+; Make sure we can vectorize a loop that uses a function to clamp a double to
+; be between a given minimum and maximum value.
+
+define internal double @clamp(double %v) {
+entry:
+  %retval = alloca double, align 8
+  %v.addr = alloca double, align 8
+  store double %v, double* %v.addr, align 8
+  %0 = load double, double* %v.addr, align 8
+  %cmp = fcmp olt double %0, 0.000000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store double 0.000000e+00, double* %retval, align 8
+  br label %return
+
+if.end:                                           ; preds = %entry
+  %1 = load double, double* %v.addr, align 8
+  %cmp1 = fcmp ogt double %1, 6.000000e+00
+  br i1 %cmp1, label %if.then2, label %if.end3
+
+if.then2:                                         ; preds = %if.end
+  store double 6.000000e+00, double* %retval, align 8
+  br label %return
+
+if.end3:                                          ; preds = %if.end
+  %2 = load double, double* %v.addr, align 8
+  store double %2, double* %retval, align 8
+  br label %return
+
+return:                                           ; preds = %if.end3, %if.then2, %if.then
+  %3 = load double, double* %retval, align 8
+  ret double %3
+}
+
+define void @loop(double* %X, double* %Y) {
+; CHECK-LABEL: @loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[X:%.*]], i64 20000
+; CHECK-NEXT:    [[SCEVGEP9:%.*]] = getelementptr double, double* [[Y:%.*]], i64 20000
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP9]], [[X]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[Y]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[INDEX]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>*
+; CHECK-NEXT:    [[WIDE_LOAD11:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP5:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD11]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]], <double 6.000000e+00, double 6.000000e+00>
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD11]], <double 6.000000e+00, double 6.000000e+00>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x double> <double 6.000000e+00, double 6.000000e+00>, <2 x double> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[TMP8]], <2 x double> <double 6.000000e+00, double 6.000000e+00>, <2 x double> [[WIDE_LOAD11]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP5]], <2 x double> zeroinitializer, <2 x double> [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[TMP6]], <2 x double> zeroinitializer, <2 x double> [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP14]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, double* [[TMP13]], i64 2
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast double* [[TMP15]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP12]], <2 x double>* [[TMP16]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20000
+; CHECK-NEXT:    br i1 [[TMP17]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_05:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_05]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load double, double* [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp olt double [[TMP18]], 0.000000e+00
+; CHECK-NEXT:    [[CMP1_I:%.*]] = fcmp ogt double [[TMP18]], 6.000000e+00
+; CHECK-NEXT:    [[DOTV_I:%.*]] = select i1 [[CMP1_I]], double 6.000000e+00, double [[TMP18]]
+; CHECK-NEXT:    [[RETVAL_0_I:%.*]] = select i1 [[CMP_I]], double 0.000000e+00, double [[DOTV_I]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IDXPROM]]
+; CHECK-NEXT:    store double [[RETVAL_0_I]], double* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_05]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I_05]], 19999
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+entry:
+  %X.addr = alloca double*, align 8
+  %Y.addr = alloca double*, align 8
+  %i = alloca i32, align 4
+  store double* %X, double** %X.addr, align 8
+  store double* %Y, double** %Y.addr, align 8
+  %0 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #2
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %1 = load i32, i32* %i, align 4
+  %cmp = icmp ult i32 %1, 20000
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %2 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #2
+  br label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %3 = load double*, double** %Y.addr, align 8
+  %4 = load i32, i32* %i, align 4
+  %idxprom = zext i32 %4 to i64
+  %arrayidx = getelementptr inbounds double, double* %3, i64 %idxprom
+  %5 = load double, double* %arrayidx, align 8
+  %call = call double @clamp(double %5)
+  %6 = load double*, double** %X.addr, align 8
+  %7 = load i32, i32* %i, align 4
+  %idxprom1 = zext i32 %7 to i64
+  %arrayidx2 = getelementptr inbounds double, double* %6, i64 %idxprom1
+  store double %call, double* %arrayidx2, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %8 = load i32, i32* %i, align 4
+  %inc = add i32 %8, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond.cleanup
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
+
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/lit.local.cfg b/llvm/test/Transforms/PhaseOrdering/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/SimplifyCFG/common-code-hoisting.ll b/llvm/test/Transforms/SimplifyCFG/common-code-hoisting.ll
@@ -108,3 +108,45 @@ for.end:
 return:
   ret void
 }
+
+; A example where only the branch instructions from %if.then2 and %if.end3 need
+; to be hoisted, which effectively replaces the original branch in %if.end and
+; only requires selects for PHIs in the successor.
+define float @clamp_float_value(float %value, float %minimum_value, float %maximum_value) {
+; HOIST-LABEL: @clamp_float_value(
+; HOIST-NEXT:  entry:
+; HOIST-NEXT:    [[CMP:%.*]] = fcmp ogt float [[VALUE:%.*]], [[MAXIMUM_VALUE:%.*]]
+; HOIST-NEXT:    [[CMP1:%.*]] = fcmp olt float [[VALUE]], [[MINIMUM_VALUE:%.*]]
+; HOIST-NEXT:    [[MINIMUM_VALUE_VALUE:%.*]] = select i1 [[CMP1]], float [[MINIMUM_VALUE]], float [[VALUE]]
+; HOIST-NEXT:    [[RETVAL_0:%.*]] = select i1 [[CMP]], float [[MAXIMUM_VALUE]], float [[MINIMUM_VALUE_VALUE]]
+; HOIST-NEXT:    ret float [[RETVAL_0]]
+;
+; NOHOIST-LABEL: @clamp_float_value(
+; NOHOIST-NEXT:  entry:
+; NOHOIST-NEXT:    [[CMP:%.*]] = fcmp ogt float [[VALUE:%.*]], [[MAXIMUM_VALUE:%.*]]
+; NOHOIST-NEXT:    [[CMP1:%.*]] = fcmp olt float [[VALUE]], [[MINIMUM_VALUE:%.*]]
+; NOHOIST-NEXT:    [[MINIMUM_VALUE_VALUE:%.*]] = select i1 [[CMP1]], float [[MINIMUM_VALUE]], float [[VALUE]]
+; NOHOIST-NEXT:    [[RETVAL_0:%.*]] = select i1 [[CMP]], float [[MAXIMUM_VALUE]], float [[MINIMUM_VALUE_VALUE]]
+; NOHOIST-NEXT:    ret float [[RETVAL_0]]
+;
+entry:
+  %cmp = fcmp ogt float %value, %maximum_value
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %return
+
+if.end:                                           ; preds = %entry
+  %cmp1 = fcmp olt float %value, %minimum_value
+  br i1 %cmp1, label %if.then2, label %if.end3
+
+if.then2:                                         ; preds = %if.end
+  br label %return
+
+if.end3:                                          ; preds = %if.end
+  br label %return
+
+return:                                           ; preds = %if.end3, %if.then2, %if.then
+  %retval.0 = phi float [ %maximum_value, %if.then ], [ %minimum_value, %if.then2 ], [ %value, %if.end3 ]
+  ret float %retval.0
+}

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+if not 'AArch64' in config.root.targets:`
	`2`	`+ config.unsupported = True`