[Sink] Optimize/simplify sink candidate finding with nearest common dominator

MaskRay · MaskRay · commit f2284e3405d8 · 2020-08-30T22:51:00.000-07:00
For an instruction in the basic block BB, SinkingPass enumerates basic blocks
dominated by BB and BB's successors. For each enumerated basic block,
SinkingPass uses `AllUsesDominatedByBlock` to check whether the basic
block dominates all of the instruction's users. This is inefficient.

Use the nearest common dominator of all users to avoid enumerating the
candidate. The nearest common dominator may be in a parent loop which is
not beneficial. In that case, find the ancestors in the dominator tree.

In the case that the instruction has no user, with this change we will
not perform unnecessary move. This causes some amdgpu test changes.

A stage-2 x86-64 clang is a byte identical with this change.
diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -32,31 +32,6 @@ using namespace llvm;
 STATISTIC(NumSunk, "Number of instructions sunk");
 STATISTIC(NumSinkIter, "Number of sinking iterations");
 
-/// AllUsesDominatedByBlock - Return true if all uses of the specified value
-/// occur in blocks dominated by the specified block.
-static bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB,
-                                    DominatorTree &DT) {
-  // Ignoring debug uses is necessary so debug info doesn't affect the code.
-  // This may leave a referencing dbg_value in the original block, before
-  // the definition of the vreg.  Dwarf generator handles this although the
-  // user might not get the right info at runtime.
-  for (Use &U : Inst->uses()) {
-    // Determine the block of the use.
-    Instruction *UseInst = cast<Instruction>(U.getUser());
-    BasicBlock *UseBlock = UseInst->getParent();
-    if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
-      // PHI nodes use the operand in the predecessor block, not the block with
-      // the PHI.
-      unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
-      UseBlock = PN->getIncomingBlock(Num);
-    }
-    // Check that it dominates.
-    if (!DT.dominates(BB, UseBlock))
-      return false;
-  }
-  return true;
-}
-
 static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
                          SmallPtrSetImpl<Instruction *> &Stores) {
 
@@ -97,11 +72,6 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
   assert(Inst && "Instruction to be sunk is null");
   assert(SuccToSinkTo && "Candidate sink target is null");
 
-  // It is not possible to sink an instruction into its own block.  This can
-  // happen with loops.
-  if (Inst->getParent() == SuccToSinkTo)
-    return false;
-
   // It's never legal to sink an instruction into a block which terminates in an
   // EH-pad.
   if (SuccToSinkTo->getTerminator()->isExceptionalTerminator())
@@ -129,9 +99,7 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
       return false;
   }
 
-  // Finally, check that all the uses of the instruction are actually
-  // dominated by the candidate
-  return AllUsesDominatedByBlock(Inst, SuccToSinkTo, DT);
+  return true;
 }
 
 /// SinkInstruction - Determine whether it is safe to sink the specified machine
@@ -162,25 +130,34 @@ static bool SinkInstruction(Instruction *Inst,
   // decide.
   BasicBlock *SuccToSinkTo = nullptr;
 
-  // Instructions can only be sunk if all their uses are in blocks
-  // dominated by one of the successors.
-  // Look at all the dominated blocks and see if we can sink it in one.
-  DomTreeNode *DTN = DT.getNode(Inst->getParent());
-  for (auto I = DTN->begin(), E = DTN->end(); I != E && SuccToSinkTo == nullptr;
-       ++I) {
-    BasicBlock *Candidate = (*I)->getBlock();
-    // A node always immediate-dominates its children on the dominator
-    // tree.
-    if (IsAcceptableTarget(Inst, Candidate, DT, LI))
-      SuccToSinkTo = Candidate;
+  // Find the nearest common dominator of all users as the candidate.
+  BasicBlock *BB = Inst->getParent();
+  for (Use &U : Inst->uses()) {
+    Instruction *UseInst = cast<Instruction>(U.getUser());
+    BasicBlock *UseBlock = UseInst->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
+      // PHI nodes use the operand in the predecessor block, not the block with
+      // the PHI.
+      unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
+      UseBlock = PN->getIncomingBlock(Num);
+    }
+    if (SuccToSinkTo)
+      SuccToSinkTo = DT.findNearestCommonDominator(SuccToSinkTo, UseBlock);
+    else
+      SuccToSinkTo = UseBlock;
+    // The current basic block needs to dominate the candidate.
+    if (!DT.dominates(BB, SuccToSinkTo))
+      return false;
   }
 
-  // If no suitable postdominator was found, look at all the successors and
-  // decide which one we should sink to, if any.
-  for (succ_iterator I = succ_begin(Inst->getParent()),
-      E = succ_end(Inst->getParent()); I != E && !SuccToSinkTo; ++I) {
-    if (IsAcceptableTarget(Inst, *I, DT, LI))
-      SuccToSinkTo = *I;
+  if (SuccToSinkTo) {
+    // The nearest common dominator may be in a parent loop of BB, which may not
+    // be beneficial. Find an ancestor.
+    while (SuccToSinkTo != BB &&
+           !IsAcceptableTarget(Inst, SuccToSinkTo, DT, LI))
+      SuccToSinkTo = DT.getNode(SuccToSinkTo)->getIDom()->getBlock();
+    if (SuccToSinkTo == BB)
+      SuccToSinkTo = nullptr;
   }
 
   // If we couldn't find a block to sink to, ignore this instruction.
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -240,15 +240,15 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
-; GCN-IR-NEXT:  BB0_7: ; %Flow7
+; GCN-IR-NEXT:  BB0_7: ; %udiv-end
 ; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[8:9], s[2:3]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = sdiv i64 %x, %y
@@ -411,26 +411,26 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v10
 ; GCN-IR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v14, v7, v0, vcc
-; GCN-IR-NEXT:    v_sub_i32_e32 v11, vcc, v13, v14
-; GCN-IR-NEXT:    v_subb_u32_e64 v12, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[11:12]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[4:5], 63, v[11:12]
+; GCN-IR-NEXT:    v_sub_i32_e32 v7, vcc, v13, v14
+; GCN-IR-NEXT:    v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[7:8]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[4:5], 63, v[7:8]
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[6:7], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v18, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v5
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, v10, 0, s[6:7]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v12, v10, 0, s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], s[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v15, v18
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v9, 0, s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, 1, v11
-; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, 0, v12, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v0, s[4:5], 63, v11
-; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[16:17], v[11:12]
+; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, 1, v7
+; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, 0, v8, vcc
+; GCN-IR-NEXT:    v_sub_i32_e64 v0, s[4:5], 63, v7
+; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[16:17], v[7:8]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[9:10], v0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
@@ -480,14 +480,14 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:  BB1_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[7:8], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v12, v3
+; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v3
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v11, v2
 ; GCN-IR-NEXT:  BB1_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v2, v5, v4
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GCN-IR-NEXT:    v_xor_b32_e32 v3, v7, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v3, v12, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
@@ -1111,7 +1111,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
-; GCN-IR-NEXT:  BB9_7: ; %Flow4
+; GCN-IR-NEXT:  BB9_7: ; %udiv-end
 ; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[6:7], s[2:3]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -1341,9 +1341,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s3, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = sdiv i64 24, %x
diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll
@@ -397,9 +397,9 @@ endif:
 }
 
 ; FUNC-LABEL: setcc-i1-and-xor
-; GCN-DAG: v_cmp_ge_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_cmp_le_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
-; GCN: s_and_b64 s[2:3], [[A]], [[B]]
+; GCN-DAG: v_cmp_nge_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
+; GCN-DAG: v_cmp_nle_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
+; GCN: s_or_b64 s[2:3], [[A]], [[B]]
 define amdgpu_kernel void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 {
 bb0:
   %tmp5 = fcmp oge float %cond, 0.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -221,11 +221,8 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
 ; SI-NEXT:    s_and_b64 vcc, exec, s[14:15]
 ; SI-NEXT:    s_cbranch_vccz BB3_13
 ; SI-NEXT:  ; %bb.10: ; %for.cond.preheader
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, 0x3e8
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, s8, v0
-; SI-NEXT:    s_and_b64 vcc, exec, vcc
-; SI-NEXT:    s_cbranch_vccz BB3_13
+; SI-NEXT:    s_cmpk_lt_i32 s8, 0x3e8
+; SI-NEXT:    s_cbranch_scc0 BB3_13
 ; SI-NEXT:  ; %bb.11: ; %for.body
 ; SI-NEXT:    s_and_b64 vcc, exec, 0
 ; SI-NEXT:  BB3_12: ; %self.loop
@@ -295,10 +292,8 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
 ; FLAT-NEXT:    s_and_b64 vcc, exec, s[14:15]
 ; FLAT-NEXT:    s_cbranch_vccz BB3_13
 ; FLAT-NEXT:  ; %bb.10: ; %for.cond.preheader
-; FLAT-NEXT:    v_mov_b32_e32 v0, 0x3e8
-; FLAT-NEXT:    v_cmp_lt_i32_e32 vcc, s8, v0
-; FLAT-NEXT:    s_and_b64 vcc, exec, vcc
-; FLAT-NEXT:    s_cbranch_vccz BB3_13
+; FLAT-NEXT:    s_cmpk_lt_i32 s8, 0x3e8
+; FLAT-NEXT:    s_cbranch_scc0 BB3_13
 ; FLAT-NEXT:  ; %bb.11: ; %for.body
 ; FLAT-NEXT:    s_and_b64 vcc, exec, 0
 ; FLAT-NEXT:  BB3_12: ; %self.loop
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll