Skip to content

Commit bb260eb

Browse files
authored
[CodeGen] Only deduplicate PHIs on critical edges (#97064)
PHIElim deduplicates identical PHI nodes to reduce the number of copies inserted. There are two cases: 1. Identical PHI nodes are in different blocks. That's the reason for this optimization; this can't be avoided at SSA-level. A necessary prerequisite for this is that the predecessors of all basic blocks (where such a PHI node could occur) are the same. This implies that all (>= 2) predecessors must have multiple successors, i.e. all edges into the block are critical edges. 2. Identical PHI nodes are in the same block. CSE can remove these. There are a few cases, however, where they still occur regardless: - expand-large-div-rem creates PHI nodes with large integers, which get lowered into one PHI per MVT. Later, some identical values (zeroes) get folded, resulting in identical PHI nodes. - peephole-opt occasionally inserts PHIs for the same value. - Some pseudo instruction emitters create redundant PHI nodes (e.g., AVR's insertShift), merging the same values more than once. In any case, this happens rarely and MachineCSE handles most cases anyway, so that PHIElim only gets to see very few of such cases (see changed test files). Currently, all PHI nodes are inserted into a DenseMap that checks equality not by pointer but by operands. This hash map is pretty expensive (hashing itself and the hash map), but only really useful in the first case. Avoid this expensive hashing most of the time by restricting it to basic blocks with only critical input edges. This improves performance for code with many PHI nodes, especially at -O0. (Note that Clang often doesn't generate PHI nodes and -O0 includes no mem2reg. Other compilers always generate PHI nodes.)
1 parent d8c0734 commit bb260eb

File tree

4 files changed

+361
-334
lines changed

4 files changed

+361
-334
lines changed

llvm/lib/CodeGen/PHIElimination.cpp

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,8 @@ namespace {
8383
bool EliminatePHINodes(MachineFunction &MF, MachineBasicBlock &MBB);
8484

8585
void LowerPHINode(MachineBasicBlock &MBB,
86-
MachineBasicBlock::iterator LastPHIIt);
86+
MachineBasicBlock::iterator LastPHIIt,
87+
bool AllEdgesCritical);
8788

8889
/// analyzePHINodes - Gather information about the PHI nodes in
8990
/// here. In particular, we want to map the number of uses of a virtual
@@ -191,7 +192,8 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) {
191192
MRI->leaveSSA();
192193

193194
// Populate VRegPHIUseCount
194-
analyzePHINodes(MF);
195+
if (LV || LIS)
196+
analyzePHINodes(MF);
195197

196198
// Eliminate PHI instructions by inserting copies into predecessor blocks.
197199
for (auto &MBB : MF)
@@ -239,8 +241,20 @@ bool PHIElimination::EliminatePHINodes(MachineFunction &MF,
239241
MachineBasicBlock::iterator LastPHIIt =
240242
std::prev(MBB.SkipPHIsAndLabels(MBB.begin()));
241243

244+
// If all incoming edges are critical, we try to deduplicate identical PHIs so
245+
// that we generate fewer copies. If at any edge is non-critical, we either
246+
// have less than two predecessors (=> no PHIs) or a predecessor has only us
247+
// as a successor (=> identical PHI node can't occur in different block).
248+
bool AllEdgesCritical = MBB.pred_size() >= 2;
249+
for (MachineBasicBlock *Pred : MBB.predecessors()) {
250+
if (Pred->succ_size() < 2) {
251+
AllEdgesCritical = false;
252+
break;
253+
}
254+
}
255+
242256
while (MBB.front().isPHI())
243-
LowerPHINode(MBB, LastPHIIt);
257+
LowerPHINode(MBB, LastPHIIt, AllEdgesCritical);
244258

245259
return true;
246260
}
@@ -267,7 +281,8 @@ static bool allPhiOperandsUndefined(const MachineInstr &MPhi,
267281
}
268282
/// LowerPHINode - Lower the PHI node at the top of the specified block.
269283
void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
270-
MachineBasicBlock::iterator LastPHIIt) {
284+
MachineBasicBlock::iterator LastPHIIt,
285+
bool AllEdgesCritical) {
271286
++NumLowered;
272287

273288
MachineBasicBlock::iterator AfterPHIsIt = std::next(LastPHIIt);
@@ -283,6 +298,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
283298
// Create a new register for the incoming PHI arguments.
284299
MachineFunction &MF = *MBB.getParent();
285300
unsigned IncomingReg = 0;
301+
bool EliminateNow = true; // delay elimination of nodes in LoweredPHIs
286302
bool reusedIncoming = false; // Is IncomingReg reused from an earlier PHI?
287303

288304
// Insert a register to register copy at the top of the current block (but
@@ -297,19 +313,28 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
297313
TII->get(TargetOpcode::IMPLICIT_DEF), DestReg);
298314
else {
299315
// Can we reuse an earlier PHI node? This only happens for critical edges,
300-
// typically those created by tail duplication.
301-
unsigned &entry = LoweredPHIs[MPhi];
302-
if (entry) {
316+
// typically those created by tail duplication. Typically, an identical PHI
317+
// node can't occur, so avoid hashing/storing such PHIs, which is somewhat
318+
// expensive.
319+
unsigned *Entry = nullptr;
320+
if (AllEdgesCritical)
321+
Entry = &LoweredPHIs[MPhi];
322+
if (Entry && *Entry) {
303323
// An identical PHI node was already lowered. Reuse the incoming register.
304-
IncomingReg = entry;
324+
IncomingReg = *Entry;
305325
reusedIncoming = true;
306326
++NumReused;
307327
LLVM_DEBUG(dbgs() << "Reusing " << printReg(IncomingReg) << " for "
308328
<< *MPhi);
309329
} else {
310330
const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg);
311-
entry = IncomingReg = MF.getRegInfo().createVirtualRegister(RC);
331+
IncomingReg = MF.getRegInfo().createVirtualRegister(RC);
332+
if (Entry) {
333+
EliminateNow = false;
334+
*Entry = IncomingReg;
335+
}
312336
}
337+
313338
// Give the target possiblity to handle special cases fallthrough otherwise
314339
PHICopy = TII->createPHIDestinationCopy(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
315340
IncomingReg, DestReg);
@@ -445,11 +470,13 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
445470
}
446471

447472
// Adjust the VRegPHIUseCount map to account for the removal of this PHI node.
448-
for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
449-
if (!MPhi->getOperand(i).isUndef()) {
450-
--VRegPHIUseCount[BBVRegPair(
451-
MPhi->getOperand(i + 1).getMBB()->getNumber(),
452-
MPhi->getOperand(i).getReg())];
473+
if (LV || LIS) {
474+
for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
475+
if (!MPhi->getOperand(i).isUndef()) {
476+
--VRegPHIUseCount[BBVRegPair(
477+
MPhi->getOperand(i + 1).getMBB()->getNumber(),
478+
MPhi->getOperand(i).getReg())];
479+
}
453480
}
454481
}
455482

@@ -646,7 +673,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
646673
}
647674

648675
// Really delete the PHI instruction now, if it is not in the LoweredPHIs map.
649-
if (reusedIncoming || !IncomingReg) {
676+
if (EliminateNow) {
650677
if (LIS)
651678
LIS->RemoveMachineInstrFromMaps(*MPhi);
652679
MF.deleteMachineInstr(MPhi);

llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -804,6 +804,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
804804
; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec
805805
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_XOR_B64 $exec, -1, implicit-def dead $scc
806806
; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
807+
; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec
807808
; GFX90A-NEXT: S_BRANCH %bb.61
808809
; GFX90A-NEXT: {{ $}}
809810
; GFX90A-NEXT: bb.58:
@@ -886,11 +887,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
886887
; GFX90A-NEXT: {{ $}}
887888
; GFX90A-NEXT: bb.61.Flow31:
888889
; GFX90A-NEXT: successors: %bb.62(0x80000000)
889-
; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
890+
; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
890891
; GFX90A-NEXT: {{ $}}
891892
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def $scc
892893
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0
893-
; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec
894894
; GFX90A-NEXT: {{ $}}
895895
; GFX90A-NEXT: bb.62.Flow30:
896896
; GFX90A-NEXT: successors: %bb.56(0x80000000)

0 commit comments

Comments
 (0)