
Commit 5f81eab

[x86] Teach the EFLAGS copy lowering to handle much more complex control
flow patterns including forks, merges, and even cycles.

This tries to cover a reasonably comprehensive set of patterns that still
don't require PHIs or PHI placement. The coverage was inspired by the
amazing variety of patterns produced when copying EFLAGS and restoring it
to implement Speculative Load Hardening. Without this patch, we simply
cannot make such complex and invasive changes to x86 instruction sequences
due to EFLAGS.

I've added "just" one test, but this test covers many different
complexities and corner cases of this approach. It is actually more
comprehensive, as far as I can tell, than anything that I have encountered
in the wild on SLH. Because the test is so complex, I've tried to give
somewhat thorough comments and an ASCII-art diagram of the control flows
to make it a bit easier to read and maintain long-term.

Differential Revision: https://reviews.llvm.org/D49220

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@336985 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent ed770b6 commit 5f81eab

File tree

2 files changed: +359 -44 lines changed


lib/Target/X86/X86FlagsCopyLowering.cpp

Lines changed: 161 additions & 44 deletions
@@ -27,6 +27,7 @@
 #include "X86Subtarget.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -102,7 +103,7 @@ class X86FlagsCopyLoweringPass : public MachineFunctionPass {
   MachineDominatorTree *MDT;

   CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
-                                  MachineInstr &CopyDefI);
+                                  MachineBasicBlock::iterator CopyDefI);

   unsigned promoteCondToReg(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator TestPos,
@@ -356,9 +357,14 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
     // Nothing to do for a degenerate empty function...
     return false;

+  // Collect the copies in RPO so that when there are chains where a copy is in
+  // turn copied again we visit the first one first. This ensures we can find
+  // viable locations for testing the original EFLAGS that dominate all the
+  // uses across complex CFGs.
   SmallVector<MachineInstr *, 4> Copies;
-  for (MachineBasicBlock &MBB : MF)
-    for (MachineInstr &MI : MBB)
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  for (MachineBasicBlock *MBB : RPOT)
+    for (MachineInstr &MI : *MBB)
       if (MI.getOpcode() == TargetOpcode::COPY &&
           MI.getOperand(0).getReg() == X86::EFLAGS)
         Copies.push_back(&MI);
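Aside on the RPO choice above: in a reverse post-order traversal every block is visited before its successors (back edges aside), so when one EFLAGS copy is itself copied again later, the earlier copy is rewritten first. A minimal, self-contained sketch of the same ordering on a toy CFG of integer block IDs (LLVM's ReversePostOrderTraversal computes this generically via GraphTraits; this toy is an illustration, not the LLVM API):

#include <algorithm>
#include <vector>

// Toy CFG: Succs[B] lists the successor block IDs of block B; block 0 is
// the entry. A block is emitted only after all successors reachable along
// forward edges, so reversing the post-order visits predecessors first.
static void postOrder(int BB, const std::vector<std::vector<int>> &Succs,
                      std::vector<bool> &Visited, std::vector<int> &Order) {
  Visited[BB] = true;
  for (int Succ : Succs[BB])
    if (!Visited[Succ])
      postOrder(Succ, Succs, Visited, Order);
  Order.push_back(BB); // post-order: emit after successors
}

std::vector<int> reversePostOrder(const std::vector<std::vector<int>> &Succs) {
  std::vector<bool> Visited(Succs.size(), false);
  std::vector<int> Order;
  postOrder(/*EntryBB=*/0, Succs, Visited, Order);
  std::reverse(Order.begin(), Order.end()); // RPO: predecessors first
  return Order;
}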
@@ -407,12 +413,99 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
     if (DOp.isDead())
       continue;

-    MachineBasicBlock &TestMBB = *CopyDefI.getParent();
+    MachineBasicBlock *TestMBB = CopyDefI.getParent();
     auto TestPos = CopyDefI.getIterator();
     DebugLoc TestLoc = CopyDefI.getDebugLoc();

     LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());

+    // Walk up across live-in EFLAGS to find where they were actually def'ed.
+    //
+    // This copy's def may just be part of a region of blocks covered by
+    // a single def of EFLAGS and we want to find the top of that region where
+    // possible.
+    //
+    // This is essentially a search for a *candidate* reaching definition
+    // location. We don't need to ever find the actual reaching definition here,
+    // but we want to walk up the dominator tree to find the highest point which
+    // would be viable for such a definition.
+    auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
+                                MachineBasicBlock::iterator End) {
+      // Scan backwards as we expect these to be relatively short and often find
+      // a clobber near the end.
+      return llvm::any_of(
+          llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
+            // Flag any instruction (other than the copy we are
+            // currently rewriting) that defs EFLAGS.
+            return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
+          });
+    };
+    auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
+                                    MachineBasicBlock *EndMBB) {
+      assert(MDT->dominates(BeginMBB, EndMBB) &&
+             "Only support paths down the dominator tree!");
+      SmallPtrSet<MachineBasicBlock *, 4> Visited;
+      SmallVector<MachineBasicBlock *, 4> Worklist;
+      // We terminate at the beginning. No need to scan it.
+      Visited.insert(BeginMBB);
+      Worklist.push_back(EndMBB);
+      do {
+        auto *MBB = Worklist.pop_back_val();
+        for (auto *PredMBB : MBB->predecessors()) {
+          if (!Visited.insert(PredMBB).second)
+            continue;
+          if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
+            return true;
+          // Enqueue this block to walk its predecessors.
+          Worklist.push_back(PredMBB);
+        }
+      } while (!Worklist.empty());
+      // No clobber found along a path from the begin to end.
+      return false;
+    };
+    while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() &&
+           !HasEFLAGSClobber(TestMBB->begin(), TestPos)) {
+      // Find the nearest common dominator of the predecessors, as
+      // that will be the best candidate to hoist into.
+      MachineBasicBlock *HoistMBB =
+          std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(),
+                          *TestMBB->pred_begin(),
+                          [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) {
+                            return MDT->findNearestCommonDominator(LHS, RHS);
+                          });
+
+      // Now we need to scan all predecessors that may be reached along paths to
+      // the hoist block. A clobber anywhere in any of these blocks prevents the
+      // hoist. Note that this even handles loops because we require *no* clobbers.
+      if (HasEFLAGSClobberPath(HoistMBB, TestMBB))
+        break;
+
+      // We also need the terminators to not sneakily clobber flags.
+      if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(),
+                           HoistMBB->instr_end()))
+        break;
+
+      // We found a viable location, hoist our test position to it.
+      TestMBB = HoistMBB;
+      TestPos = TestMBB->getFirstTerminator()->getIterator();
+      // Clear the debug location as it would just be confusing after hoisting.
+      TestLoc = DebugLoc();
+    }
+    LLVM_DEBUG({
+      auto DefIt = llvm::find_if(
+          llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)),
+          [&](MachineInstr &MI) {
+            return MI.findRegisterDefOperand(X86::EFLAGS);
+          });
+      if (DefIt.base() != TestMBB->instr_begin()) {
+        dbgs() << "  Using EFLAGS defined by: ";
+        DefIt->dump();
+      } else {
+        dbgs() << "  Using live-in flags for BB:\n";
+        TestMBB->dump();
+      }
+    });
+
     // While rewriting uses, we buffer jumps and rewrite them in a second pass
     // because doing so will perturb the CFG that we are walking to find the
     // uses in the first place.
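To see the shape of HasEFLAGSClobberPath in isolation, here is a self-contained mirror of the same worklist walk on a toy graph: plain ints stand in for blocks, and a caller-supplied predicate stands in for the per-block EFLAGS scan. This is an illustrative sketch, not the pass's code.

#include <functional>
#include <set>
#include <vector>

// Starting from EndBB, walk predecessor edges until BeginBB and report
// whether any intervening block satisfies the HasClobber predicate.
bool clobberOnSomePath(int BeginBB, int EndBB,
                       const std::vector<std::vector<int>> &Preds,
                       const std::function<bool(int)> &HasClobber) {
  std::set<int> Visited{BeginBB}; // pre-mark the begin block: walk stops there
  std::vector<int> Worklist{EndBB};
  do {
    int BB = Worklist.back();
    Worklist.pop_back();
    for (int Pred : Preds[BB]) {
      if (!Visited.insert(Pred).second)
        continue; // already scanned (this also makes loops terminate)
      if (HasClobber(Pred))
        return true;
      Worklist.push_back(Pred);
    }
  } while (!Worklist.empty());
  return false; // no clobber on any path from BeginBB down to EndBB
}

Because the visited set makes the walk safe in the presence of loops, the hoisting logic above can simply require no clobbers anywhere rather than reasoning about cycles specially.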
@@ -423,52 +516,47 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
     // very few of them and we expect to not revisit the same copy definition
     // many times. If either of those change sufficiently we could build a map
     // of these up front instead.
-    CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);
+    CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos);

     // Collect the basic blocks we need to scan. Typically this will just be
     // a single basic block but we may have to scan multiple blocks if the
     // EFLAGS copy lives into successors.
     SmallVector<MachineBasicBlock *, 2> Blocks;
     SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
     Blocks.push_back(&MBB);
-    VisitedBlocks.insert(&MBB);

     do {
       MachineBasicBlock &UseMBB = *Blocks.pop_back_val();

       // Track if/when we find a kill of the flags in this block.
       bool FlagsKilled = false;

-      // We currently don't do any PHI insertion and so we require that the
-      // test basic block dominates all of the use basic blocks.
-      //
-      // We could in theory do PHI insertion here if it becomes useful by just
-      // taking undef values in along every edge that we don't trace this
-      // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
-      // but still seems like a great deal of complexity.
-      //
-      // Because it is theoretically possible that some earlier MI pass or
-      // other lowering transformation could induce this to happen, we do
-      // a hard check even in non-debug builds here.
-      if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
-        LLVM_DEBUG({
-          dbgs() << "ERROR: Encountered use that is not dominated by our test "
-                    "basic block! Rewriting this would require inserting PHI "
-                    "nodes to track the flag state across the CFG.\n\nTest "
-                    "block:\n";
-          TestMBB.dump();
-          dbgs() << "Use block:\n";
-          UseMBB.dump();
-        });
-        report_fatal_error("Cannot lower EFLAGS copy when original copy def "
-                           "does not dominate all uses.");
-      }
-
-      for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
-                                      : UseMBB.instr_begin(),
+      // In most cases, we walk from the beginning to the end of the block. But
+      // when the block is the same block as the copy is from, we will visit it
+      // twice. The first time we start from the copy and go to the end. The
+      // second time we start from the beginning and go to the copy. This lets
+      // us handle copies inside of cycles.
+      // FIXME: This loop is *super* confusing. This is at least in part
+      // a symptom of all of this routine needing to be refactored into
+      // documentable components. Once done, there may be a better way to write
+      // this loop.
+      for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
+                          ? std::next(CopyI->getIterator())
+                          : UseMBB.instr_begin(),
                MIE = UseMBB.instr_end();
            MII != MIE;) {
         MachineInstr &MI = *MII++;
+        // If we are in the original copy block and encounter either the copy
+        // def or the copy itself, break so that we don't re-process any part of
+        // the block or process the instructions in the range that was copied
+        // over.
+        if (&MI == CopyI || &MI == &CopyDefI) {
+          assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
+                 "Should only encounter these on the second pass over the "
+                 "original block.");
+          break;
+        }
+
         MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
         if (!FlagUse) {
           if (MI.findRegisterDefOperand(X86::EFLAGS)) {
512600

513601
// Otherwise we can just rewrite in-place.
514602
if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
515-
rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
603+
rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
516604
} else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
517605
X86::COND_INVALID) {
518-
rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
606+
rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
519607
} else if (MI.getOpcode() == TargetOpcode::COPY) {
520608
rewriteCopy(MI, *FlagUse, CopyDefI);
521609
} else {
@@ -538,13 +626,13 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
538626
case X86::SETB_C64r:
539627
// Use custom lowering for arithmetic that is merely extending the
540628
// carry flag. We model this as the SETB_C* pseudo instructions.
541-
rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse,
629+
rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
542630
CondRegs);
543631
break;
544632

545633
default:
546634
// Generically handle remaining uses as arithmetic instructions.
547-
rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse,
635+
rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
548636
CondRegs);
549637
break;
550638
}
@@ -564,8 +652,38 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
       // and queue those up for processing.
       for (MachineBasicBlock *SuccMBB : UseMBB.successors())
         if (SuccMBB->isLiveIn(X86::EFLAGS) &&
-            VisitedBlocks.insert(SuccMBB).second)
+            VisitedBlocks.insert(SuccMBB).second) {
+          // We currently don't do any PHI insertion and so we require that the
+          // test basic block dominates all of the use basic blocks. Further, we
+          // can't have a cycle from the test block back to itself as that would
+          // create a cycle requiring a PHI to break it.
+          //
+          // We could in theory do PHI insertion here if it becomes useful by
+          // just taking undef values in along every edge that we don't trace
+          // this EFLAGS copy along. This isn't as bad as fully general PHI
+          // insertion, but still seems like a great deal of complexity.
+          //
+          // Because it is theoretically possible that some earlier MI pass or
+          // other lowering transformation could induce this to happen, we do
+          // a hard check even in non-debug builds here.
+          if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
+            LLVM_DEBUG({
+              dbgs()
+                  << "ERROR: Encountered use that is not dominated by our test "
+                     "basic block! Rewriting this would require inserting PHI "
+                     "nodes to track the flag state across the CFG.\n\nTest "
+                     "block:\n";
+              TestMBB->dump();
+              dbgs() << "Use block:\n";
+              SuccMBB->dump();
+            });
+            report_fatal_error(
+                "Cannot lower EFLAGS copy when original copy def "
+                "does not dominate all uses.");
+          }
+
           Blocks.push_back(SuccMBB);
+        }
     } while (!Blocks.empty());

     // Now rewrite the jumps that use the flags. These we handle specially
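Why the dominance requirement exists can be seen on a small diamond, in the spirit of the ASCII-art diagrams the commit message mentions (an illustrative diagram, not taken from the new test):

//        entry
//        /   \
//   TestMBB   OtherMBB
//        \   /
//        UseMBB
//
// If UseMBB reads the copied flags but is also reachable through OtherMBB,
// the condition registers set in TestMBB do not dominate UseMBB; merging
// them with an undef value from the OtherMBB edge would require a PHI,
// which this pass deliberately refuses to insert. Hence the hard
// report_fatal_error even in non-debug builds.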
@@ -580,7 +698,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
       else
         LastJmpMBB = JmpI->getParent();

-      rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
+      rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
     }

     // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
@@ -604,14 +722,13 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {

 /// Collect any conditions that have already been set in registers so that we
 /// can re-use them rather than adding duplicates.
-CondRegArray
-X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB,
-                                             MachineInstr &CopyDefI) {
+CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
   CondRegArray CondRegs = {};

   // Scan backwards across the range of instructions with live EFLAGS.
-  for (MachineInstr &MI : llvm::reverse(
-           llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) {
+  for (MachineInstr &MI :
+       llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
     X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
     if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() &&
         TRI->isVirtualRegister(MI.getOperand(0).getReg()))
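The backwards scan in collectCondsInRegs generalizes to a small, self-contained pattern (a toy sketch with ints standing in for condition codes, not the pass's code): walk [begin, TestPos) in reverse and keep the latest setter of each condition, so already-materialized conditions can be reused instead of duplicated.

#include <cstddef>
#include <vector>

// CondOfInst[I] is the condition set by instruction I (0 = none/invalid).
// Returns, for each condition code, the 1-based index of its latest setter
// before TestPos, or 0 if the condition was never set in the range.
std::vector<int> collectConds(const std::vector<int> &CondOfInst,
                              std::size_t TestPos, int NumConds) {
  std::vector<int> CondRegs(NumConds, 0);
  for (std::size_t I = TestPos; I-- > 0;) { // reverse over [0, TestPos)
    int Cond = CondOfInst[I];
    if (Cond > 0 && Cond < NumConds && !CondRegs[Cond])
      CondRegs[Cond] = static_cast<int>(I) + 1; // first hit in reverse = latest
  }
  return CondRegs;
}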
