Skip to content

Commit 0214b16

Browse files
committed
[MachineCombiner] Add a pass to reassociate chains of accumulation instructions into a tree
This pass is designed to increase ILP by performing accumulation into multiple registers. It currently supports only the UABAL accumulation instruction, but can easily be extended to support additional instructions.
1 parent b56e716 commit 0214b16

File tree

7 files changed

+663
-32
lines changed

7 files changed

+663
-32
lines changed

llvm/include/llvm/CodeGen/MachineCombinerPattern.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ enum MachineCombinerPattern : unsigned {
3232
REASSOC_AX_YB,
3333
REASSOC_XA_BY,
3434
REASSOC_XA_YB,
35+
ACC_CHAIN,
3536

3637
TARGET_PATTERN_START
3738
};

llvm/include/llvm/CodeGen/TargetInstrInfo.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include <cassert>
3636
#include <cstddef>
3737
#include <cstdint>
38+
#include <optional>
3839
#include <utility>
3940
#include <vector>
4041

@@ -1275,6 +1276,39 @@ class TargetInstrInfo : public MCInstrInfo {
12751276
return false;
12761277
}
12771278

1279+
/// Find chains of accumulations that can be rewritten as a tree for increased
1280+
/// ILP.
///
/// \P Root is the candidate last instruction of an accumulation chain. When a
/// suitable chain is found, MachineCombinerPattern::ACC_CHAIN is appended to
/// \P Patterns and true is returned; otherwise \P Patterns is left untouched
/// and false is returned.
1281+
bool getAccumulatorReassociationPatterns(
1282+
MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns) const;
1283+
1284+
/// Find the chain of accumulator instructions that ends at \P CurrentInstr
1285+
/// and return the registers they define in \P Chain. The chain is followed
/// upwards through operand 1 of each instruction, so \P Chain is ordered from
/// the last instruction of the chain to the first. \P Chain is left unchanged
/// when getAccumulationStartOpcode() has no answer for the opcode of
/// \P CurrentInstr.
1286+
void getAccumulatorChain(MachineInstr *CurrentInstr,
1287+
SmallVectorImpl<Register> &Chain) const;
1288+
1289+
/// Return true when \P Opcode is an instruction which performs
1290+
/// accumulation into one of its operand registers. The default
/// implementation returns false for every opcode.
1291+
virtual bool isAccumulationOpcode(unsigned Opcode) const { return false; }
1292+
1293+
/// Returns an opcode which defines the accumulator used by \P Opcode.
/// The default implementation returns std::nullopt, which makes
/// getAccumulatorChain() collect nothing.
1294+
virtual std::optional<unsigned>
1295+
getAccumulationStartOpcode(unsigned Opcode) const {
1296+
return std::nullopt;
1297+
}
1298+
1299+
/// Returns the opcode of the two-input instruction that reduceAccumulatorTree()
/// builds to combine two partial accumulators produced by
/// \P AccumulatorOpCode. The default implementation returns std::nullopt.
virtual std::optional<unsigned>
1300+
getReduceOpcodeForAccumulator(unsigned int AccumulatorOpCode) const {
1301+
return std::nullopt;
1302+
}
1303+
1304+
/// Reduces branches of the accumulator tree into a single register.
///
/// Performs one reduction round: registers of \P RegistersToReduce are paired
/// up, one reduce instruction (see getReduceOpcodeForAccumulator()) is built
/// per pair and appended to \P InsInstrs, and \P RegistersToReduce is replaced
/// by the registers that still have to be combined. When exactly two
/// registers are passed in, the reduce instruction defines \P ResultReg
/// instead of a fresh virtual register. Newly created virtual registers are
/// recorded in \P InstrIdxForVirtReg, and \P Root supplies the opcode, debug
/// metadata, flags and register class used for the new instructions.
1305+
void reduceAccumulatorTree(SmallVectorImpl<Register> &RegistersToReduce,
1306+
SmallVectorImpl<MachineInstr *> &InsInstrs,
1307+
MachineFunction &MF, MachineInstr &Root,
1308+
MachineRegisterInfo &MRI,
1309+
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
1310+
Register ResultReg) const;
1311+
12781312
/// Return the inverse operation opcode if it exists for \P Opcode (e.g. add
12791313
/// for sub and vice versa).
12801314
virtual std::optional<unsigned> getInverseOpcode(unsigned Opcode) const {

llvm/lib/CodeGen/TargetInstrInfo.cpp

Lines changed: 258 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
//===----------------------------------------------------------------------===//
1212

1313
#include "llvm/CodeGen/TargetInstrInfo.h"
14+
#include "llvm/ADT/SmallSet.h"
1415
#include "llvm/ADT/StringExtras.h"
1516
#include "llvm/BinaryFormat/Dwarf.h"
1617
#include "llvm/CodeGen/MachineCombinerPattern.h"
@@ -42,6 +43,19 @@ static cl::opt<bool> DisableHazardRecognizer(
4243
"disable-sched-hazard", cl::Hidden, cl::init(false),
4344
cl::desc("Disable hazard detection during preRA scheduling"));
4445

46+
// Master switch (-acc-reassoc, on by default): when false,
// getAccumulatorReassociationPatterns() never reports a pattern.
static cl::opt<bool> EnableAccReassociation(
47+
"acc-reassoc", cl::Hidden, cl::init(true),
48+
cl::desc("Enable reassociation of accumulation chains"));
49+
50+
// -acc-min-depth (default 8): chains with fewer collected registers than this
// are rejected by getAccumulatorReassociationPatterns().
static cl::opt<unsigned int>
51+
MinAccumulatorDepth("acc-min-depth", cl::Hidden, cl::init(8),
52+
cl::desc("Minimum length of accumulator chains "
53+
"required for the optimization to kick in"));
54+
55+
// -acc-max-width (default 3): upper bound on the number of parallel
// accumulators; the rewrite uses min(Log2_32(chain depth), this value) and
// asserts that the option is greater than 1.
static cl::opt<unsigned int> MaxAccumulatorWidth(
56+
"acc-max-width", cl::Hidden, cl::init(3),
57+
cl::desc("Maximum number of branches in the accumulator tree"));
58+
4559
TargetInstrInfo::~TargetInstrInfo() = default;
4660

4761
const TargetRegisterClass*
@@ -899,6 +913,152 @@ bool TargetInstrInfo::isReassociationCandidate(const MachineInstr &Inst,
899913
hasReassociableSibling(Inst, Commuted);
900914
}
901915

916+
// Utility routine that checks if \param MO is defined by an
917+
// \param CombineOpc instruction in the basic block \param MBB
918+
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
919+
unsigned CombineOpc) {
920+
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
921+
MachineInstr *MI = nullptr;
922+
923+
if (MO.isReg() && MO.getReg().isVirtual())
924+
MI = MRI.getUniqueVRegDef(MO.getReg());
925+
// And it needs to be in the trace (otherwise, it won't have a depth).
926+
if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
927+
return false;
928+
// Must only used by the user we combine with.
929+
if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
930+
return false;
931+
932+
return true;
933+
}
934+
935+
// A chain of accumulation instructions will be selected IFF:
936+
// 1. All the accumulation instructions in the chain have the same opcode,
937+
// besides the first that has a slightly different opcode because it does
938+
// not perform the accumulation, just defines it.
939+
// 2. All the instructions in the chain are combinable (have a single use
940+
// which itself is part of the chain).
941+
// 3. Meets the required minimum length.
942+
void TargetInstrInfo::getAccumulatorChain(
943+
MachineInstr *CurrentInstr, SmallVectorImpl<Register> &Chain) const {
944+
// Walk up the chain of accumulation instructions and collect them in the
945+
// vector.
946+
MachineBasicBlock &MBB = *CurrentInstr->getParent();
947+
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
948+
unsigned AccumulatorOpcode = CurrentInstr->getOpcode();
949+
std::optional<unsigned> ChainStartOpCode =
950+
getAccumulationStartOpcode(AccumulatorOpcode);
951+
952+
if (!ChainStartOpCode.has_value())
953+
return;
954+
955+
while (CurrentInstr &&
956+
(canCombine(MBB, CurrentInstr->getOperand(1), AccumulatorOpcode) ||
957+
canCombine(MBB, CurrentInstr->getOperand(1),
958+
ChainStartOpCode.value()))) {
959+
Chain.push_back(CurrentInstr->getOperand(0).getReg());
960+
CurrentInstr = MRI.getUniqueVRegDef(CurrentInstr->getOperand(1).getReg());
961+
}
962+
963+
// Add the instruction at the top of the chain.
964+
if (CurrentInstr->getOpcode() == ChainStartOpCode.value())
965+
Chain.push_back(CurrentInstr->getOperand(0).getReg());
966+
}
967+
968+
/// Find chains of accumulations that can be rewritten as a tree for increased
969+
/// ILP.
970+
bool TargetInstrInfo::getAccumulatorReassociationPatterns(
971+
MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns) const {
972+
if (!EnableAccReassociation)
973+
return false;
974+
975+
unsigned Opc = Root.getOpcode();
976+
if (!isAccumulationOpcode(Opc))
977+
return false;
978+
979+
// Verify that this is the end of the chain.
980+
MachineBasicBlock &MBB = *Root.getParent();
981+
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
982+
if (!MRI.hasOneNonDBGUser(Root.getOperand(0).getReg()))
983+
return false;
984+
985+
auto User = MRI.use_instr_begin(Root.getOperand(0).getReg());
986+
if (User->getOpcode() == Opc)
987+
return false;
988+
989+
// Walk up the use chain and collect the reduction chain.
990+
SmallVector<Register, 32> Chain;
991+
getAccumulatorChain(&Root, Chain);
992+
993+
// Reject chains which are too short to be worth modifying.
994+
if (Chain.size() < MinAccumulatorDepth)
995+
return false;
996+
997+
// Check if the MBB this instruction is a part of contains any other chains.
998+
// If so, don't apply it.
999+
SmallSet<Register, 32> ReductionChain(Chain.begin(), Chain.end());
1000+
for (const auto &I : MBB) {
1001+
if (I.getOpcode() == Opc &&
1002+
!ReductionChain.contains(I.getOperand(0).getReg()))
1003+
return false;
1004+
}
1005+
1006+
Patterns.push_back(MachineCombinerPattern::ACC_CHAIN);
1007+
return true;
1008+
}
1009+
1010+
// Reduce branches of the accumulator tree by adding them together.
1011+
void TargetInstrInfo::reduceAccumulatorTree(
1012+
SmallVectorImpl<Register> &RegistersToReduce,
1013+
SmallVectorImpl<MachineInstr *> &InsInstrs, MachineFunction &MF,
1014+
MachineInstr &Root, MachineRegisterInfo &MRI,
1015+
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
1016+
Register ResultReg) const {
1017+
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
1018+
SmallVector<Register, 8> NewRegs;
1019+
1020+
// Get the opcode for the reduction instruction we will need to build.
1021+
// If for some reason it is not defined, early exit and don't apply this.
1022+
std::optional<unsigned> ReduceOpCode =
1023+
getReduceOpcodeForAccumulator(Root.getOpcode());
1024+
1025+
if (!ReduceOpCode.value())
1026+
return;
1027+
1028+
for (unsigned int i = 1; i <= (RegistersToReduce.size() / 2); i += 2) {
1029+
auto RHS = RegistersToReduce[i - 1];
1030+
auto LHS = RegistersToReduce[i];
1031+
Register Dest;
1032+
// If we are reducing 2 registers, reuse the original result register.
1033+
if (RegistersToReduce.size() == 2)
1034+
Dest = ResultReg;
1035+
// Otherwise, create a new virtual register to hold the partial sum.
1036+
else {
1037+
auto NewVR = MRI.createVirtualRegister(
1038+
MRI.getRegClass(Root.getOperand(0).getReg()));
1039+
Dest = NewVR;
1040+
NewRegs.push_back(Dest);
1041+
InstrIdxForVirtReg.insert(std::make_pair(Dest, InsInstrs.size()));
1042+
}
1043+
1044+
// Create the new reduction instruction.
1045+
MachineInstrBuilder MIB =
1046+
BuildMI(MF, MIMetadata(Root), TII->get(ReduceOpCode.value()), Dest)
1047+
.addReg(RHS, getKillRegState(true))
1048+
.addReg(LHS, getKillRegState(true));
1049+
// Copy any flags needed from the original instruction.
1050+
MIB->setFlags(Root.getFlags());
1051+
InsInstrs.push_back(MIB);
1052+
}
1053+
1054+
// If the number of registers to reduce is odd, add the reminaing register to
1055+
// the vector of registers to reduce.
1056+
if (RegistersToReduce.size() % 2 != 0)
1057+
NewRegs.push_back(RegistersToReduce[RegistersToReduce.size() - 1]);
1058+
1059+
RegistersToReduce = NewRegs;
1060+
}
1061+
9021062
// The concept of the reassociation pass is that these operations can benefit
9031063
// from this kind of transformation:
9041064
//
@@ -938,6 +1098,8 @@ bool TargetInstrInfo::getMachineCombinerPatterns(
9381098
}
9391099
return true;
9401100
}
1101+
if (getAccumulatorReassociationPatterns(Root, Patterns))
1102+
return true;
9411103

9421104
return false;
9431105
}
@@ -949,7 +1111,12 @@ bool TargetInstrInfo::isThroughputPattern(unsigned Pattern) const {
9491111

9501112
// Map a combiner pattern to the objective used when evaluating it:
// ACC_CHAIN is evaluated with CombinerObjective::MustReduceDepth, every other
// pattern with CombinerObjective::Default. The line marked `952-` below is the
// unconditional return that this commit removes; the lines marked `+` are its
// replacement.
CombinerObjective
9511113
TargetInstrInfo::getCombinerObjective(unsigned Pattern) const {
952-
return CombinerObjective::Default;
1114+
switch (Pattern) {
1115+
case MachineCombinerPattern::ACC_CHAIN:
1116+
return CombinerObjective::MustReduceDepth;
1117+
default:
1118+
return CombinerObjective::Default;
1119+
}
9531120
}
9541121

9551122
std::pair<unsigned, unsigned>
@@ -1252,19 +1419,99 @@ void TargetInstrInfo::genAlternativeCodeSequence(
12521419
SmallVectorImpl<MachineInstr *> &DelInstrs,
12531420
DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const {
12541421
MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
1422+
MachineBasicBlock &MBB = *Root.getParent();
1423+
MachineFunction &MF = *MBB.getParent();
1424+
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
12551425

1256-
// Select the previous instruction in the sequence based on the input pattern.
1257-
std::array<unsigned, 5> OperandIndices;
1258-
getReassociateOperandIndices(Root, Pattern, OperandIndices);
1259-
MachineInstr *Prev =
1260-
MRI.getUniqueVRegDef(Root.getOperand(OperandIndices[0]).getReg());
1426+
switch (Pattern) {
1427+
case MachineCombinerPattern::REASSOC_AX_BY:
1428+
case MachineCombinerPattern::REASSOC_AX_YB:
1429+
case MachineCombinerPattern::REASSOC_XA_BY:
1430+
case MachineCombinerPattern::REASSOC_XA_YB: {
1431+
// Select the previous instruction in the sequence based on the input
1432+
// pattern.
1433+
std::array<unsigned, 5> OperandIndices;
1434+
getReassociateOperandIndices(Root, Pattern, OperandIndices);
1435+
MachineInstr *Prev =
1436+
MRI.getUniqueVRegDef(Root.getOperand(OperandIndices[0]).getReg());
1437+
1438+
// Don't reassociate if Prev and Root are in different blocks.
1439+
if (Prev->getParent() != Root.getParent())
1440+
return;
12611441

1262-
// Don't reassociate if Prev and Root are in different blocks.
1263-
if (Prev->getParent() != Root.getParent())
1264-
return;
1442+
reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, OperandIndices,
1443+
InstIdxForVirtReg);
1444+
break;
1445+
}
1446+
case MachineCombinerPattern::ACC_CHAIN: {
1447+
SmallVector<Register, 32> ChainRegs;
1448+
getAccumulatorChain(&Root, ChainRegs);
1449+
unsigned int Depth = ChainRegs.size();
1450+
assert(MaxAccumulatorWidth > 1 &&
1451+
"Max accumulator width set to illegal value");
1452+
unsigned int MaxWidth = Log2_32(Depth) < MaxAccumulatorWidth
1453+
? Log2_32(Depth)
1454+
: MaxAccumulatorWidth;
1455+
1456+
// Walk down the chain and rewrite it as a tree.
1457+
for (auto IndexedReg : llvm::enumerate(llvm::reverse(ChainRegs))) {
1458+
// No need to rewrite the first node, it is already perfect as it is.
1459+
if (IndexedReg.index() == 0)
1460+
continue;
1461+
1462+
MachineInstr *Instr = MRI.getUniqueVRegDef(IndexedReg.value());
1463+
MachineInstrBuilder MIB;
1464+
Register AccReg;
1465+
if (IndexedReg.index() < MaxWidth) {
1466+
// Now we need to create new instructions for the first row.
1467+
AccReg = Instr->getOperand(0).getReg();
1468+
MIB = BuildMI(
1469+
MF, MIMetadata(*Instr),
1470+
TII->get(MRI.getUniqueVRegDef(ChainRegs.back())->getOpcode()),
1471+
AccReg)
1472+
.addReg(Instr->getOperand(2).getReg(),
1473+
getKillRegState(Instr->getOperand(2).isKill()))
1474+
.addReg(Instr->getOperand(3).getReg(),
1475+
getKillRegState(Instr->getOperand(3).isKill()));
1476+
} else {
1477+
// For the remaining cases, we need to use an output register of one of
1478+
// the newly inserted instructions as operand 1.
1479+
AccReg = Instr->getOperand(0).getReg() == Root.getOperand(0).getReg()
1480+
? MRI.createVirtualRegister(
1481+
MRI.getRegClass(Root.getOperand(0).getReg()))
1482+
: Instr->getOperand(0).getReg();
1483+
assert(IndexedReg.index() - MaxWidth >= 0);
1484+
auto AccumulatorInput =
1485+
ChainRegs[Depth - (IndexedReg.index() - MaxWidth) - 1];
1486+
MIB = BuildMI(MF, MIMetadata(*Instr), TII->get(Instr->getOpcode()),
1487+
AccReg)
1488+
.addReg(AccumulatorInput, getKillRegState(true))
1489+
.addReg(Instr->getOperand(2).getReg(),
1490+
getKillRegState(Instr->getOperand(2).isKill()))
1491+
.addReg(Instr->getOperand(3).getReg(),
1492+
getKillRegState(Instr->getOperand(3).isKill()));
1493+
}
12651494

1266-
reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, OperandIndices,
1267-
InstIdxForVirtReg);
1495+
MIB->setFlags(Instr->getFlags());
1496+
InstIdxForVirtReg.insert(std::make_pair(AccReg, InsInstrs.size()));
1497+
InsInstrs.push_back(MIB);
1498+
DelInstrs.push_back(Instr);
1499+
}
1500+
1501+
SmallVector<Register, 8> RegistersToReduce;
1502+
for (unsigned i = (InsInstrs.size() - MaxWidth); i < InsInstrs.size();
1503+
++i) {
1504+
auto Reg = InsInstrs[i]->getOperand(0).getReg();
1505+
RegistersToReduce.push_back(Reg);
1506+
}
1507+
1508+
while (RegistersToReduce.size() > 1)
1509+
reduceAccumulatorTree(RegistersToReduce, InsInstrs, MF, Root, MRI,
1510+
InstIdxForVirtReg, Root.getOperand(0).getReg());
1511+
1512+
break;
1513+
}
1514+
}
12681515
}
12691516

12701517
MachineTraceStrategy TargetInstrInfo::getMachineCombinerTraceStrategy() const {

0 commit comments

Comments
 (0)