Skip to content

Commit e94a5a4

Browse files
committed
[MachinePipeliner] Make Recurrence MII More Accurate
The current RecMII calculation yields a value larger than necessary. This patch refines the calculation to produce a tighter bound.
1 parent a3d4187 commit e94a5a4

File tree

3 files changed

+64
-26
lines changed

3 files changed

+64
-26
lines changed

llvm/include/llvm/CodeGen/MachinePipeliner.h

Lines changed: 54 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
197197
}
198198

199199
void createAdjacencyStructure(SwingSchedulerDAG *DAG);
200-
bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
200+
bool circuit(int V, int S, NodeSetType &NodeSets,
201+
const SwingSchedulerDAG *DAG, bool HasBackedge = false);
201202
void unblock(int U);
202203
};
203204

@@ -260,7 +261,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
260261
return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
261262
}
262263

263-
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc = true);
264+
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep,
265+
bool isSucc = true) const;
264266

265267
/// The distance function, which indicates that operation V of iteration I
266268
/// depends on operations U of iteration I-distance.
@@ -311,7 +313,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
311313
void computeNodeOrder(NodeSetType &NodeSets);
312314
void checkValidNodeOrder(const NodeSetType &Circuits) const;
313315
bool schedulePipeline(SMSchedule &Schedule);
314-
bool computeDelta(MachineInstr &MI, unsigned &Delta);
316+
bool computeDelta(MachineInstr &MI, unsigned &Delta) const;
315317
MachineInstr *findDefInLoop(Register Reg);
316318
bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
317319
unsigned &OffsetPos, unsigned &NewBase,
@@ -339,24 +341,58 @@ class NodeSet {
339341
using iterator = SetVector<SUnit *>::const_iterator;
340342

341343
NodeSet() = default;
342-
NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {
343-
Latency = 0;
344-
for (const SUnit *Node : Nodes) {
345-
DenseMap<SUnit *, unsigned> SuccSUnitLatency;
346-
for (const SDep &Succ : Node->Succs) {
347-
auto SuccSUnit = Succ.getSUnit();
348-
if (!Nodes.count(SuccSUnit))
344+
NodeSet(iterator S, iterator E, const SwingSchedulerDAG *DAG)
345+
: Nodes(S, E), HasRecurrence(true) {
346+
// Calculate the latency of this node set.
347+
// Example to demonstrate the calculation:
348+
// Given: N0 -> N1 -> N2 -> N0
349+
// Edges:
350+
// (N0 -> N1, 3)
351+
// (N0 -> N1, 5)
352+
// (N1 -> N2, 2)
353+
// (N2 -> N0, 1)
354+
// The total latency which is a lower bound of the recurrence MII is the
355+
// longest path from N0 back to N0 given only the edges of this node set.
356+
// In this example, the latency is: 5 + 2 + 1 = 8.
357+
//
358+
// Hold a map from each SUnit in the circuit to the maximum distance from the
359+
// source node, considering only the nodes in this node set.
360+
DenseMap<SUnit *, unsigned> SUnitToDistance;
361+
for (auto *Node : Nodes)
362+
SUnitToDistance[Node] = 0;
363+
364+
for (unsigned I = 1, E = Nodes.size(); I <= E; ++I) {
365+
SUnit *U = Nodes[I - 1];
366+
SUnit *V = Nodes[I % Nodes.size()];
367+
for (const SDep &Succ : U->Succs) {
368+
SUnit *SuccSUnit = Succ.getSUnit();
369+
if (V != SuccSUnit)
349370
continue;
350-
unsigned CurLatency = Succ.getLatency();
351-
unsigned MaxLatency = 0;
352-
if (SuccSUnitLatency.count(SuccSUnit))
353-
MaxLatency = SuccSUnitLatency[SuccSUnit];
354-
if (CurLatency > MaxLatency)
355-
SuccSUnitLatency[SuccSUnit] = CurLatency;
371+
if (SUnitToDistance[U] + Succ.getLatency() > SUnitToDistance[V]) {
372+
SUnitToDistance[V] = SUnitToDistance[U] + Succ.getLatency();
373+
}
356374
}
357-
for (auto SUnitLatency : SuccSUnitLatency)
358-
Latency += SUnitLatency.second;
359375
}
376+
// Handle a back-edge between a store and a load
377+
SUnit *FirstNode = Nodes[0];
378+
SUnit *LastNode = Nodes[Nodes.size() - 1];
379+
380+
if (LastNode->getInstr()->mayStore() && FirstNode->getInstr()->mayLoad()) {
381+
for (auto &PI : LastNode->Preds) {
382+
// If we have an order dep between a load and a store that is
383+
// potentially loop carried then a back-edge exists between the last
384+
// node and the first node that isn't modeled in the DAG. Handle it
385+
// manually by adding 1 to the distance of the last node.
386+
if (PI.getSUnit() != FirstNode || PI.getKind() != SDep::Order ||
387+
!DAG->isLoopCarriedDep(LastNode, PI, false))
388+
continue;
389+
SUnitToDistance[FirstNode] =
390+
std::max(SUnitToDistance[FirstNode], SUnitToDistance[LastNode] + 1);
391+
}
392+
}
393+
394+
// The latency is the distance from the source node to itself.
395+
Latency = SUnitToDistance[Nodes.front()];
360396
}
361397

362398
bool insert(SUnit *SU) { return Nodes.insert(SU); }

llvm/lib/CodeGen/MachinePipeliner.cpp

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1706,6 +1706,7 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
17061706
/// Identify an elementary circuit in the dependence graph starting at the
17071707
/// specified node.
17081708
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1709+
const SwingSchedulerDAG *DAG,
17091710
bool HasBackedge) {
17101711
SUnit *SV = &SUnits[V];
17111712
bool F = false;
@@ -1719,12 +1720,13 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
17191720
continue;
17201721
if (W == S) {
17211722
if (!HasBackedge)
1722-
NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1723+
NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
17231724
F = true;
17241725
++NumPaths;
17251726
break;
1726-
} else if (!Blocked.test(W)) {
1727-
if (circuit(W, S, NodeSets,
1727+
}
1728+
if (!Blocked.test(W)) {
1729+
if (circuit(W, S, NodeSets, DAG,
17281730
Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
17291731
F = true;
17301732
}
@@ -1767,9 +1769,9 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
17671769
Circuits Cir(SUnits, Topo);
17681770
// Create the adjacency structure.
17691771
Cir.createAdjacencyStructure(this);
1770-
for (int i = 0, e = SUnits.size(); i != e; ++i) {
1772+
for (int I = 0, E = SUnits.size(); I != E; ++I) {
17711773
Cir.reset();
1772-
Cir.circuit(i, i, NodeSets);
1774+
Cir.circuit(I, I, NodeSets, this);
17731775
}
17741776

17751777
// Change the dependences back so that we've created a DAG again.
@@ -2565,7 +2567,7 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
25652567

25662568
/// Return true if we can compute the amount the instruction changes
25672569
/// during each iteration. Set Delta to the amount of the change.
2568-
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2570+
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) const {
25692571
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
25702572
const MachineOperand *BaseOp;
25712573
int64_t Offset;
@@ -2719,7 +2721,7 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
27192721
/// potentially. A dependence is loop carried if the destination defines a value
27202722
/// that may be used or defined by the source in a subsequent iteration.
27212723
bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
2722-
bool isSucc) {
2724+
bool isSucc) const {
27232725
if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
27242726
Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
27252727
return false;

llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,8 +222,8 @@ body: |
222222
; CHECK-NEXT: [[t2MLS1:%[0-9]+]]:rgpr = t2MLS [[t2SDIV1]], [[t2LDRSHi12_1]], [[t2LDRSH_PRE2]], 14 /* CC::al */, $noreg
223223
; CHECK-NEXT: [[t2UXTH2:%[0-9]+]]:rgpr = t2UXTH [[t2SDIV1]], 0, 14 /* CC::al */, $noreg
224224
; CHECK-NEXT: [[t2LDRHi12_9:%[0-9]+]]:gprnopc = t2LDRHi12 [[t2LDRSH_PRE3]], 6, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.uglygep6, align 2, !tbaa !9)
225-
; CHECK-NEXT: t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr
226225
; CHECK-NEXT: [[t2UXTH3:%[0-9]+]]:rgpr = t2UXTH [[t2MLS1]], 0, 14 /* CC::al */, $noreg
226+
; CHECK-NEXT: t2CMPrr [[t2LDRHi12_8]], [[t2UXTH2]], 14 /* CC::al */, $noreg, implicit-def $cpsr
227227
; CHECK-NEXT: [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[PHI1]](tied-def 0)
228228
; CHECK-NEXT: t2CMPrr [[t2LDRHi12_9]], [[t2UXTH3]], 14 /* CC::al */, $noreg, implicit-def $cpsr
229229
; CHECK-NEXT: [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[t2ADDri4]], 1, 1 /* CC::ne */, $cpsr, $noreg, implicit [[t2ADDri4]](tied-def 0)

0 commit comments

Comments
 (0)