Skip to content

Commit 62423c1

Browse files
committed
[MachinePipeliner] Fix loop-carried dependencies analysis
In the current MachinePipeliner, several loop-carried edges are missed, which can result in generating invalid code. At least the following loop-carried dependencies can be missed:

- Memory dependencies from top to bottom.
  - Example:
    ```
    for (int i=1; i<n; i++) { a[i] = ...; a[i-1] = ...; }
    ```
- Store-to-store dependencies.
- Store-to-load dependencies.
- Output (write-after-write) dependencies.
- Use of alias analysis results that are valid only within a single iteration.
  - Example:
    ```
    void f(double * restrict a, double * restrict b); ... for (int i=0; i<n; i++) f(ptr0, ptr1); // will be inlined
    ```

This patch adds these dependencies and fixes the resulting correctness issues. In addition, the current analysis can add excessive dependencies because it represents a loop-carried memory dependence from bottom to top with a forward-direction (i.e., top-to-bottom) edge. This patch also removes such dependencies.
1 parent bcc1e58 commit 62423c1

25 files changed

+1734
-558
lines changed

llvm/include/llvm/CodeGen/MachinePipeliner.h

Lines changed: 45 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242

4343
#include "llvm/ADT/STLExtras.h"
4444
#include "llvm/ADT/SetVector.h"
45+
#include "llvm/Analysis/AliasAnalysis.h"
4546
#include "llvm/CodeGen/DFAPacketizer.h"
4647
#include "llvm/CodeGen/MachineDominators.h"
4748
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
@@ -190,6 +191,33 @@ class SwingSchedulerDDGEdge {
190191
bool ignoreDependence(bool IgnoreAnti) const;
191192
};
192193

194+
/// Holds loop-carried dependence edges, keyed by SUnit, in two separate
/// tables: output dependencies and order dependencies.
/// NOTE(review): whether the key is the source or the destination of each
/// edge is not visible in this header -- confirm against the definition of
/// addLoopCarriedDependences().
struct LoopCarriedEdges {
195+
/// For one key SUnit: maps a Register to the set of SUnits that have an
/// output dependence through that register.
using OutputDep = SmallDenseMap<Register, SmallSetVector<SUnit *, 4>>;
196+
/// For one key SUnit: the set of SUnits it has an order dependence with.
using OrderDep = SmallSetVector<SUnit *, 8>;
197+
/// Table of per-SUnit output dependencies.
using OutputDepsType = DenseMap<SUnit *, OutputDep>;
198+
/// Table of per-SUnit order dependencies.
using OrderDepsType = DenseMap<SUnit *, OrderDep>;
199+
200+
OutputDepsType OutputDeps;
201+
OrderDepsType OrderDeps;
202+
203+
/// Look up \p Key in OutputDeps. Returns a pointer to the stored entry, or
/// nullptr when \p Key has no entry. Never inserts into the map.
const OutputDep *getOutputDepOrNull(SUnit *Key) const {
204+
auto Ite = OutputDeps.find(Key);
205+
if (Ite == OutputDeps.end())
206+
return nullptr;
207+
return &Ite->second;
208+
}
209+
210+
/// Look up \p Key in OrderDeps. Returns a pointer to the stored entry, or
/// nullptr when \p Key has no entry. Never inserts into the map.
const OrderDep *getOrderDepOrNull(SUnit *Key) const {
211+
auto Ite = OrderDeps.find(Key);
212+
if (Ite == OrderDeps.end())
213+
return nullptr;
214+
return &Ite->second;
215+
}
216+
217+
/// Debug dump for \p SU; declared here, defined elsewhere.
/// NOTE(review): presumably prints the edges recorded for \p SU, using
/// \p TRI and \p MRI to format register names -- confirm in the .cpp file.
void dump(SUnit *SU, const TargetRegisterInfo *TRI,
218+
const MachineRegisterInfo *MRI) const;
219+
};
220+
193221
/// Represents dependencies between instructions. This class is a wrapper of
194222
/// `SUnits` and its dependencies to manipulate back-edges in a natural way.
195223
/// Currently it only supports back-edges via PHI, which are expressed as
@@ -217,8 +245,12 @@ class SwingSchedulerDDG {
217245
SwingSchedulerDDGEdges &getEdges(const SUnit *SU);
218246
const SwingSchedulerDDGEdges &getEdges(const SUnit *SU) const;
219247

248+
void addLoopCarriedEdges(std::vector<SUnit> &SUnits,
249+
const LoopCarriedEdges &LCE);
250+
220251
public:
221-
SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU, SUnit *ExitSU);
252+
SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU, SUnit *ExitSU,
253+
const LoopCarriedEdges &LCE);
222254

223255
const EdgesType &getInEdges(const SUnit *SU) const;
224256

@@ -285,22 +317,14 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
285317
BitVector Blocked;
286318
SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
287319
SmallVector<SmallVector<int, 4>, 16> AdjK;
288-
// Node to Index from ScheduleDAGTopologicalSort
289-
std::vector<int> *Node2Idx;
320+
SmallVector<BitVector, 16> LoopCarried;
290321
unsigned NumPaths = 0u;
291-
static unsigned MaxPaths;
292322

293323
public:
294-
Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo)
295-
: SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {
296-
Node2Idx = new std::vector<int>(SUs.size());
297-
unsigned Idx = 0;
298-
for (const auto &NodeNum : Topo)
299-
Node2Idx->at(NodeNum) = Idx++;
300-
}
324+
/// Bind the circuit finder to \p SUs and size the per-node structures
/// (Blocked, B, AdjK) to one slot per SUnit. LoopCarried is left
/// default-constructed here.
Circuits(std::vector<SUnit> &SUs)
325+
: SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {}
301326
Circuits &operator=(const Circuits &other) = delete;
302327
Circuits(const Circuits &other) = delete;
303-
~Circuits() { delete Node2Idx; }
304328

305329
/// Reset the data structures used in the circuit algorithm.
306330
void reset() {
@@ -310,9 +334,9 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
310334
NumPaths = 0;
311335
}
312336

313-
void createAdjacencyStructure(SwingSchedulerDAG *DAG);
337+
void createAdjacencyStructure(const SwingSchedulerDDG *DDG);
314338
bool circuit(int V, int S, NodeSetType &NodeSets,
315-
const SwingSchedulerDAG *DAG, bool HasBackedge = false);
339+
const SwingSchedulerDDG *DDG, bool HasLoopCarriedEdge = false);
316340
void unblock(int U);
317341
};
318342

@@ -366,7 +390,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
366390
return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
367391
}
368392

369-
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const;
393+
bool hasLoopCarriedMemDep(const MachineInstr *Src, const MachineInstr *Dst,
394+
BatchAAResults *BAA) const;
370395

371396
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
372397

@@ -391,7 +416,9 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
391416
const SwingSchedulerDDG *getDDG() const { return DDG.get(); }
392417

393418
private:
394-
void addLoopCarriedDependences(AAResults *AA);
419+
LoopCarriedEdges addLoopCarriedDependences(AAResults *AA);
420+
AliasResult::Kind checkLoopCarriedMemDep(const MachineInstr *Src,
421+
const MachineInstr *Dst) const;
395422
void updatePhiDependences();
396423
void changeDependences();
397424
unsigned calculateResMII();
@@ -409,7 +436,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
409436
void computeNodeOrder(NodeSetType &NodeSets);
410437
void checkValidNodeOrder(const NodeSetType &Circuits) const;
411438
bool schedulePipeline(SMSchedule &Schedule);
412-
bool computeDelta(MachineInstr &MI, unsigned &Delta) const;
439+
bool computeDelta(const MachineInstr &MI, unsigned &Delta) const;
413440
MachineInstr *findDefInLoop(Register Reg);
414441
bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
415442
unsigned &OffsetPos, unsigned &NewBase,
@@ -437,7 +464,7 @@ class NodeSet {
437464
using iterator = SetVector<SUnit *>::const_iterator;
438465

439466
NodeSet() = default;
440-
NodeSet(iterator S, iterator E, const SwingSchedulerDAG *DAG)
467+
NodeSet(iterator S, iterator E, const SwingSchedulerDDG *DDG)
441468
: Nodes(S, E), HasRecurrence(true) {
442469
// Calculate the latency of this node set.
443470
// Example to demonstrate the calculation:
@@ -453,7 +480,6 @@ class NodeSet {
453480
//
454481
// Hold a map from each SUnit in the circle to the maximum distance from the
455482
// source node by only considering the nodes.
456-
const SwingSchedulerDDG *DDG = DAG->getDDG();
457483
DenseMap<SUnit *, unsigned> SUnitToDistance;
458484
for (auto *Node : Nodes)
459485
SUnitToDistance[Node] = 0;
@@ -470,22 +496,6 @@ class NodeSet {
470496
}
471497
}
472498
}
473-
// Handle a back-edge in loop carried dependencies
474-
SUnit *FirstNode = Nodes[0];
475-
SUnit *LastNode = Nodes[Nodes.size() - 1];
476-
477-
for (auto &PI : DDG->getInEdges(LastNode)) {
478-
// If we have an order dep that is potentially loop carried then a
479-
// back-edge exists between the last node and the first node that isn't
480-
// modeled in the DAG. Handle it manually by adding 1 to the distance of
481-
// the last node.
482-
if (PI.getSrc() != FirstNode || !PI.isOrderDep() ||
483-
!DAG->isLoopCarriedDep(PI))
484-
continue;
485-
SUnitToDistance[FirstNode] =
486-
std::max(SUnitToDistance[FirstNode], SUnitToDistance[LastNode] + 1);
487-
}
488-
489499
// The latency is the distance from the source node to itself.
490500
Latency = SUnitToDistance[Nodes.front()];
491501
}

0 commit comments

Comments
 (0)