Skip to content

Commit 7886fe6

Browse files
wpangfxbot
authored and committed
Reimplement the root-node computation as a post-processing step. As
a by-product, this fixes a bug when URB write pairing is on. Remove dead members and functions. Change-Id: I4b8c47e857fdd8c3f5b4de6a04706c97ce8553c7
1 parent 4fbd57a commit 7886fe6

File tree

3 files changed

+27
-113
lines changed

3 files changed

+27
-113
lines changed

visa/LocalScheduler/LocalScheduler_G4IR.cpp

Lines changed: 23 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -296,11 +296,6 @@ void LocalScheduler::localScheduling()
296296
jitInfo->BBNum = i;
297297
}
298298

299-
void G4_BB_Schedule::setOptimumConsecutiveSends()
300-
{
301-
optimumConsecutiveSends = m_options->getuInt32Option(vISA_NumPackedSends);
302-
}
303-
304299
void G4_BB_Schedule::dumpSchedule(G4_BB *bb)
305300
{
306301
const char *asmName = nullptr;
@@ -390,12 +385,10 @@ void G4_BB_Schedule::dumpSchedule(G4_BB *bb)
390385
G4_BB_Schedule::G4_BB_Schedule(G4_Kernel* k, Mem_Manager &m, G4_BB *block,
391386
int dddTimer, int schTimer, uint32_t& totalCycle,
392387
const Options *options, const LatencyTable &LT)
393-
: kernel(k), mem(m), bb(block), curINum(0),
388+
: kernel(k), mem(m), bb(block),
394389
lastCycle(0), sendStallCycle(0),
395390
sequentialCycle(0), m_options(options)
396391
{
397-
setOptimumConsecutiveSends();
398-
399392
// we use local id in the scheduler for determining two instructions' original ordering
400393
bb->resetLocalId();
401394

@@ -995,12 +988,10 @@ DDD::DDD(Mem_Manager &m, G4_BB* bb, const Options *options,
995988
: mem(m), m_options(options), LT(lt), kernel(k)
996989
{
997990
Node* lastBarrier = NULL;
998-
numOfPairs = 0;
999-
numSendsScheduled = 0;
1000991
totalGRFNum = m_options->getuInt32Option(vISA_TotalGRFNum);
1001992
HWthreadsPerEU = m_options->getuInt32Option(vISA_HWThreadNumberPerEU);
1002-
1003993
useMTLatencies = m_options->getOption(vISA_useMultiThreadedLatencies);
994+
bool BTIIsRestrict = m_options->getOption(vISA_ReorderDPSendToDifferentBti);
1004995

1005996
GRF_BUCKET = 0;
1006997
ACC_BUCKET = GRF_BUCKET + totalGRFNum;
@@ -1019,20 +1010,16 @@ DDD::DDD(Mem_Manager &m, G4_BB* bb, const Options *options,
10191010
std::list<G4_INST*>::reverse_iterator iInst(bb->rbegin()), iInstEnd(bb->rend());
10201011
std::vector<BucketDescr> BDvec;
10211012

1022-
bool BTIIsRestrict = m_options->getOption(vISA_ReorderDPSendToDifferentBti);
10231013

10241014
for (int nodeId = (int)(bb->size() - 1); iInst != iInstEnd; ++iInst, nodeId--)
10251015
{
10261016
Node *node = nullptr;
10271017
// If we have a pair of instructions to be mapped on a single DAG node:
10281018
node = new (mem)Node(nodeId, *iInst, depEdgeAllocator, LT);
10291019
allNodes.push_back(node);
1030-
1031-
assert(node->getInstructions()->size() == 1);
1032-
G4_INST *curInst = *node->getInstructions()->begin();
1020+
G4_INST *curInst = node->getInstructions()->front();
10331021
bool hasIndir = false;
10341022
BDvec.clear();
1035-
unsigned NumRegs = m_options->getuInt32Option(vISA_TotalGRFNum);
10361023

10371024

10381025
// Get buckets for all physical registers assigned in curInst
@@ -1058,19 +1045,14 @@ DDD::DDD(Mem_Manager &m, G4_BB* bb, const Options *options,
10581045
for (auto it = LB.begin(), ite = LB.end(); it != ite; ++it) {
10591046
BucketNode *BNode = *it;
10601047
Node* liveNode = BNode->node;
1061-
if (!liveNode->hasPreds())
1048+
if (liveNode->preds.empty())
10621049
{
10631050
createAddEdge(node, liveNode, depType);
10641051
}
10651052
}
10661053
LB.clearAllLive();
1067-
1068-
if (depType == DEP_LABEL)
1054+
if (lastBarrier)
10691055
{
1070-
Roots.push_back(node);
1071-
}
1072-
1073-
if (lastBarrier) {
10741056
createAddEdge(node, lastBarrier, lastBarrier->isBarrier());
10751057
}
10761058

@@ -1174,38 +1156,6 @@ DDD::DDD(Mem_Manager &m, G4_BB* bb, const Options *options,
11741156
// Insert this node into the graph.
11751157
InsertNode(node);
11761158
}
1177-
1178-
// We have no label in this block. Need to initialize roots to traverse the DAG correctly.
1179-
if (Roots.size() == 0)
1180-
{
1181-
// Iterate over all buckets and push all live instructions
1182-
// in to Root list
1183-
for (auto it = LB.begin(), ite = LB.end(); it != ite; ++it)
1184-
{
1185-
Node *curLiveNode = (*it)->node;
1186-
if (!curLiveNode->hasPreds())
1187-
{
1188-
if (std::find(Roots.begin(), Roots.end(), curLiveNode) == Roots.end())
1189-
{
1190-
// Insert Root node only if it hasnt yet
1191-
// been inserted to Root list.
1192-
Roots.push_back(curLiveNode);
1193-
}
1194-
}
1195-
}
1196-
1197-
// It is possible that first inst of a BB is a barrier
1198-
// If the inst does not have any operands then it will not be present in
1199-
// any bucket. Also since it is a barrier, all other buckets will have been
1200-
// emptied. So previous loop will not find any Roots. This will cause
1201-
// list scheduler to have 0-size ready list. The fix is to check whether
1202-
// size of Roots is zero and inserting barrier in to Roots if it is.
1203-
if (Roots.size() == 0) {
1204-
MUST_BE_TRUE(lastBarrier != NULL,
1205-
"Size of Roots list was 0 and no barrier was found");
1206-
Roots.push_back(lastBarrier);
1207-
}
1208-
}
12091159
}
12101160

12111161
// Return TRUE if there is a dependency fromNode->toNode
@@ -1486,7 +1436,6 @@ void DDD::pairTypedWriteOrURBWriteNodes(G4_BB *bb) {
14861436
}
14871437

14881438
// 2. Join nodes that need pairing
1489-
uint32_t cntPairs = 0;
14901439
for (auto&& pair : instrPairs) {
14911440
Node *firstNode = pair.first;
14921441
Node *secondNode = pair.second;
@@ -1500,15 +1449,6 @@ void DDD::pairTypedWriteOrURBWriteNodes(G4_BB *bb) {
15001449
{
15011450
// A. Move the deps of the second node to the first.
15021451
moveDeps(secondNode, firstNode);
1503-
secondNode->setDead();
1504-
1505-
// if second node is not root, first node may not be either
1506-
// as it has inherited second node's predecessors
1507-
auto result2 = std::find(Roots.begin(), Roots.end(), secondNode);
1508-
if (result2 == std::end(Roots))
1509-
{
1510-
Roots.remove(firstNode);
1511-
}
15121452

15131453
// B. We add the second instruction to the first node.
15141454
assert(firstNode->getInstructions()->size() == 1);
@@ -1518,10 +1458,21 @@ void DDD::pairTypedWriteOrURBWriteNodes(G4_BB *bb) {
15181458
{
15191459
firstInstr->setOptionOn(InstOpt_Atomic);
15201460
}
1521-
cntPairs++;
1461+
1462+
// C. Cleanup the paired node.
1463+
secondNode->clear();
15221464
}
15231465
}
1524-
numOfPairs = cntPairs;
1466+
}
1467+
1468+
void DDD::collectRoots()
1469+
{
1470+
Roots.clear();
1471+
for (auto N : allNodes) {
1472+
if (N->preds.empty() && !N->getInstructions()->empty()) {
1473+
Roots.push_back(N);
1474+
}
1475+
}
15251476
}
15261477

15271478
void DDD::setPriority(Node *pred, const Edge &edge)
@@ -1795,7 +1746,7 @@ struct criticalCmp
17951746
else
17961747
{
17971748
return (*n1->getInstructions())[0]->getLocalId()
1798-
> (*n2->getInstructions())[0]->getLocalId();
1749+
> (*n2->getInstructions())[0]->getLocalId();
17991750
}
18001751
}
18011752
}
@@ -1822,11 +1773,9 @@ uint32_t DDD::listSchedule(G4_BB_Schedule *schedule)
18221773
// that is their earliest cycle is >= than the current schedule cycle.
18231774
std::priority_queue<Node *, std::vector<Node *>, earlyCmp> preReadyQueue(SS);
18241775

1825-
for (NODE_LIST_ITER node_it = Roots.begin();
1826-
node_it != Roots.end();
1827-
node_it++)
1828-
{
1829-
preReadyQueue.push(*node_it);
1776+
collectRoots();
1777+
for (auto N : Roots) {
1778+
preReadyQueue.push(N);
18301779
}
18311780

18321781
// The scheduler's clock.
@@ -1869,7 +1818,7 @@ uint32_t DDD::listSchedule(G4_BB_Schedule *schedule)
18691818
// Pointer to node to be scheduled.
18701819
Node *scheduled = readyList.top();
18711820
readyList.pop();
1872-
1821+
18731822
// try to avoid b2b math if possible as there are pipeline stalls
18741823
if (scheduled->getInstructions()->front()->isMath() &&
18751824
lastScheduled && lastScheduled->getInstructions()->front()->isMath())
@@ -1996,7 +1945,6 @@ uint32_t DDD::listSchedule(G4_BB_Schedule *schedule)
19961945
preReadyQueue.push(succ);
19971946
}
19981947
}
1999-
schedule->curINum++;
20001948

20011949
// Increment the scheduler's clock after each scheduled node
20021950
currCycle += scheduled->getOccupancy();

visa/LocalScheduler/LocalScheduler_G4IR.h

Lines changed: 4 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -108,23 +108,12 @@ class Node
108108
// This is used to avoid WAW hazards during scheduling.
109109
int wSubreg;
110110

111-
// due to coalescing we may end up with dead node, and since we use a vector
112-
// it's not easy to delete it, so we just mark the node as dead.
113-
// it's not needed for correctness as a dead node has no pred/succ and won't be scheduled,
114-
// we just leave it for debugging
115-
bool m_isDead = false;
116-
117111
public:
118112
static const uint32_t SCHED_CYCLE_UNINIT = UINT_MAX;
119113
static const int NO_SUBREG = INT_MAX;
120114
static const int PRIORITY_UNINIT = -1;
121115

122-
// WARNING!!!: hasPreds() will return the right value ONLY before
123-
// the node's predecessors get scheduled (we are reusing the element
124-
// predsNotScheduled for two things to save memory).
125-
bool hasPreds() { return predsNotScheduled != 0; };
126116
unsigned getNodeID() const{ return nodeID; };
127-
128117
bool isTransitiveDep(Node *edgeDst);
129118
bool hasTransitiveEdgeToBarrier;
130119

@@ -146,10 +135,6 @@ class Node
146135
const LatencyTable &LT);
147136
~Node()
148137
{
149-
if (succs.size() > 0)
150-
{
151-
succs.clear();
152-
}
153138
}
154139
void *operator new(size_t sz, Mem_Manager &m) { return m.alloc(sz); }
155140
const std::vector<G4_INST *> *getInstructions() const { return &instVec; }
@@ -174,9 +159,8 @@ class Node
174159
void setWritesToSubreg(int reg) { wSubreg = reg; }
175160
int writesToSubreg() { return wSubreg; }
176161
void addPairInstr(G4_INST *inst) { instVec.push_back(inst); }
162+
void clear() { instVec.clear(); }
177163
void deletePred(Node *pred);
178-
bool isDead() const { return m_isDead; }
179-
void setDead() { m_isDead = true; }
180164

181165
friend class DDD;
182166
friend class G4_BB_Schedule;
@@ -256,10 +240,6 @@ class DDD {
256240
int HWthreadsPerEU;
257241
const LatencyTable LT;
258242

259-
// Counter that holds num of sends scheduled by
260-
// list scheduler just before current instruction.
261-
uint32_t numSendsScheduled;
262-
263243
int GRF_BUCKET;
264244
int ACC_BUCKET;
265245
int FLAG0_BUCKET;
@@ -273,13 +253,14 @@ class DDD {
273253
bool useMTLatencies;
274254
G4_Kernel* kernel;
275255

256+
// Gather all initial ready nodes.
257+
void collectRoots();
258+
276259
public:
277260
typedef std::pair<Node *, Node *> instrPair_t;
278261
typedef std::vector<instrPair_t> instrPairVec_t;
279262
NODE_LIST Nodes, Roots;
280-
NODE_VECT pstOrder, originalOrder;
281263
void moveDeps(Node *fromNode, Node *toNode);
282-
uint32_t numOfPairs;
283264
void pairTypedWriteOrURBWriteNodes(G4_BB *bb);
284265

285266

@@ -296,12 +277,8 @@ class DDD {
296277
}
297278
Nodes.clear();
298279
}
299-
300-
pstOrder.clear();
301-
originalOrder.clear();
302280
}
303281
void *operator new(size_t sz, Mem_Manager &m) { return m.alloc(sz); }
304-
void InsertRoot(Node *root) { Roots.push_back(root); }
305282
void InsertNode(Node *node) { Nodes.push_back(node); }
306283
void dumpNodes(G4_BB *bb);
307284
void dumpDagDot(G4_BB *bb);
@@ -317,12 +294,6 @@ class DDD {
317294
bool getBucketDescrs(Node *inst, std::vector<BucketDescr> &bucketDescrs);
318295

319296
const Options *m_options;
320-
321-
uint32_t getNumSendsScheduled() { return numSendsScheduled; }
322-
void recordConsecutiveSendsScheduled(uint32_t howmany) {
323-
numSendsScheduled = howmany;
324-
}
325-
326297
uint32_t getEdgeLatency_old(Node *node, DepType depT);
327298
uint32_t getEdgeLatency_new(Node *node, DepType depT);
328299
uint32_t getEdgeLatency(Node *node, DepType depT);
@@ -335,12 +306,9 @@ class G4_BB_Schedule {
335306
DDD *ddd;
336307
const Options *m_options;
337308
G4_Kernel *kernel;
338-
uint32_t optimumConsecutiveSends;
339-
void setOptimumConsecutiveSends();
340309

341310
public:
342311
std::vector<Node *> scheduledNodes;
343-
unsigned curINum;
344312
unsigned lastCycle;
345313
unsigned sendStallCycle;
346314
unsigned sequentialCycle;
@@ -352,7 +320,6 @@ class G4_BB_Schedule {
352320
void *operator new(size_t sz, Mem_Manager &m){ return m.alloc(sz); }
353321
// Dumps the schedule
354322
void emit(std::ostream &);
355-
uint32_t getOptimumConsecutiveSends() { return optimumConsecutiveSends; }
356323
void dumpSchedule(G4_BB *bb);
357324
G4_BB *getBB() const { return bb; };
358325
G4_Kernel *getKernel() const { return kernel; }

visa/include/VISAOptions.def

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,6 @@ DEF_VISA_OPTION(vISA_ReorderDPSendToDifferentBti, ET_BOOL, "-nodpsendreorder", U
140140
DEF_VISA_OPTION(vISA_WAWSubregHazardAvoidance, ET_BOOL, "-noWAWSubregHazardAvoidance", UNUSED, true)
141141
DEF_VISA_OPTION(vISA_useMultiThreadedLatencies, ET_BOOL, "-dontUseMultiThreadedLatencies", UNUSED, true)
142142
DEF_VISA_OPTION(vISA_SchedulerWindowSize, ET_INT32, "-schedulerwindow", "USAGE: -schedulerwindow <window-size>\n", 4096)
143-
DEF_VISA_OPTION(vISA_NumPackedSends, ET_INT32, "-numpackedsends", "USAGE: -numpackedsends <num>\n", 1)
144143
DEF_VISA_OPTION(vISA_UnifiedSendCycle, ET_INT32, "-unifiedSendCycle", "USAGE: -unifiedSendCycle <cycle>\n", 0)
145144
DEF_VISA_OPTION(vISA_HWThreadNumberPerEU, ET_INT32, "-HWThreadNumberPerEU", "USAGE: -HWThreadNumberPerEU <num>\n", 7)
146145
DEF_VISA_OPTION(vISA_NoAtomicSend, ET_BOOL, "-noAtomicSend", UNUSED, false)

0 commit comments

Comments
 (0)