Reimplement the root node computation as a post-processing step. As

wpan · gfxbot · commit 7886fe60ff41 · 2018-10-12T12:13:29.000-07:00
a by-product, this fixes a bug when URB write pairing is on.
Also remove dead members and functions.

Change-Id: I4b8c47e857fdd8c3f5b4de6a04706c97ce8553c7
diff --git a/visa/LocalScheduler/LocalScheduler_G4IR.cpp b/visa/LocalScheduler/LocalScheduler_G4IR.cpp
@@ -296,11 +296,6 @@ void LocalScheduler::localScheduling()
     jitInfo->BBNum = i;
 }
 
-void G4_BB_Schedule::setOptimumConsecutiveSends()
-{
-    optimumConsecutiveSends = m_options->getuInt32Option(vISA_NumPackedSends);
-}
-
 void G4_BB_Schedule::dumpSchedule(G4_BB *bb)
 {
     const char *asmName = nullptr;
@@ -390,12 +385,10 @@ void G4_BB_Schedule::dumpSchedule(G4_BB *bb)
 G4_BB_Schedule::G4_BB_Schedule(G4_Kernel* k, Mem_Manager &m, G4_BB *block,
     int dddTimer, int schTimer, uint32_t& totalCycle,
     const Options *options, const LatencyTable &LT)
-    : kernel(k), mem(m), bb(block), curINum(0),
+    : kernel(k), mem(m), bb(block),
     lastCycle(0), sendStallCycle(0),
     sequentialCycle(0), m_options(options)
 {
-    setOptimumConsecutiveSends();
-
     // we use local id in the scheduler for determining two instructions' original ordering
     bb->resetLocalId();
 
@@ -995,12 +988,10 @@ DDD::DDD(Mem_Manager &m, G4_BB* bb, const Options *options,
     : mem(m), m_options(options), LT(lt), kernel(k)
 {
     Node* lastBarrier = NULL;
-    numOfPairs = 0;
-    numSendsScheduled = 0;
     totalGRFNum = m_options->getuInt32Option(vISA_TotalGRFNum);
     HWthreadsPerEU = m_options->getuInt32Option(vISA_HWThreadNumberPerEU);
-
     useMTLatencies = m_options->getOption(vISA_useMultiThreadedLatencies);
+    bool BTIIsRestrict = m_options->getOption(vISA_ReorderDPSendToDifferentBti);
 
     GRF_BUCKET = 0;
     ACC_BUCKET = GRF_BUCKET + totalGRFNum;
@@ -1019,20 +1010,16 @@ DDD::DDD(Mem_Manager &m, G4_BB* bb, const Options *options,
     std::list<G4_INST*>::reverse_iterator iInst(bb->rbegin()), iInstEnd(bb->rend());
     std::vector<BucketDescr> BDvec;
 
-    bool BTIIsRestrict = m_options->getOption(vISA_ReorderDPSendToDifferentBti);
 
     for (int nodeId = (int)(bb->size() - 1); iInst != iInstEnd; ++iInst, nodeId--)
     {
         Node *node = nullptr;
         // If we have a pair of instructions to be mapped on a single DAG node:
         node = new (mem)Node(nodeId, *iInst, depEdgeAllocator, LT);
         allNodes.push_back(node);
-
-        assert(node->getInstructions()->size() == 1);
-        G4_INST *curInst = *node->getInstructions()->begin();
+        G4_INST *curInst = node->getInstructions()->front();
         bool hasIndir = false;
         BDvec.clear();
-        unsigned NumRegs = m_options->getuInt32Option(vISA_TotalGRFNum);
 
 
         // Get buckets for all physical registers assigned in curInst
@@ -1058,19 +1045,14 @@ DDD::DDD(Mem_Manager &m, G4_BB* bb, const Options *options,
             for (auto it = LB.begin(), ite = LB.end(); it != ite; ++it) {
                 BucketNode *BNode = *it;
                 Node* liveNode = BNode->node;
-                if (!liveNode->hasPreds())
+                if (liveNode->preds.empty())
                 {
                     createAddEdge(node, liveNode, depType);
                 }
             }
             LB.clearAllLive();
-
-            if (depType == DEP_LABEL)
+            if (lastBarrier)
             {
-                Roots.push_back(node);
-            }
-
-            if (lastBarrier) {
                 createAddEdge(node, lastBarrier, lastBarrier->isBarrier());
             }
 
@@ -1174,38 +1156,6 @@ DDD::DDD(Mem_Manager &m, G4_BB* bb, const Options *options,
         // Insert this node into the graph.
         InsertNode(node);
     }
-
-    // We have no label in this block. Need to initialize roots to traverse the DAG correctly.
-    if (Roots.size() == 0)
-    {
-        // Iterate over all buckets and push all live instructions
-        // in to Root list
-        for (auto it = LB.begin(), ite = LB.end(); it != ite; ++it)
-        {
-            Node *curLiveNode = (*it)->node;
-            if (!curLiveNode->hasPreds())
-            {
-                if (std::find(Roots.begin(), Roots.end(), curLiveNode) == Roots.end())
-                {
-                    // Insert Root node only if it hasnt yet
-                    // been inserted to Root list.
-                    Roots.push_back(curLiveNode);
-                }
-            }
-        }
-
-        // It is possible that first inst of a BB is a barrier
-        // If the inst does not have any operands then it will not be present in
-        // any bucket. Also since it is a barrier, all other buckets will have been
-        // emptied. So previous loop will not find any Roots. This will cause
-        // list scheduler to have 0-size ready list. The fix is to check whether
-        // size of Roots is zero and inserting barrier in to Roots if it is.
-        if (Roots.size() == 0) {
-            MUST_BE_TRUE(lastBarrier != NULL,
-                "Size of Roots list was 0 and no barrier was found");
-            Roots.push_back(lastBarrier);
-        }
-    }
 }
 
 // Return TRUE if there is a dependency fromNode->toNode
@@ -1486,7 +1436,6 @@ void DDD::pairTypedWriteOrURBWriteNodes(G4_BB *bb) {
     }
 
     // 2. Join nodes that need pairing
-    uint32_t cntPairs = 0;
     for (auto&& pair : instrPairs) {
         Node *firstNode = pair.first;
         Node *secondNode = pair.second;
@@ -1500,15 +1449,6 @@ void DDD::pairTypedWriteOrURBWriteNodes(G4_BB *bb) {
         {
             // A. move the deps of seconde node to the first.
             moveDeps(secondNode, firstNode);
-            secondNode->setDead();
-
-            // if second node is not root, first node may not be either 
-            // as it has inherited second node's predecessors
-            auto result2 = std::find(Roots.begin(), Roots.end(), secondNode);
-            if (result2 == std::end(Roots))
-            {
-                Roots.remove(firstNode);
-            }
 
             // B. We add the second instruction to the first node.
             assert(firstNode->getInstructions()->size() == 1);
@@ -1518,10 +1458,21 @@ void DDD::pairTypedWriteOrURBWriteNodes(G4_BB *bb) {
             {
                 firstInstr->setOptionOn(InstOpt_Atomic);
             }
-            cntPairs++;
+
+            // C. Cleanup the paired node.
+            secondNode->clear();
         }
     }
-    numOfPairs = cntPairs;
+}
+
+void DDD::collectRoots()
+{
+    // Roots are the nodes with no predecessors that still hold instructions.
+    Roots.clear();
+    for (Node *candidate : allNodes) {
+        if (!candidate->preds.empty() || candidate->getInstructions()->empty()) continue;
+        Roots.push_back(candidate);
+    }
 }
 
 void DDD::setPriority(Node *pred, const Edge &edge)
@@ -1795,7 +1746,7 @@ struct criticalCmp
             else
             {
                 return (*n1->getInstructions())[0]->getLocalId()
-            > (*n2->getInstructions())[0]->getLocalId();
+                     > (*n2->getInstructions())[0]->getLocalId();
             }
         }
     }
@@ -1822,11 +1773,9 @@ uint32_t DDD::listSchedule(G4_BB_Schedule *schedule)
     // that is their earliest cycle is >= than the current schedule cycle.
     std::priority_queue<Node *, std::vector<Node *>, earlyCmp> preReadyQueue(SS);
 
-    for (NODE_LIST_ITER node_it = Roots.begin();
-        node_it != Roots.end();
-        node_it++)
-    {
-        preReadyQueue.push(*node_it);
+    collectRoots();
+    for (auto N : Roots) {
+        preReadyQueue.push(N);
     }
 
     // The scheduler's clock.
@@ -1869,7 +1818,7 @@ uint32_t DDD::listSchedule(G4_BB_Schedule *schedule)
         // Pointer to node to be scheduled.
         Node *scheduled = readyList.top();
         readyList.pop();
-        	
+
         // try to avoid b2b math if possible as there are pipeline stalls
         if (scheduled->getInstructions()->front()->isMath() &&
             lastScheduled && lastScheduled->getInstructions()->front()->isMath())
@@ -1996,7 +1945,6 @@ uint32_t DDD::listSchedule(G4_BB_Schedule *schedule)
                 preReadyQueue.push(succ);
             }
         }
-        schedule->curINum++;
 
         // Increment the scheduler's clock after each scheduled node
         currCycle += scheduled->getOccupancy();
diff --git a/visa/LocalScheduler/LocalScheduler_G4IR.h b/visa/LocalScheduler/LocalScheduler_G4IR.h
@@ -108,23 +108,12 @@ class Node
     // This is used to avoid WAW hazzards during scheduling.
     int wSubreg;
 
-    // due to coalescing we may end up with dead node, and since we use a vector
-    // it's not easy to delete it, so we just mark the node as dead.
-    // it's not needed for correctness as a dead node has no pred/succ and won't be scheduled,
-    // we just leave it for debugging
-    bool m_isDead = false;
-
 public:
     static const uint32_t SCHED_CYCLE_UNINIT = UINT_MAX;
     static const int NO_SUBREG = INT_MAX;
     static const int PRIORITY_UNINIT = -1;
 
-    // WARNING!!!: hasPreds() will return the right value ONLY before
-    // the node's predecessors get scheduled (we are reusing the element
-    // predsNotScheduled for two things to save memory).
-    bool hasPreds() { return predsNotScheduled != 0;  };
     unsigned getNodeID() const{ return nodeID; };
-
     bool isTransitiveDep(Node *edgeDst);
     bool hasTransitiveEdgeToBarrier;
 
@@ -146,10 +135,6 @@ class Node
         const LatencyTable &LT);
     ~Node()
     {
-        if (succs.size() > 0)
-        {
-            succs.clear();
-        }
     }
     void *operator new(size_t sz, Mem_Manager &m) { return m.alloc(sz); }
     const std::vector<G4_INST *> *getInstructions() const { return &instVec; }
@@ -174,9 +159,8 @@ class Node
     void setWritesToSubreg(int reg) { wSubreg = reg; }
     int writesToSubreg() { return wSubreg; }
     void addPairInstr(G4_INST *inst) { instVec.push_back(inst); }
+    void clear() { instVec.clear(); }
     void deletePred(Node *pred);
-    bool isDead() const { return m_isDead; }
-    void setDead() { m_isDead = true; }
 
     friend class DDD;
     friend class G4_BB_Schedule;
@@ -256,10 +240,6 @@ class DDD {
     int HWthreadsPerEU;
     const LatencyTable LT;
 
-    // Counter that holds num of sends scheduled by
-    // list scheduler just before current instruction.
-    uint32_t numSendsScheduled;
-
     int GRF_BUCKET;
     int ACC_BUCKET;
     int FLAG0_BUCKET;
@@ -273,13 +253,14 @@ class DDD {
     bool useMTLatencies;
     G4_Kernel* kernel;
 
+    // Gather all initial ready nodes.
+    void collectRoots();
+
 public:
     typedef std::pair<Node *, Node *> instrPair_t;
     typedef std::vector<instrPair_t> instrPairVec_t;
     NODE_LIST Nodes, Roots;
-    NODE_VECT pstOrder, originalOrder;
     void moveDeps(Node *fromNode, Node *toNode);
-    uint32_t numOfPairs;
     void pairTypedWriteOrURBWriteNodes(G4_BB *bb);
 
 
@@ -296,12 +277,8 @@ class DDD {
             }
             Nodes.clear();
         }
-
-        pstOrder.clear();
-        originalOrder.clear();
     }
     void *operator new(size_t sz, Mem_Manager &m) { return m.alloc(sz); }
-    void InsertRoot(Node *root) { Roots.push_back(root); }
     void InsertNode(Node *node) { Nodes.push_back(node); }
     void dumpNodes(G4_BB *bb);
     void dumpDagDot(G4_BB *bb);
@@ -317,12 +294,6 @@ class DDD {
     bool getBucketDescrs(Node *inst, std::vector<BucketDescr> &bucketDescrs);
 
     const Options *m_options;
-
-    uint32_t getNumSendsScheduled() { return numSendsScheduled; }
-    void recordConsecutiveSendsScheduled(uint32_t howmany) {
-      numSendsScheduled = howmany;
-    }
-
     uint32_t getEdgeLatency_old(Node *node, DepType depT);
     uint32_t getEdgeLatency_new(Node *node, DepType depT);
     uint32_t getEdgeLatency(Node *node, DepType depT);
@@ -335,12 +306,9 @@ class G4_BB_Schedule {
     DDD *ddd;
     const Options *m_options;
     G4_Kernel *kernel;
-    uint32_t optimumConsecutiveSends;
-    void setOptimumConsecutiveSends();
 
 public:
     std::vector<Node *> scheduledNodes;
-    unsigned curINum;
     unsigned lastCycle;
     unsigned sendStallCycle;
     unsigned sequentialCycle;
@@ -352,7 +320,6 @@ class G4_BB_Schedule {
     void *operator new(size_t sz, Mem_Manager &m){ return m.alloc(sz); }
     // Dumps the schedule
     void emit(std::ostream &);
-    uint32_t getOptimumConsecutiveSends() { return optimumConsecutiveSends; }
     void dumpSchedule(G4_BB *bb);
     G4_BB *getBB() const { return bb; };
     G4_Kernel *getKernel() const { return kernel; }
diff --git a/visa/include/VISAOptions.def b/visa/include/VISAOptions.def
@@ -140,7 +140,6 @@ DEF_VISA_OPTION(vISA_ReorderDPSendToDifferentBti, ET_BOOL, "-nodpsendreorder", U
 DEF_VISA_OPTION(vISA_WAWSubregHazardAvoidance,    ET_BOOL, "-noWAWSubregHazardAvoidance", UNUSED, true)
 DEF_VISA_OPTION(vISA_useMultiThreadedLatencies,   ET_BOOL, "-dontUseMultiThreadedLatencies", UNUSED, true)
 DEF_VISA_OPTION(vISA_SchedulerWindowSize,         ET_INT32, "-schedulerwindow", "USAGE: -schedulerwindow <window-size>\n", 4096)
-DEF_VISA_OPTION(vISA_NumPackedSends,    ET_INT32, "-numpackedsends",        "USAGE: -numpackedsends <num>\n",     1)
 DEF_VISA_OPTION(vISA_UnifiedSendCycle,  ET_INT32, "-unifiedSendCycle",      "USAGE: -unifiedSendCycle <cycle>\n", 0)
 DEF_VISA_OPTION(vISA_HWThreadNumberPerEU, ET_INT32, "-HWThreadNumberPerEU", "USAGE: -HWThreadNumberPerEU <num>\n",  7)
 DEF_VISA_OPTION(vISA_NoAtomicSend, ET_BOOL, "-noAtomicSend", UNUSED, false)