Add heuristic to support issues of multiple different pipelines

bcheng0127 · igcbot · commit f5c353771ec6 · 2024-06-11T21:01:19.000+02:00
Occupancy used to be the step for instruction issue. However, with
multiple pipelines, the issue step can be 1 cycle if adjacent
instructions are going to different pipelines.
diff --git a/visa/G4_IR.cpp b/visa/G4_IR.cpp
@@ -854,7 +854,12 @@ bool G4_INST::hasNoPipe() const {
   if (op == G4_wait || op == G4_halt || op == G4_nop) {
     return true;
   }
-
+  if (op == G4_label) {
+    return true;
+  }
+  if (op == G4_intrinsic) {
+    return true;
+  }
   if (op == G4_sync_fence || op == G4_sync_nop || op == G4_sync_allrd ||
       op == G4_sync_allwr) {
     return true;
diff --git a/visa/G4_IR.hpp b/visa/G4_IR.hpp
@@ -92,6 +92,7 @@ enum SB_INST_PIPE {
   PIPE_MATH = 4,
   PIPE_DPAS = 6,
   PIPE_SEND = 7,
+  PIPE_ALL = 8
 };
 // forward declaration for the binary of an instruction
 class BinInst;
diff --git a/visa/LocalScheduler/LocalScheduler_G4IR.cpp b/visa/LocalScheduler/LocalScheduler_G4IR.cpp
@@ -1309,7 +1309,9 @@ DDD::DDD(G4_BB *bb, const LatencyTable &lt, G4_Kernel *k, PointsToAnalysis &p)
   TOTAL_BUCKETS = OTHER_ARF_BUCKET + 1;
 
   LiveBuckets LB(this, GRF_BUCKET, TOTAL_BUCKETS);
-
+  for (int i = 0; i < PIPE_ALL; i++) {
+    latestInstOfEachPipe[i] = nullptr;
+  }
   // Building the graph in reverse relative to the original instruction
   // order, to naturally take care of the liveness of operands.
   std::list<G4_INST *>::reverse_iterator iInst(bb->rbegin()),
@@ -2497,6 +2499,34 @@ uint32_t DDD::listSchedule(G4_BB_Schedule *schedule) {
     }
   };
 
+  auto getStepCycle = [&](Node *n, uint32_t currCycle) -> uint32_t {
+    uint32_t stepCycle = 1; // Smallest step ever returned: one cycle.
+    for (unsigned i = PIPE_INT; i < PIPE_ALL; i++) {
+      if (latestInstOfEachPipe[i] == nullptr) {
+        continue;
+      }
+      // Pipe i's latest node has finished its occupancy by currCycle: drop it.
+      if ((latestInstOfEachPipe[i]->schedTime +
+           latestInstOfEachPipe[i]->getOccupancy()) <= currCycle) {
+        latestInstOfEachPipe[i] = nullptr;
+        continue;
+      }
+      // Same pipe as n: step to the end of that occupancy; else to schedTime + 1.
+      if (i == n->instPipe) {
+        stepCycle = std::max(
+            stepCycle, latestInstOfEachPipe[i]->schedTime +
+                           latestInstOfEachPipe[i]->getOccupancy() - currCycle);
+      } else {
+        if (latestInstOfEachPipe[i]->schedTime + 1 > currCycle) {
+          stepCycle = std::max(stepCycle, latestInstOfEachPipe[i]->schedTime +
+                                              1 - currCycle);
+        }
+      }
+    }
+
+    return stepCycle;
+  };
+
   auto updateForScheduled = [&](Node *scheduled) {
     // Append the scheduled node to the end of the schedule.
     schedule->scheduledNodes.push_back(scheduled);
@@ -2519,9 +2549,15 @@ uint32_t DDD::listSchedule(G4_BB_Schedule *schedule) {
 
 
     updateForSucc(scheduled);
+    if (getOptions()->getOption(vISA_multiplePipeSched)) {
+      // Increment the scheduler's clock after each scheduled node
+      currCycle += getStepCycle(scheduled, currCycle);
 
-    // Increment the scheduler's clock after each scheduled node
-    currCycle += scheduled->getOccupancy();
+      latestInstOfEachPipe[scheduled->instPipe] = scheduled;
+    } else {
+      // Increment the scheduler's clock after each scheduled node
+      currCycle += scheduled->getOccupancy();
+    }
   };
 
   auto scheduleForSuppression = [&]() -> bool {
@@ -2596,6 +2632,32 @@ uint32_t DDD::listSchedule(G4_BB_Schedule *schedule) {
     return scheduled;
   };
 
+  auto getHigestOccupancyStallPipe = [&](uint32_t currCycle) -> SB_INST_PIPE {
+    SB_INST_PIPE pipe = PIPE_NONE; // Returned when no pipe is still occupied.
+    uint32_t mostOccupancyStallCycle = 0;
+    for (unsigned i = PIPE_INT; i < PIPE_ALL; i++) {
+      if (latestInstOfEachPipe[i] == nullptr) {
+        continue;
+      }
+      // Pipe i's latest node has finished its occupancy by currCycle: drop it.
+      if ((latestInstOfEachPipe[i]->schedTime +
+           latestInstOfEachPipe[i]->getOccupancy()) <= currCycle) {
+        latestInstOfEachPipe[i] = nullptr;
+        continue;
+      }
+      // Keep the pipe whose occupancy (schedTime + occupancy) ends the latest.
+      if (latestInstOfEachPipe[i]->schedTime +
+              latestInstOfEachPipe[i]->getOccupancy() >
+          mostOccupancyStallCycle) {
+        pipe = (SB_INST_PIPE)i;
+        mostOccupancyStallCycle = latestInstOfEachPipe[i]->schedTime +
+                                  latestInstOfEachPipe[i]->getOccupancy();
+      }
+    }
+
+    return pipe;
+  };
+
   // Try to avoid b2b math if possible as there are pipeline stalls.
   auto scheduleForB2BMathReduction = [&](Node *scheduled) -> bool {
     return !readyList.empty() && lastScheduled &&
@@ -2604,7 +2666,7 @@ uint32_t DDD::listSchedule(G4_BB_Schedule *schedule) {
   };
 
   auto applyB2BMathReductionHeuristic = [&](Node *scheduled,
-      Node *lastScheduled) -> Node * {
+                                            Node *lastScheduled) -> Node * {
     // pick another node on the ready list if it's not math and won't cause
     // a longer stall to save compile time we currently limit search size to 2
     std::vector<Node *> popped;
@@ -2629,6 +2691,28 @@ uint32_t DDD::listSchedule(G4_BB_Schedule *schedule) {
     return scheduled;
   };
 
+  auto applyMultiplePipelineHeuristic = [&](Node *scheduled, SB_INST_PIPE stallPipe) -> Node * {
+    // Swap scheduled for the first popped ready node not on stallPipe.
+    // NOTE(review): readyList.size() shrinks per pop, so the scan is partial.
+    std::vector<Node *> popped;
+    for (size_t i = 0; i < readyList.size(); ++i) {
+      Node *next = readyList.top();
+      readyList.pop();
+      if (next->instPipe != stallPipe) {
+        readyList.push(scheduled);
+        scheduled = next;
+        break;
+      } else {
+        // keep searching
+        popped.push_back(next);
+      }
+    }
+    for (auto nodes : popped) {
+      readyList.push(nodes);
+    }
+    return scheduled;
+  };
+
   // Avoid WAW subreg hazard by skipping nodes that cause a WAW subreg
   // hazard with the lastScheduled instruction.
   auto scheduleForWAWSubregHazardReduction = [&]() -> bool {
@@ -2829,9 +2913,17 @@ uint32_t DDD::listSchedule(G4_BB_Schedule *schedule) {
       heuCandidate =
           applyBankConflictReductionHeuristic(scheduled, lastScheduled);
     }
-    if (!heuCandidate && scheduleForB2BMathReduction(scheduled)) {
-      heuCandidate =
-          applyB2BMathReductionHeuristic(scheduled, lastScheduled);
+    if (!heuCandidate) {
+      if (getOptions()->getOption(vISA_multiplePipeSched)) {
+        auto occupancyPipe = getHigestOccupancyStallPipe(currCycle);
+        if (occupancyPipe != PIPE_NONE &&
+            occupancyPipe == scheduled->instPipe) {
+          heuCandidate =
+              applyMultiplePipelineHeuristic(scheduled, occupancyPipe);
+        }
+      } else if (scheduleForB2BMathReduction(scheduled)){
+        heuCandidate = applyB2BMathReductionHeuristic(scheduled, lastScheduled);
+      }
     }
     if (!heuCandidate && scheduleForWAWSubregHazardReduction()) {
       heuCandidate =
@@ -2958,6 +3050,10 @@ Node::Node(uint32_t id, G4_INST *inst, Edge_Allocator &depEdgeAllocator,
   priority = occupancy;
 
   barrier = CheckBarrier(inst);
+
+  if (!inst->isLabel()) {
+    instPipe = inst->getInstructionPipeXe();
+  }
 }
 
 void LocalScheduler::EmitNode(Node *node) {
diff --git a/visa/LocalScheduler/LocalScheduler_G4IR.h b/visa/LocalScheduler/LocalScheduler_G4IR.h
@@ -104,6 +104,7 @@ class Node {
   unsigned getNodeID() const { return nodeID; };
 
   uint32_t schedTime = 0;
+  SB_INST_PIPE instPipe = PIPE_NONE;
 
   // Number of predecessor nodes not scheduled
   uint16_t predsNotScheduled = 0;
@@ -322,6 +323,7 @@ class DDD {
   typedef std::pair<Node *, Node *> instrPair_t;
   typedef std::vector<instrPair_t> instrPairVec_t;
   NODE_LIST Roots;
+  Node *latestInstOfEachPipe[PIPE_ALL];
   NodeAlloc NodeAllocator;
   void moveDeps(Node *fromNode, Node *toNode);
   void pairTypedWriteOrURBWriteNodes(G4_BB *bb);
diff --git a/visa/include/VISAOptionsDefs.h b/visa/include/VISAOptionsDefs.h
@@ -778,3 +778,4 @@ DEF_VISA_OPTION(
     "HW thread scheduling policy: 0: single thread first; 1: round robin;", 0)
 DEF_VISA_OPTION(vISA_SendQueueEntries, ET_INT32, "-sendQueueEntries", UNUSED, 0)
 DEF_VISA_OPTION(vISA_SendQueueSched, ET_BOOL, "-sendQueueSched", UNUSED, false)
+DEF_VISA_OPTION(vISA_multiplePipeSched, ET_BOOL, "-multiplePipeSched", UNUSED, false)