Improve RA optimization for scalar payload in send

kychendev · igcbot · commit f8be1152bc54 · 2023-07-05T18:21:43.000+02:00
For some send instructions, a scalar payload such as the address operand
in a block load is defined as GRF-aligned in vISA. This adds RA
constraints, which may result in spilling. This patch improves the RA
optimization by relaxing the alignment restriction on the original
variable and replacing local references with a temporary aligned
variable to mitigate the problem.
diff --git a/visa/GraphColor.cpp b/visa/GraphColor.cpp
@@ -10089,6 +10089,12 @@ int GlobalRA::coloringRegAlloc() {
 
           // Re-run GRA loop if changes were made to IR
           rerunGRA |= split.getChangesMade();
+          kernel.dumpToFile("after.Split_Aligned_Scalar." + std::to_string(iterationNo));
+#ifndef DLL_MODE
+          if (stopAfter("Split_Aligned_Scalar")) {
+            return VISA_EARLY_EXIT;
+          }
+#endif // DLL_MODE
         }
 
         // Calculate the spill caused by send to decide if global splitting is
diff --git a/visa/SplitAlignedScalars.cpp b/visa/SplitAlignedScalars.cpp
@@ -75,8 +75,11 @@ std::vector<G4_Declare *> SplitAlignedScalars::gatherCandidates() {
             Data.allowed = false;
           }
 
-          if (dst->getTypeSize() != dstTopDcl->getByteSize()) {
-            // we require that dst opnd writes complete topdcl
+          if ((dst->getTypeSize() != dstTopDcl->getByteSize()) &&
+              !(dst->getTypeSize() == 4 && dstTopDcl->getByteSize() == 8)) {
+            // Disallow case where dst opnd does not write complete topdcl,
+            // except QW opnd which may be A64 address computed separately
+            // with low/hi DW opnds
             Data.allowed = false;
           }
 
@@ -89,10 +92,11 @@ std::vector<G4_Declare *> SplitAlignedScalars::gatherCandidates() {
           auto dstDcl =
               dst->asDstRegRegion()->getBase()->asRegVar()->getDeclare();
           if (dstDcl->getAliasDeclare()) {
-            // disallow case where topdcl is scalar, but alias dcl
-            // is not a scalar as it may be smaller in size. for eg,
-            // topdcl may be :uq and alias may be of type :ud.
-            if (dstDcl->getByteSize() != dstTopDcl->getByteSize())
+            // Disallow case where dst alias dcl size is different from
+            // top dcl, except QW opnd which may be A64 address computed
+            // separately with low/hi DW opnds
+            if ((dstDcl->getByteSize() != dstTopDcl->getByteSize()) &&
+                !(dstDcl->getByteSize() == 4 && dstTopDcl->getByteSize() == 8))
               Data.allowed = false;
           }
 
@@ -102,6 +106,9 @@ std::vector<G4_Declare *> SplitAlignedScalars::gatherCandidates() {
           }
 
           if (!inst->isSend()) {
+            if (inst->getExecSize() != 1) {
+              Data.allowed = false;
+            }
             // check whether dst type size != src type size
             // disallow optimization if dst type is different
             // than src as that entails alignmment requirements
@@ -128,6 +135,10 @@ std::vector<G4_Declare *> SplitAlignedScalars::gatherCandidates() {
               Data.lastUse = lexId;
 
             if (!inst->isSend()) {
+              if (!(src->asSrcRegRegion()->isScalar())) {
+                Data.allowed = false;
+              }
+
               // mixed types have alignment requirements
               if (dst->getTypeSize() != src->getTypeSize())
                 Data.allowed = false;
@@ -212,9 +223,8 @@ void SplitAlignedScalars::pruneCandidates(
   std::for_each(candidates.begin(), candidates.end(),
                 [&](G4_Declare *dcl) { candidateList.push_back(dcl); });
 
-  // First re-order candidates based on spill cost in descending order
+  // First re-order candidates based on spill cost in ascending order
   candidateList.sort(compareSpillCost);
-  std::reverse(candidateList.begin(), candidateList.end());
 
   for (auto it = candidateList.begin(); it != candidateList.end();) {
     auto dcl = *it;
@@ -248,7 +258,9 @@ void SplitAlignedScalars::pruneCandidates(
   candidates.clear();
   unsigned int totalMovsNeeded = 0;
   unsigned int estimatedInstCount = estimateInstCount();
-  for (auto candidate : candidateList) {
+
+  for (auto ci = candidateList.rbegin(); ci != candidateList.rend(); ++ci) {
+    const auto &candidate = *ci;
     bool isCandidate = false;
     auto numMovsNeeded = computeNumMovs(candidate);
 
@@ -274,7 +286,9 @@ void SplitAlignedScalars::pruneCandidates(
 }
 
 bool SplitAlignedScalars::isDclCandidate(G4_Declare *dcl) {
-  if (dcl->getRegFile() == G4_RegFileKind::G4_GRF && dcl->getNumElems() == 1 &&
+  if (dcl->getRegFile() == G4_RegFileKind::G4_GRF &&
+      (dcl->getNumElems() == 1 ||
+       (dcl->getNumElems() == 2 && dcl->getElemType() == Type_D)) &&
       !dcl->getAddressed() && !dcl->getIsPartialDcl() &&
       !dcl->getRegVar()->isPhyRegAssigned() && !dcl->getAliasDeclare() &&
       !dcl->isInput() && !dcl->isOutput() && !dcl->isPayloadLiveOut() &&
@@ -325,9 +339,15 @@ void SplitAlignedScalars::run() {
     return newTopDcl;
   };
 
-  auto getTypeExecSize = [&](G4_Type type) {
-    if (TypeSize(type) < 8)
-      return std::make_tuple(1, type);
+  auto getTypeExecSize = [&](G4_Type type, bool isScalar, G4_ExecSize execSize) {
+    if (TypeSize(type) < 8) {
+      if (execSize == g4::SIMD1 || isScalar)
+        return std::make_tuple(1, type);
+      else {
+        vISA_ASSERT(execSize == g4::SIMD2, "invalid execution size");
+        return std::make_tuple(2, type);
+      }
+    }
 
     if (kernel.fg.builder->noInt64())
       return std::make_tuple(2, Type_UD);
@@ -380,8 +400,10 @@ void SplitAlignedScalars::run() {
 
           // emit copy to store data to original non-aligned scalar
           unsigned int execSize = 1;
+          auto oldExecSize = inst->getExecSize();
           G4_Type typeToUse = Type_UD;
-          std::tie(execSize, typeToUse) = getTypeExecSize(dst->getType());
+          std::tie(execSize, typeToUse) =
+              getTypeExecSize(dst->getType(), false, oldExecSize);
 
           auto src = kernel.fg.builder->createSrc(
               dstRgn->getBase(), dstRgn->getRegOff(), dstRgn->getSubRegOff(),
@@ -437,8 +459,18 @@ void SplitAlignedScalars::run() {
             newAlignedTmpTopDcl->copyAlign(oldTopDcl);
 
             unsigned int execSize = 1;
+            auto oldExecSize = inst->getExecSize();
             G4_Type typeToUse = Type_UD;
-            std::tie(execSize, typeToUse) = getTypeExecSize(srcRgn->getType());
+            G4_Type srcType = srcRgn->getType();
+            bool isScalar = srcRgn->isScalar();
+            if (inst->isSend() && isScalar && oldTopDcl->getNumElems() == 2 &&
+                TypeSize(oldTopDcl->getElemType()) == 4) {
+              // For SIMD1 send with :d type data payload (e.g., d32x2t),
+              // create a move to copy both dwords when replacing source
+              srcType = Type_UQ;
+            }
+            std::tie(execSize, typeToUse) =
+                getTypeExecSize(srcType, isScalar, oldExecSize);
 
             // copy oldDcl in to newAlignedTmpTopDcl
             auto tmpDst = kernel.fg.builder->createDst(
diff --git a/visa/SplitAlignedScalars.h b/visa/SplitAlignedScalars.h
@@ -14,12 +14,13 @@ SPDX-License-Identifier: MIT
 namespace vISA {
 class SplitAlignedScalars {
 private:
-  const unsigned int MinOptDist = 200;
   // Constant trip count assume for each loop to estimate dynamic inst
   // count change due to splitting.
   const unsigned int EstimatedLoopTripCount = 4;
+  // Minimum instruction distance required for splitting
+  unsigned int MinOptDist = 0;
   // Threshold percent increase in estimated dynamic inst count allowed
-  const float BloatAllowed = 1.0f / 100.0f;
+  float BloatAllowed = 0.0f;
 
   unsigned int numDclsReplaced = 0;
   unsigned int numMovsAdded = 0;
@@ -65,6 +66,10 @@ class SplitAlignedScalars {
 public:
   SplitAlignedScalars(GlobalRA &g, GraphColor &c)
       : gra(g), coloring(c), kernel(g.kernel) {
+    MinOptDist =
+        g.kernel.getOptions()->getuInt32Option(vISA_SplitAlignedScalarMinDist);
+    BloatAllowed = g.kernel.getOptions()->getuInt32Option(
+        vISA_SplitAlignedScalarBloatPPT) / 1000.0f;
     for (auto spill : coloring.getSpilledLiveRanges()) {
       spilledDclSet.insert(spill->getDcl());
     }
diff --git a/visa/include/VISAOptionsDefs.h b/visa/include/VISAOptionsDefs.h
@@ -278,6 +278,7 @@ DEF_VISA_OPTION(vISA_AbortOnSpillThreshold, ET_INT32, "-abortOnSpill", UNUSED,
 DEF_VISA_OPTION(vISA_enableBCR, ET_BOOL, "-enableBCR", UNUSED, false)
 DEF_VISA_OPTION(vISA_forceBCR, ET_BOOL, "-forceBCR", UNUSED, false)
 
+
 // clang-format off
 // Enable bundle conflict reduction: put operands of instruction into different GRF bundles.
 // Value: 0 disable, 1 dpas instruction, 2 non-dpas instructions, 3 all instructions
@@ -321,7 +322,12 @@ DEF_VISA_OPTION(vISA_FillConstOpt, ET_BOOL, "-nofillconstopt", UNUSED, true)
 DEF_VISA_OPTION(vISA_GCRRInFF, ET_BOOL, "-GCRRinFF", UNUSED, false)
 DEF_VISA_OPTION(vISA_IncrementalRA, ET_INT32, "-incrementalra",
                 "USAGE: -incrementalra <0|1|2> where 0 is disabled, 1 is enabled, 2 is enabled with verification", 0)
-
+DEF_VISA_OPTION(vISA_SplitAlignedScalarMinDist, ET_INT32,
+                "-splitAlignedScalarMinDist",
+                "dist threshold for controlling when to split aligned scalars in RA", 200)
+DEF_VISA_OPTION(vISA_SplitAlignedScalarBloatPPT, ET_INT32,
+                "-splitAlignedScalarBloatRatio",
+                "instruction increase ppt (parts per thousand) for controlling when to split aligned scalars in RA", 10)
 //=== scheduler options ===
 DEF_VISA_OPTION(vISA_LocalScheduling, ET_BOOL, "-noschedule", UNUSED, true)
 DEF_VISA_OPTION(vISA_preRA_Schedule, ET_BOOL, "-nopresched", UNUSED, true)