n\a

bcheng0127 · igcbot · commit e7d4789cb660 · 2021-07-13T18:51:20.000+02:00
n\a
diff --git a/visa/BuildIR.h b/visa/BuildIR.h
@@ -345,6 +345,9 @@ class IR_Builder
     G4_Declare* builtinScratchSurface = nullptr;
     G4_Declare* scratchSurfaceOffset = nullptr; // if scratch surface is used, this will be initialized once at entry
 
+    // Temp variable for the EU fusion W/A; created lazily by getEUFusionWATmpVar().
+    G4_Declare* euFusionWATmpVar = nullptr;
+
     // Indicates that sampler header cache (builtinSamplerHeader) is correctly
     // initialized with r0 contents.
     // Used only when vISA_cacheSamplerHeader option is set.
@@ -621,6 +624,8 @@ class IR_Builder
 
     G4_Declare* getSpillFillHeader();
 
+    G4_Declare* getEUFusionWATmpVar();
+
     G4_Declare* getOldA0Dot2Temp();
     bool hasValidOldA0Dot2() { return oldA0Dot2Temp; }
 
@@ -772,6 +777,8 @@ class IR_Builder
 
     G4_INST* createPseudoKill(G4_Declare* dcl, PseudoKillType ty);
 
+    G4_INST* createEUWASpill(bool addToInstList);
+
     // numRows is in hword units
     // offset is in hword units
     G4_INST* createSpill(
diff --git a/visa/BuildIRImpl.cpp b/visa/BuildIRImpl.cpp
@@ -710,6 +710,18 @@ G4_Declare* IR_Builder::getSpillFillHeader()
     return spillFillHeader;
 }
 
+// Returns the temp declare reserved for the EU fusion W/A.
+// The declare is created on the first call only (2 elements of Type_UD,
+// Even_Word alignment, named "euFusionWATmp") and is marked live-out, live-in
+// and do-not-spill. Every later call returns the same cached declare.
+G4_Declare* IR_Builder::getEUFusionWATmpVar()
+{
+    if (euFusionWATmpVar != nullptr)
+    {
+        // Already created by an earlier request.
+        return euFusionWATmpVar;
+    }
+
+    euFusionWATmpVar = createTempVar(2, Type_UD, Even_Word, "euFusionWATmp");
+    euFusionWATmpVar->setLiveOut();
+    euFusionWATmpVar->setLiveIn();
+    euFusionWATmpVar->setDoNotSpill();
+    return euFusionWATmpVar;
+}
+
 G4_Declare* IR_Builder::getOldA0Dot2Temp()
 {
     if (!oldA0Dot2Temp)
@@ -959,6 +971,22 @@ G4_INST* IR_Builder::createPseudoKill(G4_Declare* dcl, PseudoKillType ty)
 
 static const unsigned int HWORD_BYTE_SIZE = 32;
 
+
+// Builds the FlagSpill intrinsic used by the EU fusion W/A.
+// The instruction is SIMD2 with InstOpt_NoOpt, and its only non-null operand
+// is a Type_UD source at (0,0) of the W/A temp (see getEUFusionWATmpVar()),
+// using the region returned by getRegionScalar().
+// addToInstList is forwarded unchanged to createIntrinsicInst().
+// NOTE(review): presumably this only acts as a pseudo use that keeps the temp
+// allocated through RA -- confirm against the FlagSpill handling.
+G4_INST* IR_Builder::createEUWASpill(bool addToInstList)
+{
+    const RegionDesc* scalarRegion = getRegionScalar();
+    G4_RegVar* waTmpVar = getEUFusionWATmpVar()->getRegVar();
+    G4_SrcRegRegion* waTmpSrc = createSrc(waTmpVar, 0, 0, scalarRegion, Type_UD);
+
+    return createIntrinsicInst(
+        nullptr, Intrinsic::FlagSpill, g4::SIMD2,
+        nullptr, waTmpSrc, nullptr, nullptr, InstOpt_NoOpt, addToInstList);
+}
+
 G4_INST* IR_Builder::createSpill(
     G4_DstRegRegion* dst, G4_SrcRegRegion* header, G4_SrcRegRegion* payload,
     G4_ExecSize execSize,
@@ -971,6 +999,7 @@ G4_INST* IR_Builder::createSpill(
     spill->asSpillIntrinsic()->setOffset((uint32_t)
         (((uint64_t)offset * HWORD_BYTE_SIZE) / numEltPerGRF<Type_UB>()));
     spill->asSpillIntrinsic()->setNumRows(numRows);
+
     return spill;
 }
 
diff --git a/visa/G4_IR.hpp b/visa/G4_IR.hpp
@@ -477,6 +477,7 @@ typedef struct _SWSBInfo
     bool isPseudoKill() const;
     bool isLifeTimeEnd() const;
     bool isSpillIntrinsic() const;
+    bool isFlagSpillIntrinsic() const;
     G4_SpillIntrinsic* asSpillIntrinsic() const;
     bool isFillIntrinsic() const;
     G4_FillIntrinsic* asFillIntrinsic() const;
@@ -1502,6 +1503,7 @@ enum class Intrinsic
     CallerRestore,
     CalleeSave,
     CalleeRestore,
+    FlagSpill,
     NumIntrinsics
 };
 
@@ -1546,6 +1548,7 @@ static const IntrinsicInfo G4_Intrinsics[(int)Intrinsic::NumIntrinsics] =
     {Intrinsic::CallerRestore,  "caller_restore", 0,    1,      Phase::RA,              { 0, 0, 0, false, false } },
     {Intrinsic::CalleeSave,     "callee_save",  1,      0,      Phase::RA,              { 0, 0, 0, false, false } },
     {Intrinsic::CalleeRestore,  "callee_restore", 0,    1,      Phase::RA,              { 0, 0, 0, false, false } },
+    {Intrinsic::FlagSpill,            "flagSpill",          0,      1,      Phase::RA,       { 0, 0, 0, false, false } },
 };
 
 namespace vISA
diff --git a/visa/GraphColor.cpp b/visa/GraphColor.cpp
@@ -9766,8 +9766,22 @@ int GlobalRA::coloringRegAlloc()
     // declares and code. This currently must be done after flag/addr RA due to
     // the assumption about the location of the pseudo save/restore instructions
     //
+    bool euWADone = false;
     if (hasStackCall)
     {
+        if (builder.hasFusedEUWA() && !euWADone)
+        {
+            G4_INST* euWAInst = builder.createEUWASpill(false);
+            G4_BB* entryBB = (*kernel.fg.begin());
+            INST_LIST_ITER inst_it = entryBB->begin();
+            while ((*inst_it)->isLabel())
+            {
+                inst_it++;
+            }
+            entryBB->insertBefore(inst_it, euWAInst);
+            euWADone = true;
+        }
+
         addCallerSavePseudoCode();
 
         // Only GENX sub-graphs require callee-save code.
@@ -10174,6 +10188,19 @@ int GlobalRA::coloringRegAlloc()
                 bool success = spillGRF.insertSpillFillCode(&kernel, pointsToAnalysis);
                 nextSpillOffset = spillGRF.getNextOffset();
 
+                if (builder.hasFusedEUWA() && !euWADone)
+                {
+                    G4_INST * euWAInst = builder.createEUWASpill(false);
+                    G4_BB* entryBB = (*kernel.fg.begin());
+                    INST_LIST_ITER inst_it = entryBB->begin();
+                    while ((*inst_it)->isLabel())
+                    {
+                        inst_it++;
+                    }
+                    entryBB->insertBefore(inst_it, euWAInst);
+                    euWADone = true;
+                }
+
                 if (builder.hasScratchSurface() && !hasStackCall &&
                     (nextSpillOffset + globalScratchOffset) > SCRATCH_MSG_LIMIT)
                 {
@@ -10250,7 +10277,9 @@ int GlobalRA::coloringRegAlloc()
                     // it modifies IR
                     regChart->dumpRegChart(std::cerr);
                 }
+
                 expandSpillFillIntrinsics(nextSpillOffset);
+
                 if (builder.getOption(vISA_OptReport))
                 {
                     detectUndefinedUses(liveAnalysis, kernel);
diff --git a/visa/HWCaps.inc b/visa/HWCaps.inc
@@ -779,6 +779,12 @@ SPDX-License-Identifier: MIT
         return (getPlatform() == XE_HP);
     }
 
+    // True when the fused-EU NoMask W/A is requested: either of the two low
+    // bits of the vISA_noMaskWA option is set, or vISA_forceNoMaskWA is on.
+    bool hasFusedEUWA() const
+    {
+        const bool noMaskWASelected = (getuint32Option(vISA_noMaskWA) & 0x3) != 0;
+        return noMaskWASelected || getOption(vISA_forceNoMaskWA);
+    }
+
     bool hasFusedEU() const
     {
         return (getPlatform() == GENX_TGLLP || getPlatform() == XE_HP);
diff --git a/visa/Optimizer.cpp b/visa/Optimizer.cpp
@@ -7177,6 +7177,13 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
     // some workaround for HW restrictions.  We apply them here so as not to affect optimizations, RA, and scheduling
     void Optimizer::HWWorkaround()
     {
+        if ((kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM) &&
+            builder.getJitInfo()->spillMemUsed > 0 && builder.hasFusedEUWA())
+        {
+            // For now, do it for CM/VC. Will turn it on for all.
+            doNoMaskWA_postRA();
+        }
+
         // Ensure the first instruction of a stack function has switch option.
         if (fg.getIsStackCallFunc() &&
             VISA_WA_CHECK(builder.getPWaTable(), WaThreadSwitchAfterCall) &&
@@ -10543,6 +10550,7 @@ void Optimizer::removeInstrinsics()
     for (auto bb : kernel.fg)
     {
         bb->removeIntrinsics(Intrinsic::MemFence);
+        bb->removeIntrinsics(Intrinsic::FlagSpill);
     }
 }
 
@@ -12020,6 +12028,185 @@ void Optimizer::doNoMaskWA()
     }
 }
 
+// Apply the NoMask W/A to spill code (runs post-RA).
+//
+//   Consider the scenario where fusedMask should be off, but it is on due to
+//   the HW bug. Instructions with NoMask will run, and all the others will not.
+//
+//   V77 (2GRF) is spilled at offset[4x32]. The following code reads V77 from its
+//   spill location, modifies it, and finally writes the result back to
+//   offset[4x32]. If the code kept the content at this location unchanged, no WA
+//   would be needed; otherwise, we must have the WA.
+//
+//   Here the write at (3) writes whatever is in r4 to offset[4x32]. That value
+//   is undefined, and definitely not guaranteed to be the same as the r1 just
+//   read from the same location. (Note that the mul at (2) will not run because
+//   the channel enable is off [only fusedMask is on].) This shows the code
+//   modifies the content at offset[4x32], which is wrong.
+//
+//   So the WA must be applied. It is enough to apply it on spills (writes) only.
+//
+//   Before RA:
+//     BB1:
+//       mul (M1, 16) V77(0,0)<1> V141(0,0)<0;1,0> V77(0,0)<1;1,0>
+//     BB2:
+//       svm_block_st (4) V154(0,0)<0;1,0> V77.0
+//
+//   After RA
+//     BB1:
+//      (1)  // wr:1h+0, rd:2; hword scratch block read x2
+//           // scratch space fill: FL_GRF_V77_6 from offset[4x32]
+//           (W) send.dc0 (16|M0)  r1  r0  null  0x0  0x022C1004
+//      (2)  mul (16|M0)  r4.0<1>:f  r3.0<0;1,0>:f  r1.0<8;8,1>:f
+//      (3)  // wr:1h+2, rd:0; hword scratch block write x2
+//           //  scratch space spill: SP_GRF_V77_3 from offset[4x32];
+//           (W) send.dc0 (16|M0)  null  r0  r4  0x80  0x020F1004
+//
+// Note this works only for NoMaskWA=2
+//
+// Only BBs whose type has G4_BB_NM_WA_TYPE set are processed. In each such BB,
+// every candidate send (see isCandidate below) gets a predicate on flag areg 0,
+// and the original flag content is saved to / restored from the EU fusion W/A
+// temp (builder.getEUFusionWATmpVar()) around it.
+//
+void Optimizer::doNoMaskWA_postRA()
+{
+    G4_ExecSize simdsize = fg.getKernel()->getSimdSize();
+
+    // A WA candidate is an unpredicated NoMask (write-enable) send whose dst is
+    // absent or the null register.
+    auto isCandidate = [](G4_INST* I) {
+        if (I->isSend() && I->isWriteEnableInst() &&
+            I->getPredicate() == nullptr &&
+            (I->getDst() == nullptr || I->getDst()->isNullReg()))
+        {
+            // This shall be a spill (write).
+            // May check if the spilled var is global. We only need
+            // to do WA for global spill!
+            return true;
+        }
+        return false;
+    };
+
+    // Inserts, before InsertPos, the two instructions that build the WA mask
+    // in flag[flagOff]: clear the flag with NoMask, then set it via a
+    // simdsize-wide r0 == r0 compare that is not NoMask.
+    auto createFlagFromCmp = [&](G4_BB* BB, INST_LIST_ITER& InsertPos,
+        G4_RegVar* flag, unsigned flagOff, G4_Type Ty)
+    {
+        //    I0:               (W) mov (1|M0)  flag:Ty,  0
+        //    flagDefInst:          cmp (simdsize|M0) (eq)flag  r0:uw  r0:uw
+        G4_DstRegRegion* D = builder.createDst(flag, 0, flagOff, 1, Ty);
+        G4_INST* I0 = builder.createMov(g4::SIMD1, D, builder.createImm(0, Ty), InstOpt_WriteEnable, false);
+        BB->insertBefore(InsertPos, I0);
+
+        G4_SrcRegRegion* r0_0 = builder.createSrc(
+            builder.getRealR0()->getRegVar(), 0, 0,
+            builder.getRegionScalar(), Type_UW);
+        G4_SrcRegRegion* r0_1 = builder.createSrc(
+            builder.getRealR0()->getRegVar(), 0, 0,
+            builder.getRegionScalar(), Type_UW);
+        G4_CondMod* flagCM = builder.createCondMod(Mod_e, flag, flagOff);
+        G4_DstRegRegion* nullDst = builder.createNullDst(Type_UW);
+        G4_INST* I1 = builder.createInternalInst(
+            nullptr, G4_cmp, flagCM, g4::NOSAT, simdsize,
+            nullDst, r0_0, r0_1, InstOpt_M0);
+        BB->insertBefore(InsertPos, I1);
+    };
+
+    // Inserts, before InsertPos, a NoMask SIMD1 mov of one Ty element from
+    // Src[Src_off] to Dst[Dst_off].
+    auto createMov1 = [&](G4_BB* BB, INST_LIST_ITER& InsertPos,
+        G4_RegVar* Dst, unsigned Dst_off, G4_RegVar* Src, unsigned Src_off, G4_Type Ty)
+    {
+        G4_DstRegRegion* D = builder.createDst(Dst, 0, Dst_off, 1, Ty);
+        G4_SrcRegRegion* S = builder.createSrc(Src, 0, Src_off, builder.getRegionScalar(), Ty);
+        G4_INST* tI = builder.createMov(g4::SIMD1, D, S, InstOpt_WriteEnable, false);
+        BB->insertBefore(InsertPos, tI);
+    };
+
+    // Assuming all flags are used, thus need to spill one.
+    // RA reserves two DW for this purpose:
+    //    DW0:  <original flag>
+    //    DW1:  <WA flag>        // set once and reuse it in the same BB
+    // For example,  the following spill send needs WA:
+    //    (W) send  (16|M0) ...
+    // Let's say we use f0.0, WA sequence is as follows:
+    //    1.  (W) mov (1|M0)  DW0:uw   f0.0<0;1,0>:uw         // save
+    //    2.  (W) mov (1|M0)  f0.0<1>:uw  0:uw
+    //    3.  cmp (16|M0)   (eq)f0.0   null<1>:uw  r0.0<0;1,0>:uw  r0.0<0;1,0>:uw
+    //    4.  (W) mov (1|M0)  DW1:uw   f0.0<0;1,0>:uw         // WASave
+    //        (W & f0.0.any16h) send (16|M0) ...
+    //    5.  (W) mov (1|M0) f0.0<1>:uw  DW0:uw               // restore
+    // Note that 2,3, and 4 are needed once per BB. They are done for the first WA send.
+    // (4 is emitted only when the BB has more than one WA send.)
+    // If there are more WA sends in the same BB, the WA send after the 1st needs to have
+    //    1.  (W) mov (1|M0)  DW0:uw   f0.0<0;1,0>:uw         // save
+    //    2.  (W) mov (1|M0)  f0.0<1>:uw   DW1:uw             // WARestore
+    //        (W & f0.0.any16h) send (16|M0) ...
+    //    3.  (W) mov (1|M0) f0.0<1>:uw  DW0:uw               // restore
+    //
+    // Todo:  check if save/restore is needed to avoid redundant save/restore.
+    //
+    G4_Declare* saveTmp = builder.getEUFusionWATmpVar(); // 2DW;
+    G4_RegVar* saveVar = saveTmp->getRegVar();
+    G4_Predicate_Control waPredCtrl =
+        (simdsize == 8 ? PRED_ANY8H
+                       : (simdsize == 16 ? PRED_ANY16H : PRED_ANY32H));
+    // NOTE(review): offsets appear to be in units of Ty (UD for simd32, UW
+    // otherwise), so both values of waSaveOff would select DW1 -- confirm.
+    unsigned saveOff = 0, waSaveOff = (simdsize == 32 ? 1 : 2);
+
+    for (G4_BB* BB : fg)
+    {
+        if ((BB->getBBType() & G4_BB_NM_WA_TYPE) == 0)
+        {
+            continue;
+        }
+
+        // per-BB insts that need NoMaskWA (aka WA inst)
+        std::vector<INST_LIST_ITER> WAInsts;
+
+        // Collect all WA candidates in this BB. No search for a free flag
+        // register is done here; flag areg 0 is always used with save/restore.
+        for (auto II = BB->begin(), IE = BB->end(); II != IE; ++II)
+        {
+            G4_INST* I = *II;
+            if (!isCandidate(I))
+            {
+                continue;
+            }
+            WAInsts.push_back(II);
+        }
+
+        if (WAInsts.empty())
+        {
+            continue;
+        }
+
+        // Without optimization, always do save/restore
+        bool needSave = true;
+        bool needRestore = true;
+        G4_Type Ty = (simdsize > 16) ? Type_UD : Type_UW;
+        G4_Declare* flagDcl = builder.createTempFlag((Ty == Type_UW ? 1 : 2), "waflag");
+        G4_RegVar* flagVar = flagDcl->getRegVar();
+        // Pin the WA flag to flag areg 0, sub-register 0.
+        flagVar->setPhyReg(builder.phyregpool.getFlagAreg(0), 0);
+
+        // Save flag, create WA mask, save WAflag
+        createMov1(BB, WAInsts[0], saveVar, saveOff, flagVar, 0, Ty);  // save
+        createFlagFromCmp(BB, WAInsts[0], flagVar, 0, Ty);
+        if (WAInsts.size() > 1) {
+            createMov1(BB, WAInsts[0], saveVar, waSaveOff, flagVar, 0, Ty); // WASave
+        }
+
+        for (int i = 0, sz = (int)WAInsts.size(); i < sz; ++i)
+        {
+            auto& currII = WAInsts[i];
+
+            // The first WA send already had its save and mask set up above.
+            if (i > 0 && needSave) {
+                createMov1(BB, currII, saveVar, saveOff, flagVar, 0, Ty);    // save
+                createMov1(BB, currII, flagVar, 0, saveVar, waSaveOff, Ty);  // WARestore
+            }
+
+            G4_INST* I = *currII;
+            G4_Predicate* newPred = builder.createPredicate(
+                PredState_Plus, flagVar, 0, waPredCtrl);
+            I->setPredicate(newPred);
+
+            if (i == (sz - 1) || needRestore) {
+                // Put the original flag value back right after the WA send.
+                auto nextII = currII;
+                ++nextII;
+                createMov1(BB, nextII, flagVar, 0, saveVar, saveOff, Ty);   // restore
+            }
+        }
+    }
+}
+
 // Convert vISA MULH dst:d src0:d src1:d into
 //    mul acc0.0<1>:d src0:d src1:w
 //    mach dst:d src0:d src1:d
diff --git a/visa/Optimizer.h b/visa/Optimizer.h
@@ -240,6 +240,7 @@ class Optimizer
     void setA0toTdrForSendc();
     void replaceRetWithJmpi();
     void doNoMaskWA();
+    void doNoMaskWA_postRA();
     void insertFenceAtEntry();
     void expandMulPostSchedule();
     void expandMadwPostSchedule();

Original file line number	Diff line number	Diff line change
`@@ -779,6 +779,12 @@ SPDX-License-Identifier: MIT`
`779`	`779`	`return (getPlatform() == XE_HP);`
`780`	`780`	`}`
`781`	`781`
	`782`	`+ bool hasFusedEUWA() const`
	`783`	`+ {`
	`784`	`+ return ((getuint32Option(vISA_noMaskWA) & 0x3) > 0 \|\|`
	`785`	`+ getOption(vISA_forceNoMaskWA));`
	`786`	`+ }`
	`787`	`+`
`782`	`788`	`bool hasFusedEU() const`
`783`	`789`	`{`
`784`	`790`	`return (getPlatform() == GENX_TGLLP \|\| getPlatform() == XE_HP);`