@@ -7218,7 +7218,8 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
7218
7218
void Optimizer::HWWorkaround ()
7219
7219
{
7220
7220
if ((kernel.getInt32KernelAttr (Attributes::ATTR_Target) == VISA_CM) &&
7221
- builder.getJitInfo ()->spillMemUsed > 0 && builder.hasFusedEUWA ())
7221
+ builder.hasFusedEUWA () &&
7222
+ (builder.getJitInfo ()->spillMemUsed > 0 || builder.getJitInfo ()->numFlagSpillStore > 0 ))
7222
7223
{
7223
7224
// For now, do it for CM/VC. Will turn it on for all.
7224
7225
doNoMaskWA_postRA ();
@@ -11917,6 +11918,10 @@ void Optimizer::doNoMaskWA()
11917
11918
for (auto II = BB->begin (), IE = BB->end (); II != IE; ++II)
11918
11919
{
11919
11920
G4_INST* I = *II;
11921
+
11922
+ // Mark all instruction as created by preRA to avoid re-processing postRA
11923
+ I->setCreatedPreRA (true );
11924
+
11920
11925
if (!isCandidateInst (I, fg))
11921
11926
{
11922
11927
continue ;
@@ -12103,21 +12108,48 @@ void Optimizer::doNoMaskWA()
12103
12108
// // scratch space spill: SP_GRF_V77_3 from offset[4x32];
12104
12109
// (W) send.dc0 (16|M0) null r0 r4 0x80 0x020F1004
12105
12110
//
12111
+ // For flag spill:
12112
+ // Need WA as well due to the following case:
12113
+ //
12114
+ // After RA:
12115
+ // BB_19:
12116
+ // (W) mov (1|M0) r34.8<1>:uw f0.1<0;1,0>:uw
12117
+ // ...
12118
+ // BB_21:
12119
+ // (W) mov (1|M0) f1.1<1>:uw r34.8<0;1,0>:uw
12120
+ //
12121
+ // If BB_19 should be skipped but runs due to this HW bug, r34.8 will be updated
12122
+ // with f0.1, which holds an undefined value. Then at BB_21, reading from r34.8 will
12123
+ // return a garbage value!
12124
+ //
12106
12125
// Note this works only for NoMaskWA=2
12107
12126
//
12108
12127
void Optimizer::doNoMaskWA_postRA ()
12109
12128
{
12110
12129
std::vector<INST_LIST_ITER> NoMaskCandidates;
12111
12130
G4_ExecSize simdsize = fg.getKernel ()->getSimdSize ();
12131
+ const bool HasFlagSpill = (builder.getJitInfo ()->numFlagSpillStore > 0 );
12132
+
12133
+ auto isCandidate = [&](G4_INST* I) {
12134
+ if (I->getCreatedPreRA () || !I->isWriteEnableInst ())
12135
+ {
12136
+ return false ;
12137
+ }
12112
12138
12113
- auto isCandidate = [](G4_INST* I) {
12114
- if (I->isSend () && I->isWriteEnableInst () &&
12115
- I->getPredicate () == nullptr &&
12139
+ // If it is global flag spill or global grf spill, need to do WA.
12140
+ // For now, global checking is not available
12141
+
12142
+ // 1. flag spill
12143
+ if (HasFlagSpill &&
12144
+ I->isMov () && I->getSrc (0 ) && I->getSrc (0 )->isFlag () &&
12145
+ I->getExecSize () == g4::SIMD1 && I->getPredicate () == nullptr )
12146
+ {
12147
+ return true ;
12148
+ }
12149
+ // 2. GRF spill
12150
+ if (I->isSend () && I->getPredicate () == nullptr &&
12116
12151
(I->getDst () == nullptr || I->getDst ()->isNullReg ()))
12117
12152
{
12118
- // This shall be a spill (write).
12119
- // May check if the spilled var is global. We only need
12120
- // to do WA for global spill!
12121
12153
return true ;
12122
12154
}
12123
12155
return false ;
@@ -12175,6 +12207,16 @@ void Optimizer::doNoMaskWA_postRA()
12175
12207
// (W & f0.0.any16h) send (16|M0) ...
12176
12208
// 3. (W) mov (1|M0) f0.0<1>:uw DW0:uw // restore
12177
12209
//
12210
+ // For flag spill, the sequence is the same as the above except for the case in which
12211
+ // the WAFlag is the same as spilled flag. For example,
12212
+ //
12213
+ // (W) mov (1|M0) r34.8<1>:uw f0.0<0;1,0>:uw
12214
+ //
12215
+ // 1. (W) mov (1|M0) DW0:uw f0.0<0;1,0>:uw // save
12216
+ // 2. (W) mov (1|M0) f0.0<1>:uw DW1:uw // WARestore
12217
+ // (W & f0.0.any16h) mov r34.8<1>:uw DW0.0<0;1,0>:uw
12218
+ // 3. (W) mov (1|M0) f0.0<1>:uw DW0:uw // restore
12219
+ //
12178
12220
// Todo: check if save/restore is needed to avoid redundant save/restore.
12179
12221
//
12180
12222
G4_Declare* saveTmp = builder.getEUFusionWATmpVar (); // 2DW;
@@ -12215,10 +12257,15 @@ void Optimizer::doNoMaskWA_postRA()
12215
12257
// Without optimization, always do save/restore
12216
12258
bool needSave = true ;
12217
12259
bool needRestore = true ;
12260
+
12261
+ // wa flag register to use f(wafregnum, wafsregnum)
12262
+ uint32_t wafregnum = 0 ;
12263
+ uint32_t wafsregnum = 0 ;
12264
+
12218
12265
G4_Type Ty = (simdsize > 16 ) ? Type_UD : Type_UW;
12219
12266
G4_Declare* flagDcl = builder.createTempFlag ((Ty == Type_UW ? 1 : 2 ), " waflag" );
12220
12267
G4_RegVar* flagVar = flagDcl->getRegVar ();
12221
- flagVar->setPhyReg (builder.phyregpool .getFlagAreg (0 ), 0 );
12268
+ flagVar->setPhyReg (builder.phyregpool .getFlagAreg (wafregnum ), wafsregnum );
12222
12269
12223
12270
// Save flag, create WA mask, save WAflag
12224
12271
createMov1 (BB, WAInsts[0 ], saveVar, saveOff, flagVar, 0 , Ty); // save
@@ -12239,6 +12286,24 @@ void Optimizer::doNoMaskWA_postRA()
12239
12286
G4_INST* I = *currII;
12240
12287
G4_Predicate* newPred = builder.createPredicate (
12241
12288
PredState_Plus, flagVar, 0 , waPredCtrl);
12289
+ if (I->isMov () && I->getSrc (0 ) && I->getSrc (0 )->isFlag ())
12290
+ {
12291
+ G4_SrcRegRegion* srcReg = I->getSrc (0 )->asSrcRegRegion ();
12292
+ G4_RegVar* baseVar = static_cast <G4_RegVar*>(srcReg->getBase ());
12293
+ assert (baseVar->isPhyRegAssigned ());
12294
+
12295
+ // For flag, G4_Areg has flag number and G4_RegVar has subRefOff.
12296
+ // (SrcRegRegion's refOff/subRefOff is 0/0 always.)
12297
+ G4_Areg* flagReg = baseVar->getPhyReg ()->getAreg ();
12298
+ uint32_t subRegOff = baseVar->getPhyRegOff ();
12299
+ if (flagReg->getFlagNum () == wafregnum &&
12300
+ (Ty == Type_UD /* 32bit flag */ || subRegOff == wafsregnum /* 16bit flag */ ))
12301
+ {
12302
+ G4_SrcRegRegion* S = builder.createSrc (
12303
+ saveVar, 0 , saveOff, builder.getRegionScalar (), Ty);
12304
+ I->setSrc (S, 0 );
12305
+ }
12306
+ }
12242
12307
I->setPredicate (newPred);
12243
12308
12244
12309
if (i == (sz - 1 ) || needRestore) {
0 commit comments