@@ -7177,6 +7177,13 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
7177
7177
// some workaround for HW restrictions. We apply them here so as not to affect optimizations, RA, and scheduling
7178
7178
void Optimizer::HWWorkaround ()
7179
7179
{
7180
+ if ((kernel.getInt32KernelAttr (Attributes::ATTR_Target) == VISA_CM) &&
7181
+ builder.getJitInfo ()->spillMemUsed > 0 && builder.hasFusedEUWA ())
7182
+ {
7183
+ // For now, do it for CM/VC. Will turn it on for all.
7184
+ doNoMaskWA_postRA ();
7185
+ }
7186
+
7180
7187
// Ensure the first instruction of a stack function has switch option.
7181
7188
if (fg.getIsStackCallFunc () &&
7182
7189
VISA_WA_CHECK (builder.getPWaTable (), WaThreadSwitchAfterCall) &&
@@ -10543,6 +10550,7 @@ void Optimizer::removeInstrinsics()
10543
10550
for (auto bb : kernel.fg )
10544
10551
{
10545
10552
bb->removeIntrinsics (Intrinsic::MemFence);
10553
+ bb->removeIntrinsics (Intrinsic::FlagSpill);
10546
10554
}
10547
10555
}
10548
10556
@@ -12020,6 +12028,185 @@ void Optimizer::doNoMaskWA()
12020
12028
}
12021
12029
}
12022
12030
12031
+ // Need to apply NoMaskWA on spill. For example,
12032
+ // consider the scenario in which fusedMask should be off, but it is on due to the HW bug.
12033
+ // Instructions with NoMask will run, and all the others will not.
12034
+ //
12035
+ // V77 (2GRF) spills at offset[4x32]. The following code reads V77 from spill
12036
+ // location, modifies it, and finally writes the result back into offset[4x32].
12037
+ // If the code can keep the content at this location unchanged, no WA is needed;
12038
+ // otherwise, we must have WA.
12039
+ //
12040
+ // But write at (3) will write whatever in r4 into offset[4x32], which is undefined,
12041
+ // definitely not guaranteed to be the same as r1 just read from the same location.
12042
+ // (Note that mul at (2) will not run because the channel enable is off [only fusedMask
12043
+ // is on].) This shows the code modifies the content at offset[4x32], which is wrong.
12044
+ //
12045
+ // With this, the WA must be applied. It is enough to apply on spill (write) only.
12046
+ //
12047
+ // Before RA:
12048
+ // BB1:
12049
+ // mul (M1, 16) V77(0,0)<1> V141(0,0)<0;1,0> V77(0,0)<1;1,0>
12050
+ // BB2:
12051
+ // svm_block_st (4) V154(0,0)<0;1,0> V77.0
12052
+ //
12053
+ // After RA
12054
+ // BB1:
12055
+ // (1) // wr:1h+0, rd:2; hword scratch block read x2
12056
+ // // scratch space fill: FL_GRF_V77_6 from offset[4x32]
12057
+ // (W) send.dc0 (16|M0) r1 r0 null 0x0 0x022C1004
12058
+ // (2) mul (16|M0) r4.0<1>:f r3.0<0;1,0>:f r1.0<8;8,1>:f
12059
+ // (3) // wr:1h+2, rd:0; hword scratch block write x2
12060
+ // // scratch space spill: SP_GRF_V77_3 from offset[4x32];
12061
+ // (W) send.dc0 (16|M0) null r0 r4 0x80 0x020F1004
12062
+ //
12063
+ // Note this works only for NoMaskWA=2
12064
+ //
12065
+ void Optimizer::doNoMaskWA_postRA ()
12066
+ {
12067
+ std::vector<INST_LIST_ITER> NoMaskCandidates;
12068
+ G4_ExecSize simdsize = fg.getKernel ()->getSimdSize ();
12069
+
12070
+ auto isCandidate = [](G4_INST* I) {
12071
+ if (I->isSend () && I->isWriteEnableInst () &&
12072
+ I->getPredicate () == nullptr &&
12073
+ (I->getDst () == nullptr || I->getDst ()->isNullReg ()))
12074
+ {
12075
+ // This shall be a spill (write).
12076
+ // May check if the spilled var is global. We only need
12077
+ // to do WA for global spill!
12078
+ return true ;
12079
+ }
12080
+ return false ;
12081
+ };
12082
+
12083
+ auto createFlagFromCmp = [&](G4_BB* BB, INST_LIST_ITER& InsertPos,
12084
+ G4_RegVar* flag, unsigned flagOff, G4_Type Ty)
12085
+ {
12086
+ // I0: (W) mov (1|M0) flag:Ty, 0
12087
+ // flagDefInst: cmp (simdsize|M0) (eq)flag r0:uw r0:uw
12088
+ G4_DstRegRegion* D = builder.createDst (flag, 0 , flagOff, 1 , Ty);
12089
+ G4_INST* I0 = builder.createMov (g4::SIMD1, D, builder.createImm (0 , Ty), InstOpt_WriteEnable, false );
12090
+ BB->insertBefore (InsertPos, I0);
12091
+
12092
+ G4_SrcRegRegion* r0_0 = builder.createSrc (
12093
+ builder.getRealR0 ()->getRegVar (), 0 , 0 ,
12094
+ builder.getRegionScalar (), Type_UW);
12095
+ G4_SrcRegRegion* r0_1 = builder.createSrc (
12096
+ builder.getRealR0 ()->getRegVar (), 0 , 0 ,
12097
+ builder.getRegionScalar (), Type_UW);
12098
+ G4_CondMod* flagCM = builder.createCondMod (Mod_e, flag, flagOff);
12099
+ G4_DstRegRegion* nullDst = builder.createNullDst (Type_UW);
12100
+ G4_INST* I1 = builder.createInternalInst (
12101
+ NULL , G4_cmp, flagCM, g4::NOSAT, simdsize,
12102
+ nullDst, r0_0, r0_1, InstOpt_M0);
12103
+ BB->insertBefore (InsertPos, I1);
12104
+ };
12105
+
12106
+ auto createMov1 = [&](G4_BB* BB, INST_LIST_ITER& InsertPos,
12107
+ G4_RegVar* Dst, unsigned Dst_off, G4_RegVar* Src, unsigned Src_off, G4_Type Ty)
12108
+ {
12109
+ G4_DstRegRegion* D = builder.createDst (Dst, 0 , Dst_off, 1 , Ty);
12110
+ G4_SrcRegRegion* S = builder.createSrc (Src, 0 , Src_off, builder.getRegionScalar (), Ty);
12111
+ G4_INST* tI = builder.createMov (g4::SIMD1, D, S, InstOpt_WriteEnable, false );
12112
+ BB->insertBefore (InsertPos, tI);
12113
+ };
12114
+
12115
+ // Assuming all flags are used, thus need to spill one.
12116
+ // RA reserves two DW for this purpose:
12117
+ // DW0: <original flag>
12118
+ // DW1: <WA flag> // set once and reuse it in the same BB
12119
+ // For example, the following spill send needs WA:
12120
+ // (W) send (16|M0) ...
12121
+ // Let's say we use f0.0, WA sequence is as follows:
12122
+ // 1. (W) mov (1|M0) DW0:uw f0.0<0;1,0>:uw // save
12123
+ // 2. (W) mov (1|M0) f0.0<1>:uw 0:uw
12124
+ // 3. cmp (16|M0) (eq)f0.0 null<1>:uw r0.0<0;1,0>:uw r0.0<0;1,0>:uw
12125
+ // 4. (W) mov (1|M0) DW1:uw f0.0<0;1,0>:uw // WASave
12126
+ // (W & f0.0.any16h) send (16|M0) ...
12127
+ // 5. (W) mov (1|M0) f0.0<1>:uw DW0:uw // restore
12128
+ // Note that 2,3, and 4 are needed once per BB. They are done for the first WA send.
12129
+ // If there are more WA sends in the same BB, the WA send after the 1st needs to have
12130
+ // 1. (W) mov (1|M0) DW0:uw f0.0<0;1,0>:uw // save
12131
+ // 2. (W) mov (1|M0) f0.0<1>:uw DW1:uw // WARestore
12132
+ // (W & f0.0.any16h) send (16|M0) ...
12133
+ // 3. (W) mov (1|M0) f0.0<1>:uw DW0:uw // restore
12134
+ //
12135
+ // Todo: check if save/restore is needed to avoid redundant save/restore.
12136
+ //
12137
+ G4_Declare* saveTmp = builder.getEUFusionWATmpVar (); // 2DW;
12138
+ G4_RegVar* saveVar = saveTmp->getRegVar ();
12139
+ G4_Predicate_Control waPredCtrl =
12140
+ (simdsize == 8 ? PRED_ANY8H
12141
+ : (simdsize == 16 ? PRED_ANY16H : PRED_ANY32H));
12142
+ unsigned saveOff = 0 , waSaveOff = (simdsize == 32 ? 1 : 2 );
12143
+
12144
+ for (auto BI : fg)
12145
+ {
12146
+ G4_BB* BB = BI;
12147
+ if ((BB->getBBType () & G4_BB_NM_WA_TYPE) == 0 )
12148
+ {
12149
+ continue ;
12150
+ }
12151
+
12152
+ // per-BB insts that need NoMaskWA (aka WA inst)
12153
+ std::vector<INST_LIST_ITER> WAInsts;
12154
+
12155
+ // First collect all candidates and also check if there is
12156
+ // any free flag registers
12157
+ for (auto II = BB->begin (), IE = BB->end (); II != IE; ++II)
12158
+ {
12159
+ G4_INST* I = *II;
12160
+ if (!isCandidate (I))
12161
+ {
12162
+ continue ;
12163
+ }
12164
+ WAInsts.push_back (II);
12165
+ }
12166
+
12167
+ if (WAInsts.empty ())
12168
+ {
12169
+ continue ;
12170
+ }
12171
+
12172
+ // Without optimization, always do save/restore
12173
+ bool needSave = true ;
12174
+ bool needRestore = true ;
12175
+ G4_Type Ty = (simdsize > 16 ) ? Type_UD : Type_UW;
12176
+ G4_Declare* flagDcl = builder.createTempFlag ((Ty == Type_UW ? 1 : 2 ), " waflag" );
12177
+ G4_RegVar* flagVar = flagDcl->getRegVar ();
12178
+ flagVar->setPhyReg (builder.phyregpool .getFlagAreg (0 ), 0 );
12179
+
12180
+ // Save flag, create WA mask, save WAflag
12181
+ createMov1 (BB, WAInsts[0 ], saveVar, saveOff, flagVar, 0 , Ty); // save
12182
+ createFlagFromCmp (BB, WAInsts[0 ], flagVar, 0 , Ty);
12183
+ if (WAInsts.size () > 1 ) {
12184
+ createMov1 (BB, WAInsts[0 ], saveVar, waSaveOff, flagVar, 0 , Ty); // WASave
12185
+ }
12186
+
12187
+ for (int i = 0 , sz = (int )WAInsts.size (); i < sz; ++i)
12188
+ {
12189
+ auto & currII = WAInsts[i];
12190
+
12191
+ if (i > 0 && needSave) {
12192
+ createMov1 (BB, currII, saveVar, saveOff, flagVar, 0 , Ty); // save
12193
+ createMov1 (BB, currII, flagVar, 0 , saveVar, waSaveOff, Ty); // WARestore
12194
+ }
12195
+
12196
+ G4_INST* I = *currII;
12197
+ G4_Predicate* newPred = builder.createPredicate (
12198
+ PredState_Plus, flagVar, 0 , waPredCtrl);
12199
+ I->setPredicate (newPred);
12200
+
12201
+ if (i == (sz - 1 ) || needRestore) {
12202
+ auto nextII = currII;
12203
+ ++nextII;
12204
+ createMov1 (BB, nextII, flagVar, 0 , saveVar, saveOff, Ty); // restore
12205
+ }
12206
+ }
12207
+ }
12208
+ }
12209
+
12023
12210
// Convert vISA MULH dst:d src0:d src1:d into
12024
12211
// mul acc0.0<1>:d src0:d src1:w
12025
12212
// mach dst:d src0:d src1:d
0 commit comments