Skip to content

Commit e7d4789

Browse files
bcheng0127igcbot
authored andcommitted
n\a
n\a
1 parent 1364bee commit e7d4789

File tree

7 files changed

+262
-0
lines changed

7 files changed

+262
-0
lines changed

visa/BuildIR.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,9 @@ class IR_Builder
345345
G4_Declare* builtinScratchSurface = nullptr;
346346
G4_Declare* scratchSurfaceOffset = nullptr; // if scratch surface is used, this will be initialized once at entry
347347

348+
//The temp var for eu fusion W/A
349+
G4_Declare* euFusionWATmpVar = nullptr;
350+
348351
// Indicates that sampler header cache (builtinSamplerHeader) is correctly
349352
// initialized with r0 contents.
350353
// Used only when vISA_cacheSamplerHeader option is set.
@@ -621,6 +624,8 @@ class IR_Builder
621624

622625
G4_Declare* getSpillFillHeader();
623626

627+
G4_Declare* getEUFusionWATmpVar();
628+
624629
G4_Declare* getOldA0Dot2Temp();
625630
bool hasValidOldA0Dot2() { return oldA0Dot2Temp; }
626631

@@ -772,6 +777,8 @@ class IR_Builder
772777

773778
G4_INST* createPseudoKill(G4_Declare* dcl, PseudoKillType ty);
774779

780+
G4_INST* createEUWASpill(bool addToInstList);
781+
775782
// numRows is in hword units
776783
// offset is in hword units
777784
G4_INST* createSpill(

visa/BuildIRImpl.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,18 @@ G4_Declare* IR_Builder::getSpillFillHeader()
710710
return spillFillHeader;
711711
}
712712

713+
// Returns the temporary used by the EU fusion W/A, creating it on first use.
// The temp holds two UD elements, is Even_Word aligned, and is marked
// live-in, live-out and do-not-spill. Later calls return the cached declare.
G4_Declare* IR_Builder::getEUFusionWATmpVar()
{
    if (euFusionWATmpVar != nullptr)
    {
        // Already created by an earlier call.
        return euFusionWATmpVar;
    }

    euFusionWATmpVar = createTempVar(2, Type_UD, Even_Word, "euFusionWATmp");
    euFusionWATmpVar->setLiveOut();
    euFusionWATmpVar->setLiveIn();
    euFusionWATmpVar->setDoNotSpill();
    return euFusionWATmpVar;
}
724+
713725
G4_Declare* IR_Builder::getOldA0Dot2Temp()
714726
{
715727
if (!oldA0Dot2Temp)
@@ -959,6 +971,22 @@ G4_INST* IR_Builder::createPseudoKill(G4_Declare* dcl, PseudoKillType ty)
959971

960972
static const unsigned int HWORD_BYTE_SIZE = 32;
961973

974+
975+
// Builds a FlagSpill intrinsic (SIMD2, InstOpt_NoOpt) whose only operand is a
// scalar UD source region over the EU fusion W/A temporary.
//   addToInstList - forwarded unchanged to createIntrinsicInst.
// Returns the newly created intrinsic instruction.
G4_INST* IR_Builder::createEUWASpill(bool addToInstList)
{
    const RegionDesc* scalarRegion = getRegionScalar();
    G4_RegVar* waTmpVar = getEUFusionWATmpVar()->getRegVar();

    // Source 0 reads the W/A temp at row 0, sub-register 0.
    G4_SrcRegRegion* waTmpSrc = createSrc(waTmpVar, 0, 0, scalarRegion, Type_UD);

    // No predicate, no destination, and no second or third source.
    return createIntrinsicInst(
        nullptr, Intrinsic::FlagSpill, g4::SIMD2,
        nullptr, waTmpSrc, nullptr, nullptr, InstOpt_NoOpt, addToInstList);
}
989+
962990
G4_INST* IR_Builder::createSpill(
963991
G4_DstRegRegion* dst, G4_SrcRegRegion* header, G4_SrcRegRegion* payload,
964992
G4_ExecSize execSize,
@@ -971,6 +999,7 @@ G4_INST* IR_Builder::createSpill(
971999
spill->asSpillIntrinsic()->setOffset((uint32_t)
9721000
(((uint64_t)offset * HWORD_BYTE_SIZE) / numEltPerGRF<Type_UB>()));
9731001
spill->asSpillIntrinsic()->setNumRows(numRows);
1002+
9741003
return spill;
9751004
}
9761005

visa/G4_IR.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,7 @@ typedef struct _SWSBInfo
477477
bool isPseudoKill() const;
478478
bool isLifeTimeEnd() const;
479479
bool isSpillIntrinsic() const;
480+
bool isFlagSpillIntrinsic() const;
480481
G4_SpillIntrinsic* asSpillIntrinsic() const;
481482
bool isFillIntrinsic() const;
482483
G4_FillIntrinsic* asFillIntrinsic() const;
@@ -1502,6 +1503,7 @@ enum class Intrinsic
15021503
CallerRestore,
15031504
CalleeSave,
15041505
CalleeRestore,
1506+
FlagSpill,
15051507
NumIntrinsics
15061508
};
15071509

@@ -1546,6 +1548,7 @@ static const IntrinsicInfo G4_Intrinsics[(int)Intrinsic::NumIntrinsics] =
15461548
{Intrinsic::CallerRestore, "caller_restore", 0, 1, Phase::RA, { 0, 0, 0, false, false } },
15471549
{Intrinsic::CalleeSave, "callee_save", 1, 0, Phase::RA, { 0, 0, 0, false, false } },
15481550
{Intrinsic::CalleeRestore, "callee_restore", 0, 1, Phase::RA, { 0, 0, 0, false, false } },
1551+
{Intrinsic::FlagSpill, "flagSpill", 0, 1, Phase::RA, { 0, 0, 0, false, false } },
15491552
};
15501553

15511554
namespace vISA

visa/GraphColor.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9766,8 +9766,22 @@ int GlobalRA::coloringRegAlloc()
97669766
// declares and code. This currently must be done after flag/addr RA due to
97679767
// the assumption about the location of the pseudo save/restore instructions
97689768
//
9769+
bool euWADone = false;
97699770
if (hasStackCall)
97709771
{
9772+
if (builder.hasFusedEUWA() && !euWADone)
9773+
{
9774+
G4_INST* euWAInst = builder.createEUWASpill(false);
9775+
G4_BB* entryBB = (*kernel.fg.begin());
9776+
INST_LIST_ITER inst_it = entryBB->begin();
9777+
while ((*inst_it)->isLabel())
9778+
{
9779+
inst_it++;
9780+
}
9781+
entryBB->insertBefore(inst_it, euWAInst);
9782+
euWADone = true;
9783+
}
9784+
97719785
addCallerSavePseudoCode();
97729786

97739787
// Only GENX sub-graphs require callee-save code.
@@ -10174,6 +10188,19 @@ int GlobalRA::coloringRegAlloc()
1017410188
bool success = spillGRF.insertSpillFillCode(&kernel, pointsToAnalysis);
1017510189
nextSpillOffset = spillGRF.getNextOffset();
1017610190

10191+
if (builder.hasFusedEUWA() && !euWADone)
10192+
{
10193+
G4_INST * euWAInst = builder.createEUWASpill(false);
10194+
G4_BB* entryBB = (*kernel.fg.begin());
10195+
INST_LIST_ITER inst_it = entryBB->begin();
10196+
while ((*inst_it)->isLabel())
10197+
{
10198+
inst_it++;
10199+
}
10200+
entryBB->insertBefore(inst_it, euWAInst);
10201+
euWADone = true;
10202+
}
10203+
1017710204
if (builder.hasScratchSurface() && !hasStackCall &&
1017810205
(nextSpillOffset + globalScratchOffset) > SCRATCH_MSG_LIMIT)
1017910206
{
@@ -10250,7 +10277,9 @@ int GlobalRA::coloringRegAlloc()
1025010277
// it modifies IR
1025110278
regChart->dumpRegChart(std::cerr);
1025210279
}
10280+
1025310281
expandSpillFillIntrinsics(nextSpillOffset);
10282+
1025410283
if (builder.getOption(vISA_OptReport))
1025510284
{
1025610285
detectUndefinedUses(liveAnalysis, kernel);

visa/HWCaps.inc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -779,6 +779,12 @@ SPDX-License-Identifier: MIT
779779
return (getPlatform() == XE_HP);
780780
}
781781

782+
bool hasFusedEUWA() const
783+
{
784+
return ((getuint32Option(vISA_noMaskWA) & 0x3) > 0 ||
785+
getOption(vISA_forceNoMaskWA));
786+
}
787+
782788
bool hasFusedEU() const
783789
{
784790
return (getPlatform() == GENX_TGLLP || getPlatform() == XE_HP);

visa/Optimizer.cpp

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7177,6 +7177,13 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
71777177
// some workaround for HW restrictions. We apply them here so as not to affect optimizations, RA, and scheduling
71787178
void Optimizer::HWWorkaround()
71797179
{
7180+
if ((kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM) &&
7181+
builder.getJitInfo()->spillMemUsed > 0 && builder.hasFusedEUWA())
7182+
{
7183+
// For now, do it for CM/VC. Will turn it on for all.
7184+
doNoMaskWA_postRA();
7185+
}
7186+
71807187
// Ensure the first instruction of a stack function has switch option.
71817188
if (fg.getIsStackCallFunc() &&
71827189
VISA_WA_CHECK(builder.getPWaTable(), WaThreadSwitchAfterCall) &&
@@ -10543,6 +10550,7 @@ void Optimizer::removeInstrinsics()
1054310550
for (auto bb : kernel.fg)
1054410551
{
1054510552
bb->removeIntrinsics(Intrinsic::MemFence);
10553+
bb->removeIntrinsics(Intrinsic::FlagSpill);
1054610554
}
1054710555
}
1054810556

@@ -12020,6 +12028,185 @@ void Optimizer::doNoMaskWA()
1202012028
}
1202112029
}
1202212030

12031+
// Need to apply NoMaskWA on spill. For example,
12032+
// Consider the scenario where fusedMask should be off, but it is on due to the HW bug.
12033+
// Instructions with NoMask will run, and all the others will not.
12034+
//
12035+
// V77 (2GRF) spills at offset[4x32]. The following code reads V77 from spill
12036+
// location, modifies it, and finally writes the result back into offset[4x32].
12037+
// If the code can keep the content at this location unchanged, no WA is needed;
12038+
// otherwise, we must have WA.
12039+
//
12040+
// But the write at (3) will write whatever is in r4 into offset[4x32], which is undefined,
12041+
// definitely not guaranteed to be the same as r1 just read from the same location.
12042+
// (Note that mul at (2) will not run because the channel enable is off [only fusedMask
12043+
// is on].) This shows the code modifies the content at offset[4x32], which is wrong.
12044+
//
12045+
// With this, the WA must be applied. It is enough to apply on spill (write) only.
12046+
//
12047+
// Before RA:
12048+
// BB1:
12049+
// mul (M1, 16) V77(0,0)<1> V141(0,0)<0;1,0> V77(0,0)<1;1,0>
12050+
// BB2:
12051+
// svm_block_st (4) V154(0,0)<0;1,0> V77.0
12052+
//
12053+
// After RA
12054+
// BB1:
12055+
// (1) // wr:1h+0, rd:2; hword scratch block read x2
12056+
// // scratch space fill: FL_GRF_V77_6 from offset[4x32]
12057+
// (W) send.dc0 (16|M0) r1 r0 null 0x0 0x022C1004
12058+
// (2) mul (16|M0) r4.0<1>:f r3.0<0;1,0>:f r1.0<8;8,1>:f
12059+
// (3) // wr:1h+2, rd:0; hword scratch block write x2
12060+
// // scratch space spill: SP_GRF_V77_3 from offset[4x32];
12061+
// (W) send.dc0 (16|M0) null r0 r4 0x80 0x020F1004
12062+
//
12063+
// Note this works only for NoMaskWA=2
12064+
//
12065+
void Optimizer::doNoMaskWA_postRA()
12066+
{
12067+
std::vector<INST_LIST_ITER> NoMaskCandidates;
12068+
G4_ExecSize simdsize = fg.getKernel()->getSimdSize();
12069+
12070+
auto isCandidate = [](G4_INST* I) {
12071+
if (I->isSend() && I->isWriteEnableInst() &&
12072+
I->getPredicate() == nullptr &&
12073+
(I->getDst() == nullptr || I->getDst()->isNullReg()))
12074+
{
12075+
// This shall be a spill (write).
12076+
// May check if the spilled var is global. We only need
12077+
// to do WA for global spill!
12078+
return true;
12079+
}
12080+
return false;
12081+
};
12082+
12083+
auto createFlagFromCmp = [&](G4_BB* BB, INST_LIST_ITER& InsertPos,
12084+
G4_RegVar* flag, unsigned flagOff, G4_Type Ty)
12085+
{
12086+
// I0: (W) mov (1|M0) flag:Ty, 0
12087+
// flagDefInst: cmp (simdsize|M0) (eq)flag r0:uw r0:uw
12088+
G4_DstRegRegion* D = builder.createDst(flag, 0, flagOff, 1, Ty);
12089+
G4_INST* I0 = builder.createMov(g4::SIMD1, D, builder.createImm(0, Ty), InstOpt_WriteEnable, false);
12090+
BB->insertBefore(InsertPos, I0);
12091+
12092+
G4_SrcRegRegion* r0_0 = builder.createSrc(
12093+
builder.getRealR0()->getRegVar(), 0, 0,
12094+
builder.getRegionScalar(), Type_UW);
12095+
G4_SrcRegRegion* r0_1 = builder.createSrc(
12096+
builder.getRealR0()->getRegVar(), 0, 0,
12097+
builder.getRegionScalar(), Type_UW);
12098+
G4_CondMod* flagCM = builder.createCondMod(Mod_e, flag, flagOff);
12099+
G4_DstRegRegion* nullDst = builder.createNullDst(Type_UW);
12100+
G4_INST* I1 = builder.createInternalInst(
12101+
NULL, G4_cmp, flagCM, g4::NOSAT, simdsize,
12102+
nullDst, r0_0, r0_1, InstOpt_M0);
12103+
BB->insertBefore(InsertPos, I1);
12104+
};
12105+
12106+
auto createMov1 = [&](G4_BB* BB, INST_LIST_ITER& InsertPos,
12107+
G4_RegVar* Dst, unsigned Dst_off, G4_RegVar* Src, unsigned Src_off, G4_Type Ty)
12108+
{
12109+
G4_DstRegRegion* D = builder.createDst(Dst, 0, Dst_off, 1, Ty);
12110+
G4_SrcRegRegion* S = builder.createSrc(Src, 0, Src_off, builder.getRegionScalar(), Ty);
12111+
G4_INST* tI = builder.createMov(g4::SIMD1, D, S, InstOpt_WriteEnable, false);
12112+
BB->insertBefore(InsertPos, tI);
12113+
};
12114+
12115+
// Assuming all flags are used, thus need to spill one.
12116+
// RA reserves two DW for this purpose:
12117+
// DW0: <original flag>
12118+
// DW1: <WA flag> // set once and reuse it in the same BB
12119+
// For example, the following spill send needs WA:
12120+
// (W) send (16|M0) ...
12121+
// Let's say we use f0.0, WA sequence is as follows:
12122+
// 1. (W) mov (1|M0) DW0:uw f0.0<0;1,0>:uw // save
12123+
// 2. (W) mov (1|M0) f0.0<1>:uw 0:uw
12124+
// 3. cmp (16|M0) (eq)f0.0 null<1>:uw r0.0<0;1,0>:uw r0.0<0;1,0>:uw
12125+
// 4. (W) mov (1|M0) DW1:uw f0.0<0;1,0>:uw // WASave
12126+
// (W & f0.0.any16h) send (16|M0) ...
12127+
// 5. (W) mov (1|M0) f0.0<1>:uw DW0:uw // restore
12128+
// Note that 2,3, and 4 are needed once per BB. They are done for the first WA send.
12129+
// If there are more WA sends in the same BB, the WA send after the 1st needs to have
12130+
// 1. (W) mov (1|M0) DW0:uw f0.0<0;1,0>:uw // save
12131+
// 2. (W) mov (1|M0) f0.0<1>:uw DW1:uw // WARestore
12132+
// (W & f0.0.any16h) send (16|M0) ...
12133+
// 3. (W) mov (1|M0) f0.0<1>:uw DW0:uw // restore
12134+
//
12135+
// Todo: check if save/restore is needed to avoid redundant save/restore.
12136+
//
12137+
G4_Declare* saveTmp = builder.getEUFusionWATmpVar(); // 2DW;
12138+
G4_RegVar* saveVar = saveTmp->getRegVar();
12139+
G4_Predicate_Control waPredCtrl =
12140+
(simdsize == 8 ? PRED_ANY8H
12141+
: (simdsize == 16 ? PRED_ANY16H : PRED_ANY32H));
12142+
unsigned saveOff = 0, waSaveOff = (simdsize == 32 ? 1 : 2);
12143+
12144+
for (auto BI : fg)
12145+
{
12146+
G4_BB* BB = BI;
12147+
if ((BB->getBBType() & G4_BB_NM_WA_TYPE) == 0)
12148+
{
12149+
continue;
12150+
}
12151+
12152+
// per-BB insts that need NoMaskWA (aka WA inst)
12153+
std::vector<INST_LIST_ITER> WAInsts;
12154+
12155+
// First collect all candidates and also check if there is
12156+
// any free flag registers
12157+
for (auto II = BB->begin(), IE = BB->end(); II != IE; ++II)
12158+
{
12159+
G4_INST* I = *II;
12160+
if (!isCandidate(I))
12161+
{
12162+
continue;
12163+
}
12164+
WAInsts.push_back(II);
12165+
}
12166+
12167+
if (WAInsts.empty())
12168+
{
12169+
continue;
12170+
}
12171+
12172+
// Without optimization, always do save/restore
12173+
bool needSave = true;
12174+
bool needRestore = true;
12175+
G4_Type Ty = (simdsize > 16) ? Type_UD : Type_UW;
12176+
G4_Declare* flagDcl = builder.createTempFlag((Ty == Type_UW ? 1 : 2), "waflag");
12177+
G4_RegVar* flagVar = flagDcl->getRegVar();
12178+
flagVar->setPhyReg(builder.phyregpool.getFlagAreg(0), 0);
12179+
12180+
// Save flag, create WA mask, save WAflag
12181+
createMov1(BB, WAInsts[0], saveVar, saveOff, flagVar, 0, Ty); // save
12182+
createFlagFromCmp(BB, WAInsts[0], flagVar, 0, Ty);
12183+
if (WAInsts.size() > 1) {
12184+
createMov1(BB, WAInsts[0], saveVar, waSaveOff, flagVar, 0, Ty); // WASave
12185+
}
12186+
12187+
for (int i = 0, sz = (int)WAInsts.size(); i < sz; ++i)
12188+
{
12189+
auto& currII = WAInsts[i];
12190+
12191+
if (i > 0 && needSave) {
12192+
createMov1(BB, currII, saveVar, saveOff, flagVar, 0, Ty); // save
12193+
createMov1(BB, currII, flagVar, 0, saveVar, waSaveOff, Ty); // WARestore
12194+
}
12195+
12196+
G4_INST* I = *currII;
12197+
G4_Predicate* newPred = builder.createPredicate(
12198+
PredState_Plus, flagVar, 0, waPredCtrl);
12199+
I->setPredicate(newPred);
12200+
12201+
if (i == (sz - 1) || needRestore) {
12202+
auto nextII = currII;
12203+
++nextII;
12204+
createMov1(BB, nextII, flagVar, 0, saveVar, saveOff, Ty); // restore
12205+
}
12206+
}
12207+
}
12208+
}
12209+
1202312210
// Convert vISA MULH dst:d src0:d src1:d into
1202412211
// mul acc0.0<1>:d src0:d src1:w
1202512212
// mach dst:d src0:d src1:d

visa/Optimizer.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ class Optimizer
240240
void setA0toTdrForSendc();
241241
void replaceRetWithJmpi();
242242
void doNoMaskWA();
243+
void doNoMaskWA_postRA();
243244
void insertFenceAtEntry();
244245
void expandMulPostSchedule();
245246
void expandMadwPostSchedule();

0 commit comments

Comments
 (0)