Skip to content

Commit 41e0f52

Browse files
jgu222igcbot
authored and committed
call WA for partial hw fix
The fused call has a partial HW fix, which requires removing the predicate from predicated calls. Also, any indirect call might have different targets for bigEU and smallEU and therefore needs two calls, one for bigEU and the other for smallEU. The new code is off for now and will be turned on in the next submit.
1 parent b299816 commit 41e0f52

File tree

4 files changed

+253
-24
lines changed

4 files changed

+253
-24
lines changed

visa/FlowGraph.cpp

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,8 @@ bool FlowGraph::matchBranch(int &sn, INST_LIST& instlist, INST_LIST_ITER &it)
606606
//
607607
void FlowGraph::preprocess(INST_LIST& instlist)
608608
{
609+
// It is easier to add WA before control-flow is built for partial HW
610+
// fix to fused call HW bug.
609611

610612
std::unordered_set<G4_Label*> labels; // label inst we have seen so far
611613

@@ -989,6 +991,12 @@ void FlowGraph::constructFlowGraph(INST_LIST& instlist)
989991

990992
pKernel->dumpToFile("after.CFGConstruction");
991993

994+
// Do call WA before return/exit processing.
995+
if (builder->hasFusedEU() && builder->getuint32Option(vISA_fusedCallWA) == 2)
996+
{
997+
hasGoto |= convertPredCall(labelMap);
998+
}
999+
9921000
removeRedundantLabels();
9931001

9941002
pKernel->dumpToFile("after.RemoveRedundantLabels");
@@ -4880,6 +4888,137 @@ bool FlowGraph::convertJmpiToGoto()
48804888
return Changed;
48814889
}
48824890

4891+
// Changes a predicated call to an unpredicated call like the following:
4892+
// BB:
4893+
// (p) call (execSize) ...
4894+
// NextBB:
4895+
//
4896+
// It is changed to
4897+
// BB:
4898+
// p1 = simdsize > execSize ? (p & ((1<<execSize) - 1)) : p
4899+
// (~p1) goto (simdsize) target_lbl
4900+
// newCallBB:
4901+
// call (execSize) ...
4902+
// newNextBB:
4903+
// NextBB:
4904+
// where target_lbl is the label of newNextBB.
4905+
//
4906+
// Rewrites every BB-terminating predicated call/fcall into an unpredicated
// call that is skipped by a goto on the negated predicate.
//
// aLabelMap: label -> BB map; the labels of the two BBs created for each
//            converted call are inserted into it.
// Returns true if at least one call was converted, i.e. a goto was added.
bool FlowGraph::convertPredCall(std::unordered_map<G4_Label*, G4_BB*>& aLabelMap)
{
    bool changed = false;
    // Add BB0 and BB1 into the subroutine in which aAnchorBB is.
    auto updateSubroutineTab = [&](G4_BB* aAnchorBB, G4_BB* BB0, G4_BB* BB1)
    {
        // Iterate by reference: iterating by value would copy each entry and
        // the push_back calls below would only modify the temporary copy,
        // leaving the real subroutine table unchanged.
        for (auto& MI : subroutines)
        {
            std::vector<G4_BB*>& bblists = MI.second;
            auto BI = std::find(bblists.begin(), bblists.end(), aAnchorBB);
            if (BI == bblists.end())
            {
                continue;
            }
            bblists.push_back(BB0);
            bblists.push_back(BB1);
        }
    };

    const G4_ExecSize SimdSize = pKernel->getSimdSize();
    G4_Type PredTy = SimdSize > 16 ? Type_UD : Type_UW;
    auto NextBI = BBs.begin();
    for (auto BI = NextBI, BE = BBs.end(); BI != BE; BI = NextBI)
    {
        // NextBI always points at the BB following the current one; new BBs
        // are inserted in front of it, so they are not visited by this loop.
        ++NextBI;
        G4_BB* BB = *BI;
        if (BB->empty())
        {
            continue;
        }
        G4_BB* NextBB = (NextBI == BE ? nullptr : *NextBI);

        // Only a predicated call/fcall at the end of the BB is of interest.
        G4_INST* Inst = BB->back();
        G4_Predicate* Pred = Inst->getPredicate();
        if (!(Pred && (Inst->isCall() || Inst->isFCall())))
        {
            continue;
        }

        const bool hasFallThru = (NextBB && BB->Succs.size() >= 1 && BB->Succs.front() == NextBB);

        // BB that will hold the unpredicated call.
        G4_BB* newCallBB = createNewBBWithLabel("predCallWA", Inst->getLineNo(), Inst->getCISAOff());
        insert(NextBI, newCallBB);
        G4_Label* callBB_lbl = newCallBB->getLabel();
        assert(callBB_lbl);
        aLabelMap.insert(std::make_pair(callBB_lbl, newCallBB));

        // BB that the goto jumps to when the call must be skipped.
        G4_BB* targetBB = createNewBBWithLabel("predCallWA", Inst->getLineNo(), Inst->getCISAOff());
        insert(NextBI, targetBB);
        G4_Label* target_lbl = targetBB->getLabel();
        assert(target_lbl);
        aLabelMap.insert(std::make_pair(target_lbl, targetBB));

        // relink call's succs
        if (hasFallThru)
        {
            removePredSuccEdges(BB, NextBB);
        }
        // Move the remaining successors of BB over to newCallBB. SI is
        // advanced before the edge is removed (presumably removePredSuccEdges
        // erases Succ from BB->Succs - confirm against its definition).
        auto SI = BB->Succs.begin(), SE = BB->Succs.end();
        while (SI != SE)
        {
            G4_BB* Succ = *SI;
            ++SI;
            removePredSuccEdges(BB, Succ);
            addPredSuccEdges(newCallBB, Succ, false);
        }
        addPredSuccEdges(BB, newCallBB, true);
        addPredSuccEdges(BB, targetBB, false);
        addPredSuccEdges(newCallBB, targetBB, true);
        if (hasFallThru)
        {
            addPredSuccEdges(targetBB, NextBB, true);
        }

        // delink call inst
        BB->pop_back();

        G4_Predicate* newPred;
        const G4_ExecSize ExecSize = Inst->getExecSize();
        if (SimdSize == ExecSize)
        {
            // negate predicate
            newPred = builder->createPredicate(
                Pred->getState() == PredState_Plus ? PredState_Minus : PredState_Plus,
                Pred->getBase()->asRegVar(), 0, Pred->getControl());
        }
        else
        {
            // The call's exec size differs from the kernel simd size: mask the
            // original flag down to its low ExecSize bits in a new temp flag
            // and predicate the goto on the negation of that flag.
            // NOTE(review): (1 << ExecSize) assumes ExecSize < 32, i.e. the
            // "simdsize > execSize" case - confirm ExecSize cannot be 32 here.

            // Common dst and src0 operand for flag.
            G4_Type oldPredTy = ExecSize > 16 ? Type_UD : Type_UW;
            G4_Declare* newDcl = builder->createTempFlag(PredTy == Type_UD ? 2 : 1);
            auto pDst = builder->createDst(newDcl->getRegVar(), 0, 0, 1, PredTy);
            auto pSrc0 = builder->createSrc(Pred->getBase(), 0, 0, builder->getRegionScalar(), oldPredTy);
            auto pSrc1 = builder->createImm((1 << ExecSize) - 1, PredTy);
            auto pInst = builder->createBinOp(
                G4_and, g4::SIMD1, pDst, pSrc0, pSrc1,
                InstOpt_M0 | InstOpt_WriteEnable, false);
            BB->push_back(pInst);

            newPred = builder->createPredicate(
                Pred->getState() == PredState_Plus ? PredState_Minus : PredState_Plus,
                newDcl->getRegVar(), 0, Pred->getControl());
        }

        // BB now ends with the goto that skips the call.
        G4_INST* gotoInst = builder->createGoto(newPred, SimdSize, target_lbl, InstOpt_NoOpt, false);
        BB->push_back(gotoInst);

        // The call itself moves, unpredicated, into newCallBB.
        Inst->setPredicate(nullptr);
        newCallBB->push_back(Inst);

        updateSubroutineTab(BB, newCallBB, targetBB);
        changed = true;
    }
    // if changed is true, it means goto has been added.
    return changed;
}
5021+
48835022
void FlowGraph::print(std::ostream& OS) const
48845023
{
48855024
const char* kname = nullptr;

visa/FlowGraph.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,7 @@ class FlowGraph
384384
void convertGotoToJmpi(G4_INST *gotoInst);
385385
G4_BB* getSinglePredecessor(G4_BB* BB, G4_BB* ExcludedPred) const;
386386
bool convertJmpiToGoto();
387+
bool convertPredCall(std::unordered_map<G4_Label*, G4_BB*>& aLabelMap); // for WA
387388

388389
unsigned getNumFuncs() const {return unsigned(funcInfoTable.size());}
389390

visa/Optimizer.cpp

Lines changed: 111 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7599,8 +7599,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
75997599
}
76007600

76017601
// Call WA for fused EU
7602-
if (builder.hasFusedEU() && builder.getuint32Option(vISA_fusedCallWA) == 1 &&
7603-
kernel.hasIndirectCall())
7602+
if (builder.hasFusedEU() && kernel.hasIndirectCall())
76047603
{
76057604
applyFusedCallWA();
76067605
}
@@ -16587,30 +16586,64 @@ void Optimizer::doNoMaskWA_postRA()
1658716586
}
1658816587
}
1658916588

16590-
// Assumption:
16589+
// Summary:
1659116590
// vISA assumes the call's target would be uniform within a thread. This is consistent with
16592-
// hardware call instructions. Under EU fusion, assume that an indirect call invokes A
16593-
// in thread 0 and invokes B in thread 1, which isn't supported.
16591+
// hardware call instructions. Under EU fusion, a pair of fused thread 0 and 1 might diverge,
16592+
// meaning that an indirect call invokes A in thread 0 and invokes B in thread 1, which isn't
16593+
// supported by fused EU hardware.
1659416594
//
16595-
// This function does two things:
16596-
// 1. For any indirect call like the following:
16597-
// call
16598-
// changed it to:
16595+
// This function is used to make sure each fused call will have a single target. As there are HW bugs
16596+
// in fused calls, this function will WA HW bugs as well. The general idea is:
16597+
// Given:
16598+
// (p) call r5
16599+
// Changed it to:
1659916600
// if (BigEU)
16600-
// call
16601+
// (p) call r5
1660116602
// else // SmallEU
16602-
// call
16603-
// 2. HW has a bug in which call always runs and it always uses BigEU's target as targets for both EUs.
16604-
// This causes several issues and the WA is used to fix this harware bug.
16603+
// (p) call r5
1660516604
//
16606-
// Details of 2
16605+
// HW has a bug in which call always runs (even with no active channels) and it always uses BigEU's target
16606+
// as targets for both EUs. This causes several issues and the software WA is used to fix this hardware bug.
16607+
// There are several cases:
16608+
// 1. For platforms that have NO HW fix (fusedCallWA 1), apply the software WA as described
16609+
// below in "Details of 1",
16610+
//
16611+
// 2. For platforms that have the PARTIAL HW fix (fusedCallWA 2)
16612+
// Any predicated call must be changed to unpredicated like the following:
16613+
// (p) call ...
16614+
// changed to
16615+
// if (p)
16616+
// call ...
16617+
//
16618+
// This is done in Flowgraph::convertPredCall(), right after control-flow is
16619+
// constructed.
16620+
//
16621+
// 2.1 for indirect call like the following
16622+
// (p) call r5
16623+
//
16624+
// if (p)
16625+
// if (BigEU) // BigEU
16626+
// call r5
16627+
// else // SmallEU
16628+
// call r5
16629+
// 3. For platforms that have a full fix (if any) (fusedCallWA 0),
16630+
// just do the following for indirect call.
16631+
// (p) call r5
16632+
// if (BigEU) // BigEU
16633+
// (p) call r5
16634+
// else // SmallEU
16635+
// (p) call r5
16636+
//
16637+
// This function handles 1) and duplicating call for BigEU and SmallEU.
16638+
//
16639+
// Details of 1
1660716640
// ============
1660816641
// Under EU fusion, assume that an indirect call invokes A in thread 0 and invokes B in thread 1.
1660916642
// Assume that these two threads are fused and run on a pair of fused EUs {bigEU, smallEU}. The hardware
16610-
// will always invoke A: the callee from thread 0 in bigEU even in else (smallEU) barnch, which is
16611-
// incorrect. To workaround this bug, we have to rely on the fact that cr0.2 is shared among the pair
16612-
// of fused EUs and copy thread 1's callee into thread 0 via cr0.2. In doing so, thread 1's callee
16613-
// can be invoked. The details are as follows:
16643+
// will always invoke A: the callee from thread 0 in bigEU even in else branch (in general case),
16644+
// which is incorrect. To workaround this bug, we have to rely on the fact that cr0.2 is shared among
16645+
// the pair of fused EUs and copy thread 1's callee B into thread 0 via cr0.2. In doing so, thread 1's
16646+
// callee can be invoked. The details are as follows:
1661416647
//
1661516648
// before:
1661616649
// BB:
@@ -16644,13 +16677,15 @@ void Optimizer::doNoMaskWA_postRA()
1664416677
// join <nextJoin or null> // finalJoin
1664516678
//
1664616679
// The BBs and those insts such as I4_patch_add/I5_patch_add, etc are added into m_indirectCallWAInfo
16647-
// so that finishFusedCallWA() can finish post-processing to patch the relative IP and others.
16680+
// so that finishFusedCallWA() can finish post-processing to patch the relative IP and others. If calla
16681+
// can be used, no IP patching is needed. See code for details.
1664816682
//
16649-
// Note that there is another hardware bug. If BigEU is off, the mov instruction
16683+
// In order to make the following always run even though bigEU is off,
1665016684
// "(W) mov (1 |M0) smallEUTarget:ud cr0.2<0;1,0>:ud"
16651-
// will not run, thus BigEU will not have smallEU's target. Since this WA requires
16652-
// the above move must be run, a special maskOff (M16) must be used to force NoMask to run
16653-
// no matter if the EU is off or on. This will be handled in finishFusedCallWA().
16685+
// a special maskOff (M16) must be used to force NoMask to run no matter if the EU is off or on.
16686+
// This will be handled in finishFusedCallWA().
16687+
// (See details in finishFusedCallWA(). To make it work, any kernel with indirect call is required
16688+
// to be simd16 or simd8, not simd32, so that M16 can be used to force running the inst always.)
1665416689
//
1665516690
void Optimizer::applyFusedCallWA()
1665616691
{
@@ -16707,6 +16742,12 @@ void Optimizer::applyFusedCallWA()
1670716742
}
1670816743
};
1670916744

16745+
// Only process call wa (fusedCallWA = 1) or indirect call is non-uniform
16746+
if (!((builder.getuint32Option(vISA_fusedCallWA) == 1) || !builder.getOption(vISA_fusedCallUniform)))
16747+
{
16748+
return;
16749+
}
16750+
1671016751
for (BB_LIST_ITER BI = fg.begin(), BE = fg.end(); BI != BE;)
1671116752
{
1671216753
BB_LIST_ITER currBI = BI;
@@ -16837,6 +16878,53 @@ void Optimizer::applyFusedCallWA()
1683716878
simdsz > g4::SIMD16 ? g4::SIMD32 : g4::SIMD16,
1683816879
builder.createNullDst(Type_UW), I1_Src0, Bit7, InstOpt_WriteEnable);
1683916880

16881+
if (builder.getuint32Option(vISA_fusedCallWA) != 1)
16882+
{
16883+
assert(!builder.getOption(vISA_fusedCallUniform));
16884+
// Just need to duplicate the call so that one is called under BigEU,
16885+
// and the other is under SmallEU.
16886+
16887+
BB->pop_back(); // unlink the call inst from BB
16888+
BB->push_back(I0);
16889+
BB->push_back(I1);
16890+
16891+
I0->addDefUse(I1, Opnd_src0);
16892+
16893+
G4_Predicate* pred_m1 = builder.createPredicate(PredState_Minus, F->getRegVar(), 0);
16894+
G4_INST* gotoSmallB0 = builder.createCFInst(pred_m1, G4_goto, simdsz, smallB0Label, smallB0Label, InstOpt_NoOpt, false);
16895+
BB->push_back(gotoSmallB0);
16896+
I1->addDefUse(gotoSmallB0, Opnd_pred);
16897+
16898+
G4_Predicate* nPred(callI->getPredicate());
16899+
G4_SrcRegRegion* nSrc = builder.createSrc(Target->getBase(), 0, 0, builder.getRegionScalar(), Type_UD);
16900+
G4_INST* nCallI = builder.createInternalInst(nPred, callI->opcode(),
16901+
nullptr, g4::NOSAT, callI->getExecSize(), nullptr, nSrc, nullptr, callI->getOption());
16902+
(void)bigB0->push_back(callI);
16903+
(void)smallB0->push_back(nCallI);
16904+
16905+
// Need to create fcall info
16906+
if (G4_FCALL* orig_fcallinfo = builder.getFcallInfo(callI))
16907+
{
16908+
builder.addFcallInfo(nCallI, orig_fcallinfo->getArgSize(), orig_fcallinfo->getRetSize());
16909+
}
16910+
// Might need to update subroutine table
16911+
updateSubroutineTableIfNeeded(origNextBB, bigB0, bigB1, smallB0, smallB1, newNextBB);
16912+
16913+
if (!fg.globalOpndHT.isOpndGlobal(Target))
16914+
{
16915+
callI->removeDefUse(Opnd_src0);
16916+
}
16917+
fg.globalOpndHT.addGlobalOpnd(Target);
16918+
fg.globalOpndHT.addGlobalOpnd(nSrc);
16919+
16920+
// done with this indirect call.
16921+
continue;
16922+
}
16923+
16924+
//
16925+
// main call WA under fusedCallWA = 1
16926+
//
16927+
1684016928
// I2: (!flag) mov cr0.2 callee
1684116929
G4_VarBase* V_cr0 = builder.phyregpool.getCr0Reg();
1684216930
G4_DstRegRegion* I2_Dst = builder.createDst(V_cr0, 0, 2, 1, Type_UD);

visa/include/VISAOptionsDefs.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,8 @@ DEF_VISA_OPTION(vISA_forceNoMaskWA, ET_BOOL, "-forceNoMaskWA",
343343
DEF_VISA_OPTION(vISA_newNoMaskWA, ET_BOOL, "-newNoMaskWA", "Temp just for VC", true)
344344
DEF_VISA_OPTION(vISA_noMaskWAOnFuncEntry, ET_BOOL, "-noMaskWAOnFuncEntry", UNUSED, true)
345345
DEF_VISA_OPTION(vISA_newTmpNoMaskWA, ET_INT32, "-newTmpNoMaskWA", "to control scalar IGC, temporary -newTempNoMaskWA 0|1|2", 0)
346-
DEF_VISA_OPTION(vISA_fusedCallWA, ET_INT32, "-fusedCallWA", "EU Fusion call ww: 0: no wa, 1: sw wa w/o hw fix; 2: sw wa with partial HW fix", 0)
346+
DEF_VISA_OPTION(vISA_fusedCallWA, ET_INT32, "-fusedCallWA", "EU Fusion call wa: 0: no hw bug, 1: sw wa w/o hw fix; 2: sw wa with partial HW fix", 0)
347+
DEF_VISA_OPTION(vISA_fusedCallUniform, ET_BOOL, "-fusedCallUniform", "true: fused call is uniform; false otherwise.", false)
347348
DEF_VISA_OPTION(vISA_DstSrcOverlapWA, ET_BOOL, "-dstSrcOverlapWA", UNUSED, true)
348349
DEF_VISA_OPTION(vISA_Src1Src2OverlapWA, ET_BOOL, "-src1Src2OverlapWA", UNUSED, false)
349350
DEF_VISA_OPTION(vISA_noSendSrcDstOverlap, ET_BOOL, "-noSendSrcDstOverlap", UNUSED, false)

0 commit comments

Comments
 (0)