@@ -7599,8 +7599,7 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
7599
7599
}
7600
7600
7601
7601
// Call WA for fused EU
7602
- if (builder.hasFusedEU() && builder.getuint32Option(vISA_fusedCallWA) == 1 &&
7603
- kernel.hasIndirectCall())
7602
+ if (builder.hasFusedEU() && kernel.hasIndirectCall())
7604
7603
{
7605
7604
applyFusedCallWA();
7606
7605
}
@@ -16587,30 +16586,64 @@ void Optimizer::doNoMaskWA_postRA()
16587
16586
}
16588
16587
}
16589
16588
16590
- // Assumption :
16589
+ // Summary :
16591
16590
// vISA assumes the call's target would be uniform within a thread. This is consistent with
16592
- // hardware call instructions. Under EU fusion, assume that an indirect call invokes A
16593
- // in thread 0 and invokes B in thread 1, which isn't supported.
16591
+ // hardware call instructions. Under EU fusion, a pair of fused thread 0 and 1 might diverge,
16592
+ // meaning that an indirect call invokes A in thread 0 and invokes B in thread 1, which isn't
16593
+ // supported by fused EU hardware.
16594
16594
//
16595
- // This function does two things:
16596
- // 1. For any indirect call like the following:
16597
- // call
16598
- // changed it to:
16595
+ // This function is used to make sure each fused call will have a single target. As there are HW bugs
16596
+ // in fused calls, this function will WA HW bugs as well. The general idea is:
16597
+ // Given:
16598
+ // (p) call r5
16599
+ // Changed it to:
16599
16600
// if (BigEU)
16600
- // call
16601
+ // (p) call r5
16601
16602
// else // SmallEU
16602
- // call
16603
- // 2. HW has a bug in which call always runs and it always uses BigEU's target as targets for both EUs.
16604
- // This causes several issues and the WA is used to fix this harware bug.
16603
+ // (p) call r5
16605
16604
//
16606
- // Details of 2
16605
+ // HW has a bug in which call always runs (even with no active channels) and it always uses BigEU's target
16606
+ // as the target for both EUs. This causes several issues and the software WA is used to fix this hardware bug.
16607
+ // There are several cases:
16608
+ // 1. For platforms that have NO HW fix (fusedCallWA 1), apply the software WA as described
16609
+ // below in "Details of 1",
16610
+ //
16611
+ // 2. For platforms that have the PARTIAL HW fix (fusedCallWA 2)
16612
+ // Any predicated call must be changed to unpredicated like the following:
16613
+ // (p) call ...
16614
+ // changed to
16615
+ // if (p)
16616
+ // call ...
16617
+ //
16618
+ // This is done in Flowgraph::convertPredCall(), right after control-flow is
16619
+ // constructed.
16620
+ //
16621
+ // 2.1 for indirect call like the following
16622
+ // (p) call r5
16623
+ //
16624
+ // if (p)
16625
+ // if (BigEU) // BigEU
16626
+ // call r5
16627
+ // else // SmallEU
16628
+ // call r5
16629
+ // 3. For platforms that have a full fix (if any) (fusedCallWA 0),
16630
+ // just do the following for indirect call.
16631
+ // (p) call r5
16632
+ // if (BigEU) // BigEU
16633
+ // (p) call r5
16634
+ // else // SmallEU
16635
+ // (p) call r5
16636
+ //
16637
+ // This function handles 1) and duplicating call for BigEU and SmallEU.
16638
+ //
16639
+ // Details of 1
16607
16640
// ============
16608
16641
// Under EU fusion, assume that an indirect call invokes A in thread 0 and invokes B in thread 1.
16609
16642
// Assume that these two threads are fused and run on a pair of fused EUs {bigEU, smallEU}. The hardware
16610
- // will always invoke A: the callee from thread 0 in bigEU even in else (smallEU) barnch, which is
16611
- // incorrect. To workaround this bug, we have to rely on the fact that cr0.2 is shared among the pair
16612
- // of fused EUs and copy thread 1's callee into thread 0 via cr0.2. In doing so, thread 1's callee
16613
- // can be invoked. The details are as follows:
16643
+ // will always invoke A: the callee from thread 0 in bigEU even in else branch (in general case),
16644
+ // which is incorrect. To workaround this bug, we have to rely on the fact that cr0.2 is shared among
16645
+ // the pair of fused EUs and copy thread 1's callee B into thread 0 via cr0.2. In doing so, thread 1's
16646
+ // callee can be invoked. The details are as follows:
16614
16647
//
16615
16648
// before:
16616
16649
// BB:
@@ -16644,13 +16677,15 @@ void Optimizer::doNoMaskWA_postRA()
16644
16677
// join <nextJoin or null> // finalJoin
16645
16678
//
16646
16679
// The BBs and those insts such as I4_patch_add/I5_patch_add, etc are added into m_indirectCallWAInfo
16647
- // so that finishFusedCallWA() can finish post-processing to patch the relative IP and others.
16680
+ // so that finishFusedCallWA() can finish post-processing to patch the relative IP and others. If calla
16681
+ // can be used, no IP patching is needed. See code for details.
16648
16682
//
16649
- // Note that there is another hardware bug. If BigEU is off, the mov instruction
16683
+ // In order to make the following always run even though bigEU is off,
16650
16684
// "(W) mov (1 |M0) smallEUTarget:ud cr0.2<0;1,0>:ud"
16651
- // will not run, thus BigEU will not have smallEU's target. Since this WA requires
16652
- // the above move must be run, a special maskOff (M16) must be used to force NoMask to run
16653
- // no matter if the EU is off or on. This will be handled in finishFusedCallWA().
16685
+ // a special maskOff (M16) must be used to force NoMask to run no matter if the EU is off or on.
16686
+ // This will be handled in finishFusedCallWA().
16687
+ // (See details in finishFusedCallWA(). To make it work, any kernel with indirect call is required
16688
+ // to be simd16 or simd8, not simd32, so that M16 can be used to force running the inst always.)
16654
16689
//
16655
16690
void Optimizer::applyFusedCallWA()
16656
16691
{
@@ -16707,6 +16742,12 @@ void Optimizer::applyFusedCallWA()
16707
16742
}
16708
16743
};
16709
16744
16745
+ // Only proceed if the call WA is needed (fusedCallWA == 1) or indirect calls may be non-uniform
16746
+ if (!((builder.getuint32Option(vISA_fusedCallWA) == 1) || !builder.getOption(vISA_fusedCallUniform)))
16747
+ {
16748
+ return;
16749
+ }
16750
+
16710
16751
for (BB_LIST_ITER BI = fg.begin(), BE = fg.end(); BI != BE;)
16711
16752
{
16712
16753
BB_LIST_ITER currBI = BI;
@@ -16837,6 +16878,53 @@ void Optimizer::applyFusedCallWA()
16837
16878
simdsz > g4::SIMD16 ? g4::SIMD32 : g4::SIMD16,
16838
16879
builder.createNullDst(Type_UW), I1_Src0, Bit7, InstOpt_WriteEnable);
16839
16880
16881
+ if (builder.getuint32Option(vISA_fusedCallWA) != 1)
16882
+ {
16883
+ assert(!builder.getOption(vISA_fusedCallUniform));
16884
+ // Just need to duplicate the call so that one is called under BigEU,
16885
+ // and the other is under SmallEU.
16886
+
16887
+ BB->pop_back(); // unlink the call inst from BB
16888
+ BB->push_back(I0);
16889
+ BB->push_back(I1);
16890
+
16891
+ I0->addDefUse(I1, Opnd_src0);
16892
+
16893
+ G4_Predicate* pred_m1 = builder.createPredicate(PredState_Minus, F->getRegVar(), 0);
16894
+ G4_INST* gotoSmallB0 = builder.createCFInst(pred_m1, G4_goto, simdsz, smallB0Label, smallB0Label, InstOpt_NoOpt, false);
16895
+ BB->push_back(gotoSmallB0);
16896
+ I1->addDefUse(gotoSmallB0, Opnd_pred);
16897
+
16898
+ G4_Predicate* nPred(callI->getPredicate());
16899
+ G4_SrcRegRegion* nSrc = builder.createSrc(Target->getBase(), 0, 0, builder.getRegionScalar(), Type_UD);
16900
+ G4_INST* nCallI = builder.createInternalInst(nPred, callI->opcode(),
16901
+ nullptr, g4::NOSAT, callI->getExecSize(), nullptr, nSrc, nullptr, callI->getOption());
16902
+ (void)bigB0->push_back(callI);
16903
+ (void)smallB0->push_back(nCallI);
16904
+
16905
+ // Need to create fcall info
16906
+ if (G4_FCALL* orig_fcallinfo = builder.getFcallInfo(callI))
16907
+ {
16908
+ builder.addFcallInfo(nCallI, orig_fcallinfo->getArgSize(), orig_fcallinfo->getRetSize());
16909
+ }
16910
+ // Might need to update subroutine table
16911
+ updateSubroutineTableIfNeeded(origNextBB, bigB0, bigB1, smallB0, smallB1, newNextBB);
16912
+
16913
+ if (!fg.globalOpndHT.isOpndGlobal(Target))
16914
+ {
16915
+ callI->removeDefUse(Opnd_src0);
16916
+ }
16917
+ fg.globalOpndHT.addGlobalOpnd(Target);
16918
+ fg.globalOpndHT.addGlobalOpnd(nSrc);
16919
+
16920
+ // done with this indirect call.
16921
+ continue;
16922
+ }
16923
+
16924
+ //
16925
+ // main call WA under fusedCallWA = 1
16926
+ //
16927
+
16840
16928
// I2: (!flag) mov cr0.2 callee
16841
16929
G4_VarBase* V_cr0 = builder.phyregpool.getCr0Reg();
16842
16930
G4_DstRegRegion* I2_Dst = builder.createDst(V_cr0, 0, 2, 1, Type_UD);
0 commit comments