@@ -6879,7 +6879,8 @@ bool Optimizer::foldPseudoAndOr(G4_BB* bb, INST_LIST_ITER& ii)
6879
6879
void Optimizer::HWWorkaround ()
6880
6880
{
6881
6881
if ((kernel.getInt32KernelAttr (Attributes::ATTR_Target) == VISA_CM) &&
6882
- builder.getJitInfo ()->spillMemUsed > 0 && builder.hasFusedEUWA ())
6882
+ builder.hasFusedEUWA () &&
6883
+ (builder.getJitInfo ()->spillMemUsed > 0 || builder.getJitInfo ()->numFlagSpillStore > 0 ))
6883
6884
{
6884
6885
// For now, do it for CM/VC. Will turn it on for all.
6885
6886
doNoMaskWA_postRA ();
@@ -11593,6 +11594,10 @@ void Optimizer::doNoMaskWA()
11593
11594
for (auto II = BB->begin (), IE = BB->end (); II != IE; ++II)
11594
11595
{
11595
11596
G4_INST* I = *II;
11597
+
11598
+ // Mark all instructions as created by preRA to avoid re-processing them postRA
11599
+ I->setCreatedPreRA (true );
11600
+
11596
11601
if (!isCandidateInst (I, fg))
11597
11602
{
11598
11603
continue ;
@@ -11779,21 +11784,48 @@ void Optimizer::doNoMaskWA()
11779
11784
// // scratch space spill: SP_GRF_V77_3 from offset[4x32];
11780
11785
// (W) send.dc0 (16|M0) null r0 r4 0x80 0x020F1004
11781
11786
//
11787
+ // For flag spill:
11788
+ // Need WA as well due to the following case:
11789
+ //
11790
+ // After RA:
11791
+ // BB_19:
11792
+ // (W) mov (1|M0) r34.8<1>:uw f0.1<0;1,0>:uw
11793
+ // ...
11794
+ // BB_21:
11795
+ // (W) mov (1|M0) f1.1<1>:uw r34.8<0;1,0>:uw
11796
+ //
11797
+ // If BB_19 should be skipped but runs due to this HW bug, r34.8 will be updated
11798
+ // with f0.1, which holds an undefined value. And at BB_21, reading from r34.8 will
11799
+ // get a garbage value!
11800
+ //
11782
11801
// Note this works only for NoMaskWA=2
11783
11802
//
11784
11803
void Optimizer::doNoMaskWA_postRA ()
11785
11804
{
11786
11805
std::vector<INST_LIST_ITER> NoMaskCandidates;
11787
11806
G4_ExecSize simdsize = fg.getKernel ()->getSimdSize ();
11807
+ const bool HasFlagSpill = (builder.getJitInfo ()->numFlagSpillStore > 0 );
11808
+
11809
+ auto isCandidate = [&](G4_INST* I) {
11810
+ if (I->getCreatedPreRA () || !I->isWriteEnableInst ())
11811
+ {
11812
+ return false ;
11813
+ }
11788
11814
11789
- auto isCandidate = [](G4_INST* I) {
11790
- if (I->isSend () && I->isWriteEnableInst () &&
11791
- I->getPredicate () == nullptr &&
11815
+ // If it is a global flag spill or a global GRF spill, the WA is needed.
11816
+ // For now, no global check is available, so every matching spill is treated as a candidate.
11817
+
11818
+ // 1. flag spill
11819
+ if (HasFlagSpill &&
11820
+ I->isMov () && I->getSrc (0 ) && I->getSrc (0 )->isFlag () &&
11821
+ I->getExecSize () == g4::SIMD1 && I->getPredicate () == nullptr )
11822
+ {
11823
+ return true ;
11824
+ }
11825
+ // 2. GRF spill
11826
+ if (I->isSend () && I->getPredicate () == nullptr &&
11792
11827
(I->getDst () == nullptr || I->getDst ()->isNullReg ()))
11793
11828
{
11794
- // This shall be a spill (write).
11795
- // May check if the spilled var is global. We only need
11796
- // to do WA for global spill!
11797
11829
return true ;
11798
11830
}
11799
11831
return false ;
@@ -11851,6 +11883,16 @@ void Optimizer::doNoMaskWA_postRA()
11851
11883
// (W & f0.0.any16h) send (16|M0) ...
11852
11884
// 3. (W) mov (1|M0) f0.0<1>:uw DW0:uw // restore
11853
11885
//
11886
+ // For flag spill, the sequence is the same as the above except for the case in which
11887
+ // the WAFlag is the same as the spilled flag. For example,
11888
+ //
11889
+ // (W) mov (1|M0) r34.8<1>:uw f0.0<0;1,0>:uw
11890
+ //
11891
+ // 1. (W) mov (1|M0) DW0:uw f0.0<0;1,0>:uw // save
11892
+ // 2. (W) mov (1|M0) f0.0<1>:uw DW1:uw // WARestore
11893
+ // (W & f0.0.any16h) mov r34.8<1>:uw DW0.0<0;1,0>:uw
11894
+ // 3. (W) mov (1|M0) f0.0<1>:uw DW0:uw // restore
11895
+ //
11854
11896
// Todo: check if save/restore is needed to avoid redundant save/restore.
11855
11897
//
11856
11898
G4_Declare* saveTmp = builder.getEUFusionWATmpVar (); // 2DW;
@@ -11891,10 +11933,15 @@ void Optimizer::doNoMaskWA_postRA()
11891
11933
// Without optimization, always do save/restore
11892
11934
bool needSave = true ;
11893
11935
bool needRestore = true ;
11936
+
11937
+ // WA flag register to use: f<wafregnum>.<wafsregnum> (currently f0.0)
11938
+ uint32_t wafregnum = 0 ;
11939
+ uint32_t wafsregnum = 0 ;
11940
+
11894
11941
G4_Type Ty = (simdsize > 16 ) ? Type_UD : Type_UW;
11895
11942
G4_Declare* flagDcl = builder.createTempFlag ((Ty == Type_UW ? 1 : 2 ), " waflag" );
11896
11943
G4_RegVar* flagVar = flagDcl->getRegVar ();
11897
- flagVar->setPhyReg (builder.phyregpool .getFlagAreg (0 ), 0 );
11944
+ flagVar->setPhyReg (builder.phyregpool .getFlagAreg (wafregnum ), wafsregnum );
11898
11945
11899
11946
// Save flag, create WA mask, save WAflag
11900
11947
createMov1 (BB, WAInsts[0 ], saveVar, saveOff, flagVar, 0 , Ty); // save
@@ -11915,6 +11962,24 @@ void Optimizer::doNoMaskWA_postRA()
11915
11962
G4_INST* I = *currII;
11916
11963
G4_Predicate* newPred = builder.createPredicate (
11917
11964
PredState_Plus, flagVar, 0 , waPredCtrl);
11965
+ if (I->isMov () && I->getSrc (0 ) && I->getSrc (0 )->isFlag ())
11966
+ {
11967
+ G4_SrcRegRegion* srcReg = I->getSrc (0 )->asSrcRegRegion ();
11968
+ G4_RegVar* baseVar = static_cast <G4_RegVar*>(srcReg->getBase ());
11969
+ assert (baseVar->isPhyRegAssigned ());
11970
+
11971
+ // For a flag, G4_Areg holds the flag number and G4_RegVar holds the subreg offset.
11972
+ // (SrcRegRegion's regOff/subRegOff are always 0/0.)
11973
+ G4_Areg* flagReg = baseVar->getPhyReg ()->getAreg ();
11974
+ uint32_t subRegOff = baseVar->getPhyRegOff ();
11975
+ if (flagReg->getFlagNum () == wafregnum &&
11976
+ (Ty == Type_UD /* 32bit flag */ || subRegOff == wafsregnum /* 16bit flag */ ))
11977
+ {
11978
+ G4_SrcRegRegion* S = builder.createSrc (
11979
+ saveVar, 0 , saveOff, builder.getRegionScalar (), Ty);
11980
+ I->setSrc (S, 0 );
11981
+ }
11982
+ }
11918
11983
I->setPredicate (newPred);
11919
11984
11920
11985
if (i == (sz - 1 ) || needRestore) {
0 commit comments