@@ -10581,7 +10581,7 @@ void Optimizer::doNoMaskWA()
10581
10581
// flagVar : emask for this BB.
10582
10582
// Note that if 32-bit flag is used, flagVar and this instruction I's condMod
10583
10583
// take two flag registers, leaving no flag for temporary. In this case, we
10584
- // will do manual spill, ie, save and restore the original flag (case 1 and 3).
10584
+ // will do manual spill, i.e., save and restore the original flag (case 1.2 and 3).
10585
10585
//
10586
10586
// Before:
10587
10587
// I: (W) cmp (16|M16) (ne)P D .... // 32-bit flag
@@ -10590,11 +10590,15 @@ void Optimizer::doNoMaskWA()
10590
10590
//
10591
10591
// After:
10592
10592
// (1) D = null (common)
10593
- // I0: (W) mov (1|M0) save:ud P<0;1,0>:ud
10594
- // I: (W) cmp (16|M16) (ne)P ....
10595
- // I1: (W&-flagVar) mov (1|M0) P save:ud
10593
+ // 1.1) Not SIMD32 and P is a 16-bit modifier (less chance of a flag spill)
10594
+ // I: (W) cmp (16|M0) (ne)nP ....
10595
+ // I0: (W&flagVar) mov (1|M0) P nP
10596
+ // 1.2) General case (save flag into GRF to avoid flag spill)
10597
+ // I0: (W) mov (1|M0) save:ud P<0;1,0>:ud
10598
+ // I: (W) cmp (16|M16) (ne)P ....
10599
+ // I1: (W&-flagVar) mov (1|M0) P save:ud
10596
10600
// (2) 'I' uses 16-bit flag (common)
10597
- // I0: (W) mov (1) nP<1>:uw flagVar.0 <0;1,0>:uw
10601
+ // I0: (W) mov (1) nP<1>:uw flagVar<0;1,0>:uw
10598
10602
// I: (W&nP) cmp (16|M0) (ne)nP ....
10599
10603
// I1: (W&flagVar) mov (1|M0) P<1>:uw nP<0;1,0>:uw
10600
10604
// (3) otherwise(less common)
@@ -10625,34 +10629,64 @@ void Optimizer::doNoMaskWA()
10625
10629
G4_Type Ty = (modDcl->getWordSize () > 1 ) ? Type_UD : Type_UW;
10626
10630
if (I->hasNULLDst ())
10627
10631
{ // case 1
10628
- G4_Declare* saveDecl = builder.createTempVar (1 , Ty, Any, " saveTmp" );
10629
- G4_RegVar* saveVar = saveDecl->getRegVar ();
10630
- G4_SrcRegRegion* I0S0 = builder.createSrc (
10631
- modDcl->getRegVar (),
10632
- 0 , 0 , builder.getRegionScalar (), Ty);
10633
- G4_DstRegRegion* D0 = builder.createDst (saveVar, 0 , 0 , 1 , Ty);
10634
- G4_INST* I0 = builder.createMov (g4::SIMD1, D0, I0S0, InstOpt_WriteEnable, false );
10635
- currBB->insertBefore (currII, I0);
10632
+ if (flagVar->getDeclare ()->getTotalElems () == 1 && Ty == Type_UW)
10633
+ { // case 1.1
10634
+ assert (I->getExecSize () != g4::SIMD32);
10636
10635
10637
- auto nextII = currII;
10638
- ++nextII;
10639
- G4_SrcRegRegion* I1S0 = builder.createSrc (saveVar,
10640
- 0 , 0 , builder.getRegionScalar (), Ty);
10641
- G4_DstRegRegion* D1 = builder.createDst (
10642
- modDcl->getRegVar (), 0 , 0 , 1 , Ty);
10643
- G4_INST* I1 = builder.createMov (g4::SIMD1, D1, I1S0, InstOpt_WriteEnable, false );
10644
- G4_Predicate* flag = builder.createPredicate (
10645
- PredState_Minus, flagVar, 0 , getPredCtrl (useAnyh));
10646
- I1->setPredicate (flag);
10647
- currBB->insertBefore (nextII, I1);
10636
+ // Use 16-bit flag
10637
+ G4_Declare* nPDecl = builder.createTempFlag (1 , " nP" );
10638
+ G4_RegVar* nPVar = nPDecl->getRegVar ();
10648
10639
10649
- flagVarDefInst-> addDefUse (I1, Opnd_pred );
10650
- I0-> addDefUse (I1, Opnd_src0 );
10640
+ G4_CondMod* nM = builder. createCondMod (P-> getMod (), nPVar, 0 );
10641
+ I-> setCondMod (nM );
10651
10642
10652
- if (!condModGlb)
10653
- {
10654
- // Copy condMod uses to I1.
10655
- I->copyUsesTo (I1, false );
10643
+ auto nextII = currII;
10644
+ ++nextII;
10645
+
10646
+ G4_SrcRegRegion* I0S0 = builder.createSrc (nPVar,
10647
+ 0 , 0 , builder.getRegionScalar (), Ty);
10648
+ G4_DstRegRegion* I0D0 = builder.createDst (
10649
+ modDcl->getRegVar (), 0 , 0 , 1 , Ty);
10650
+ G4_INST* I0 = builder.createMov (g4::SIMD1, I0D0, I0S0, InstOpt_WriteEnable, false );
10651
+ G4_Predicate* flag = builder.createPredicate (
10652
+ PredState_Plus, flagVar, 0 , getPredCtrl (useAnyh));
10653
+ I0->setPredicate (flag);
10654
+ currBB->insertBefore (nextII, I0);
10655
+
10656
+ flagVarDefInst->addDefUse (I0, Opnd_pred);
10657
+ I->addDefUse (I0, Opnd_src0);
10658
+ }
10659
+ else
10660
+ { // case 1.2
10661
+ G4_Declare* saveDecl = builder.createTempVar (1 , Ty, Any, " saveTmp" );
10662
+ G4_RegVar* saveVar = saveDecl->getRegVar ();
10663
+ G4_SrcRegRegion* I0S0 = builder.createSrc (
10664
+ modDcl->getRegVar (),
10665
+ 0 , 0 , builder.getRegionScalar (), Ty);
10666
+ G4_DstRegRegion* D0 = builder.createDst (saveVar, 0 , 0 , 1 , Ty);
10667
+ G4_INST* I0 = builder.createMov (g4::SIMD1, D0, I0S0, InstOpt_WriteEnable, false );
10668
+ currBB->insertBefore (currII, I0);
10669
+
10670
+ auto nextII = currII;
10671
+ ++nextII;
10672
+ G4_SrcRegRegion* I1S0 = builder.createSrc (saveVar,
10673
+ 0 , 0 , builder.getRegionScalar (), Ty);
10674
+ G4_DstRegRegion* D1 = builder.createDst (
10675
+ modDcl->getRegVar (), 0 , 0 , 1 , Ty);
10676
+ G4_INST* I1 = builder.createMov (g4::SIMD1, D1, I1S0, InstOpt_WriteEnable, false );
10677
+ G4_Predicate* flag = builder.createPredicate (
10678
+ PredState_Minus, flagVar, 0 , getPredCtrl (useAnyh));
10679
+ I1->setPredicate (flag);
10680
+ currBB->insertBefore (nextII, I1);
10681
+
10682
+ flagVarDefInst->addDefUse (I1, Opnd_pred);
10683
+ I0->addDefUse (I1, Opnd_src0);
10684
+
10685
+ if (!condModGlb)
10686
+ {
10687
+ // Copy condMod uses to I1.
10688
+ I->copyUsesTo (I1, false );
10689
+ }
10656
10690
}
10657
10691
return ;
10658
10692
}
0 commit comments