@@ -11874,6 +11874,37 @@ void Optimizer::replaceNoMaskWithAnyhWA()
11874
11874
std::vector<INST_LIST_ITER> NoMaskCandidates;
11875
11875
uint32_t simdsize = fg.getKernel ()->getSimdSize ();
11876
11876
11877
+ // When using cmp to generate emask, the default is to create
11878
+ // all-one flag so it applies to all execution size and quarter
11879
+ // control combinations. Doing so needs 3 instructions for each BB.
11880
+ // On the other hand, if anyh can be used, 2 insts would be needed
11881
+ // so that we save 1 inst for each BB. The condition that anyh
11882
+ // can be used is that M0 is used for all NoMask insts that need
11883
+ // WA and each such inst's execsize is no larger than simdsize.
11884
+ bool enableAnyh = false ;
11885
+ bool useAnyh = false ; // default use all-one flag.
11886
+ if ((builder.getuint32Option (vISA_noMaskWA) & 0x10 ) != 0 )
11887
+ {
11888
+ enableAnyh = true ;
11889
+ }
11890
+ if ((builder.getuint32Option (vISA_noMaskWA) & 0x8 ) == 0 ||
11891
+ (builder.getuint32Option (vISA_noMaskWA) & 0x4 ) == 0 )
11892
+ {
11893
+ // When reading ce or doing WA per inst, do not use anyh
11894
+ enableAnyh = false ;
11895
+ }
11896
+
11897
+
11898
+ auto getPredCtrl = [&](bool isUseAnyh) -> G4_Predicate_Control // Selects the predicate control used when building WA predicates.
11899
+ {
11900
+ if (isUseAnyh)
11901
+ {
11902
+ return simdsize == 8 ? PRED_ANY8H // anyh variant is picked from the captured kernel simdsize
11903
+ : (simdsize == 16 ? PRED_ANY16H : PRED_ANY32H); // any simdsize other than 8 or 16 yields PRED_ANY32H
11904
+ }
11905
+ return PRED_DEFAULT; // anyh not requested: keep the plain per-channel predicate
11906
+ };
11907
+
11877
11908
// Return condMod if a flag register is used. Since sel
11878
11909
// does not update flag register, return null for sel.
11879
11910
auto getFlagModifier = [](G4_INST* I) -> G4_CondMod* {
@@ -11949,6 +11980,12 @@ void Optimizer::replaceNoMaskWithAnyhWA()
11949
11980
NULL , G4_cmp, flagCM, false , simdsize, nullDst, r0_0, r0_1, InstOpt_M0);
11950
11981
BB->insert (II, I1);
11951
11982
11983
+ if (useAnyh)
11984
+ {
11985
+ flagDefInst = I1;
11986
+ return flagVar;
11987
+ }
11988
+
11952
11989
G4_Imm* allone = builder.createImm (0xFFFFFFFF , Ty);
11953
11990
G4_DstRegRegion* tFlag = builder.createDst (flagVar, 0 , 0 , 1 , Ty);
11954
11991
flagDefInst = builder.createMov (1 , tFlag, allone, InstOpt_WriteEnable, false );
@@ -12107,7 +12144,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
12107
12144
}
12108
12145
G4_DstRegRegion* tDst = builder.createDst (tPVar, 0 , 0 , 1 , Ty);
12109
12146
G4_Predicate* flag0 = builder.createPredicate (
12110
- PredState_Plus, flagVar, 0 , PRED_DEFAULT );
12147
+ PredState_Plus, flagVar, 0 , getPredCtrl (useAnyh) );
12111
12148
G4_INST* I0 = builder.createInternalInst (
12112
12149
flag0, G4_sel, nullptr , false ,
12113
12150
1 , tDst, Src0, Src1, InstOpt_WriteEnable);
@@ -12177,7 +12214,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
12177
12214
G4_INST* I0 = builder.createMov (
12178
12215
I->getExecSize (), dst, tSrc, InstOpt_WriteEnable, false );
12179
12216
G4_Predicate* flag0 = builder.createPredicate (
12180
- PredState_Plus, flagVar, 0 , PRED_DEFAULT );
12217
+ PredState_Plus, flagVar, 0 , getPredCtrl (useAnyh) );
12181
12218
I0->setPredicate (flag0);
12182
12219
flag0->setSameAsNoMask (true );
12183
12220
@@ -12257,7 +12294,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
12257
12294
modDcl->getRegVar (), 0 , 0 , 1 , Ty);
12258
12295
G4_INST* I1 = builder.createMov (1 , D1, I1S0, InstOpt_WriteEnable, false );
12259
12296
G4_Predicate* flag = builder.createPredicate (
12260
- PredState_Minus, flagVar, 0 , PRED_DEFAULT );
12297
+ PredState_Minus, flagVar, 0 , getPredCtrl (useAnyh) );
12261
12298
I1->setPredicate (flag);
12262
12299
currBB->insert (nextII, I1);
12263
12300
@@ -12284,10 +12321,12 @@ void Optimizer::replaceNoMaskWithAnyhWA()
12284
12321
currBB->insert (currII, I0);
12285
12322
12286
12323
// Use the new flag
12324
+ // Note that if useAnyh is true, nP should use anyh
12287
12325
G4_Predicate* nP = builder.createPredicate (
12288
- PredState_Plus, nPVar, 0 , PRED_DEFAULT );
12326
+ PredState_Plus, nPVar, 0 , getPredCtrl (useAnyh) );
12289
12327
G4_CondMod* nM = builder.createCondMod (P->getMod (), nPVar, 0 );
12290
12328
I->setPredicate (nP);
12329
+ nP->setSameAsNoMask (true );
12291
12330
I->setCondMod (nM);
12292
12331
12293
12332
G4_SrcRegRegion* I1S0 = builder.createSrcRegRegion (
@@ -12296,7 +12335,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
12296
12335
G4_DstRegRegion* D1 = builder.createDst (
12297
12336
modDcl->getRegVar (), 0 , 0 , 1 , Ty);
12298
12337
G4_Predicate* flag1 = builder.createPredicate (
12299
- PredState_Plus, flagVar, 0 , PRED_DEFAULT );
12338
+ PredState_Plus, flagVar, 0 , getPredCtrl (useAnyh) );
12300
12339
G4_INST* I1 = builder.createMov (1 , D1, I1S0, InstOpt_WriteEnable, false );
12301
12340
I1->setPredicate (flag1);
12302
12341
flag1->setSameAsNoMask (true );
@@ -12354,7 +12393,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
12354
12393
modDcl->getRegVar (), 0 , 0 , 1 , Ty);
12355
12394
G4_INST* I2 = builder.createMov (1 , D2, I2S0, InstOpt_WriteEnable, false );
12356
12395
G4_Predicate* flag2 = builder.createPredicate (
12357
- PredState_Minus, flagVar, 0 , PRED_DEFAULT );
12396
+ PredState_Minus, flagVar, 0 , getPredCtrl (useAnyh) );
12358
12397
I2->setPredicate (flag2);
12359
12398
currBB->insert (nextII, I2);
12360
12399
@@ -12422,7 +12461,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
12422
12461
}
12423
12462
G4_INST* I1 = builder.createMov (1 , D1, immS0, InstOpt_WriteEnable, false );
12424
12463
G4_Predicate* flag1 = builder.createPredicate (
12425
- PredState_Minus, flagVar, 0 , PRED_DEFAULT );
12464
+ PredState_Minus, flagVar, 0 , getPredCtrl (useAnyh) );
12426
12465
I1->setPredicate (flag1);
12427
12466
currBB->insert (currII, I1);
12428
12467
@@ -12437,7 +12476,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
12437
12476
modDcl->getRegVar (), 0 , 0 , 1 , Ty);
12438
12477
G4_INST* I2 = builder.createMov (1 , D2, I2S0, InstOpt_WriteEnable, false );
12439
12478
G4_Predicate* flag2 = builder.createPredicate (
12440
- PredState_Minus, flagVar, 0 , PRED_DEFAULT );
12479
+ PredState_Minus, flagVar, 0 , getPredCtrl (useAnyh) );
12441
12480
I2->setPredicate (flag2);
12442
12481
currBB->insert (nextII, I2);
12443
12482
@@ -12488,7 +12527,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
12488
12527
{
12489
12528
// case 1: no predicate, no flagModifier (common case)
12490
12529
G4_Predicate* newPred = builder.createPredicate (
12491
- PredState_Plus, flagVar, 0 , PRED_DEFAULT );
12530
+ PredState_Plus, flagVar, 0 , getPredCtrl (useAnyh) );
12492
12531
newPred->setSameAsNoMask (true );
12493
12532
I->setPredicate (newPred);
12494
12533
@@ -12523,17 +12562,27 @@ void Optimizer::replaceNoMaskWithAnyhWA()
12523
12562
//
12524
12563
// BB:
12525
12564
// ......
12526
- // (W) and (16|M0) (nz)f0.0 null<1>:ud ce0.0:ud dmask:ud
12527
- // ** If any nomask inst uses bits beyond 16, need the following too **
12528
- // (W) mov (2|M0) f0.0<1>:uw f0.0<0;1,0>:uw
12565
+ // if (using ce0)
12566
+ // (W) and (16|M0) (nz)f0.0 null<1>:ud ce0.0:ud dmask:ud
12567
+ // ** If any nomask inst uses bits beyond 16, need the following too **
12568
+ // (W) mov (2|M0) f0.0<1>:uw f0.0<0;1,0>:uw
12569
+ // else // using cmp
12570
+ // (W) mov (1|M0) f0.0:uw 0:uw // or ud
12571
+ // cmp (16|M0) (eq)f0.0 null:uw r0.0<0;1,0>:uw r0.0<0;1,0>:uw
12572
+ // if (!useAnyh)
12573
+ // (W&f0.0.any16h) mov (1|M0) f0.0 0xFFFF:uw
12529
12574
//
12530
12575
// (W&f0.0) inst0
12531
12576
// ......
12532
12577
// (W&f0.0) inst1
12533
12578
//
12579
+ // If useAnyh, the predicate would be (W & f0.0.any16h). The new flag,
12580
+ // either f0.0 (all-one) or f0.0.any16h (useAnyh) is equivalent to noMask.
12534
12581
12535
12582
// 1. Collect all candidates and check if 32 bit flag is needed
12583
+ // and if useAnyh can be set to true.
12536
12584
bool need32BitFlag = false ;
12585
+ useAnyh = enableAnyh; // need to reset for each BB
12537
12586
for (auto II = BB->begin (), IE = BB->end (); II != IE; ++II)
12538
12587
{
12539
12588
G4_INST* I = *II;
@@ -12544,6 +12593,13 @@ void Optimizer::replaceNoMaskWithAnyhWA()
12544
12593
{
12545
12594
need32BitFlag = true ;
12546
12595
}
12596
+ if (enableAnyh)
12597
+ {
12598
+ if (I->getExecSize () > simdsize || I->getMaskOffset () != 0 )
12599
+ {
12600
+ useAnyh = false ;
12601
+ }
12602
+ }
12547
12603
}
12548
12604
}
12549
12605
if (NoMaskCandidates.empty ())
@@ -12570,7 +12626,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
12570
12626
{
12571
12627
// case 1: no predicate, no flagModifier (common case)
12572
12628
G4_Predicate* newPred = builder.createPredicate (
12573
- PredState_Plus, flagVarForBB, 0 , PRED_DEFAULT );
12629
+ PredState_Plus, flagVarForBB, 0 , getPredCtrl (useAnyh) );
12574
12630
newPred->setSameAsNoMask (true );
12575
12631
I->setPredicate (newPred);
12576
12632
0 commit comments