Skip to content

Commit a74490c

Browse files
jgu222sys_zuul
authored and committed
Since the default uses cmp, not ce, to generate the emask, using anyh
could save one inst. This change adds code to use anyh, but it is off by default. Change-Id: I17dd7dc73b293e0d223d44fb3f661f37abfa9219
1 parent 4289859 commit a74490c

File tree

1 file changed

+69
-13
lines changed

1 file changed

+69
-13
lines changed

visa/Optimizer.cpp

Lines changed: 69 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11874,6 +11874,37 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1187411874
std::vector<INST_LIST_ITER> NoMaskCandidates;
1187511875
uint32_t simdsize = fg.getKernel()->getSimdSize();
1187611876

11877+
// When using cmp to generate emask, the default is to create
11878+
// all-one flag so it applies to all execution sizes and quarter
11879+
// control combinations. Doing so needs 3 instructions for each BB.
11880+
// On the other hand, if anyh can be used, 2 insts would be needed
11881+
// so that we save 1 inst for each BB. The condition that anyh
11882+
// can be used is that M0 is used for all NoMask insts that need
11883+
// WA and that their execsize is no larger than simdsize.
11884+
bool enableAnyh = false;
11885+
bool useAnyh = false; // default use all-one flag.
11886+
if ((builder.getuint32Option(vISA_noMaskWA) & 0x10) != 0)
11887+
{
11888+
enableAnyh = true;
11889+
}
11890+
if ((builder.getuint32Option(vISA_noMaskWA) & 0x8) == 0 ||
11891+
(builder.getuint32Option(vISA_noMaskWA) & 0x4) == 0)
11892+
{
11893+
// When reading ce or doing WA per inst, do not use anyh
11894+
enableAnyh = false;
11895+
}
11896+
11897+
11898+
auto getPredCtrl = [&](bool isUseAnyh) -> G4_Predicate_Control
11899+
{
11900+
if (isUseAnyh)
11901+
{
11902+
return simdsize == 8 ? PRED_ANY8H
11903+
: (simdsize == 16 ? PRED_ANY16H : PRED_ANY32H);
11904+
}
11905+
return PRED_DEFAULT;
11906+
};
11907+
1187711908
// Return condMod if a flag register is used. Since sel
1187811909
// does not update flag register, return null for sel.
1187911910
auto getFlagModifier = [](G4_INST* I) -> G4_CondMod* {
@@ -11949,6 +11980,12 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1194911980
NULL, G4_cmp, flagCM, false, simdsize, nullDst, r0_0, r0_1, InstOpt_M0);
1195011981
BB->insert(II, I1);
1195111982

11983+
if (useAnyh)
11984+
{
11985+
flagDefInst = I1;
11986+
return flagVar;
11987+
}
11988+
1195211989
G4_Imm* allone = builder.createImm(0xFFFFFFFF, Ty);
1195311990
G4_DstRegRegion* tFlag = builder.createDst(flagVar, 0, 0, 1, Ty);
1195411991
flagDefInst = builder.createMov(1, tFlag, allone, InstOpt_WriteEnable, false);
@@ -12107,7 +12144,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1210712144
}
1210812145
G4_DstRegRegion* tDst = builder.createDst(tPVar, 0, 0, 1, Ty);
1210912146
G4_Predicate* flag0 = builder.createPredicate(
12110-
PredState_Plus, flagVar, 0, PRED_DEFAULT);
12147+
PredState_Plus, flagVar, 0, getPredCtrl(useAnyh));
1211112148
G4_INST* I0 = builder.createInternalInst(
1211212149
flag0, G4_sel, nullptr, false,
1211312150
1, tDst, Src0, Src1, InstOpt_WriteEnable);
@@ -12177,7 +12214,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1217712214
G4_INST* I0 = builder.createMov(
1217812215
I->getExecSize(), dst, tSrc, InstOpt_WriteEnable, false);
1217912216
G4_Predicate* flag0 = builder.createPredicate(
12180-
PredState_Plus, flagVar, 0, PRED_DEFAULT);
12217+
PredState_Plus, flagVar, 0, getPredCtrl(useAnyh));
1218112218
I0->setPredicate(flag0);
1218212219
flag0->setSameAsNoMask(true);
1218312220

@@ -12257,7 +12294,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1225712294
modDcl->getRegVar(), 0, 0, 1, Ty);
1225812295
G4_INST* I1 = builder.createMov(1, D1, I1S0, InstOpt_WriteEnable, false);
1225912296
G4_Predicate* flag = builder.createPredicate(
12260-
PredState_Minus, flagVar, 0, PRED_DEFAULT);
12297+
PredState_Minus, flagVar, 0, getPredCtrl(useAnyh));
1226112298
I1->setPredicate(flag);
1226212299
currBB->insert(nextII, I1);
1226312300

@@ -12284,10 +12321,12 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1228412321
currBB->insert(currII, I0);
1228512322

1228612323
// Use the new flag
12324+
// Note that if useAnyh is true, nP should use anyh
1228712325
G4_Predicate* nP = builder.createPredicate(
12288-
PredState_Plus, nPVar, 0, PRED_DEFAULT);
12326+
PredState_Plus, nPVar, 0, getPredCtrl(useAnyh));
1228912327
G4_CondMod* nM = builder.createCondMod(P->getMod(), nPVar, 0);
1229012328
I->setPredicate(nP);
12329+
nP->setSameAsNoMask(true);
1229112330
I->setCondMod(nM);
1229212331

1229312332
G4_SrcRegRegion* I1S0 = builder.createSrcRegRegion(
@@ -12296,7 +12335,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1229612335
G4_DstRegRegion* D1 = builder.createDst(
1229712336
modDcl->getRegVar(), 0, 0, 1, Ty);
1229812337
G4_Predicate* flag1 = builder.createPredicate(
12299-
PredState_Plus, flagVar, 0, PRED_DEFAULT);
12338+
PredState_Plus, flagVar, 0, getPredCtrl(useAnyh));
1230012339
G4_INST* I1 = builder.createMov(1, D1, I1S0, InstOpt_WriteEnable, false);
1230112340
I1->setPredicate(flag1);
1230212341
flag1->setSameAsNoMask(true);
@@ -12354,7 +12393,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1235412393
modDcl->getRegVar(), 0, 0, 1, Ty);
1235512394
G4_INST* I2 = builder.createMov(1, D2, I2S0, InstOpt_WriteEnable, false);
1235612395
G4_Predicate* flag2 = builder.createPredicate(
12357-
PredState_Minus, flagVar, 0, PRED_DEFAULT);
12396+
PredState_Minus, flagVar, 0, getPredCtrl(useAnyh));
1235812397
I2->setPredicate(flag2);
1235912398
currBB->insert(nextII, I2);
1236012399

@@ -12422,7 +12461,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1242212461
}
1242312462
G4_INST* I1 = builder.createMov(1, D1, immS0, InstOpt_WriteEnable, false);
1242412463
G4_Predicate* flag1 = builder.createPredicate(
12425-
PredState_Minus, flagVar, 0, PRED_DEFAULT);
12464+
PredState_Minus, flagVar, 0, getPredCtrl(useAnyh));
1242612465
I1->setPredicate(flag1);
1242712466
currBB->insert(currII, I1);
1242812467

@@ -12437,7 +12476,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1243712476
modDcl->getRegVar(), 0, 0, 1, Ty);
1243812477
G4_INST* I2 = builder.createMov(1, D2, I2S0, InstOpt_WriteEnable, false);
1243912478
G4_Predicate* flag2 = builder.createPredicate(
12440-
PredState_Minus, flagVar, 0, PRED_DEFAULT);
12479+
PredState_Minus, flagVar, 0, getPredCtrl(useAnyh));
1244112480
I2->setPredicate(flag2);
1244212481
currBB->insert(nextII, I2);
1244312482

@@ -12488,7 +12527,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1248812527
{
1248912528
// case 1: no predicate, no flagModifier (common case)
1249012529
G4_Predicate* newPred = builder.createPredicate(
12491-
PredState_Plus, flagVar, 0, PRED_DEFAULT);
12530+
PredState_Plus, flagVar, 0, getPredCtrl(useAnyh));
1249212531
newPred->setSameAsNoMask(true);
1249312532
I->setPredicate(newPred);
1249412533

@@ -12523,17 +12562,27 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1252312562
//
1252412563
// BB:
1252512564
// ......
12526-
// (W) and (16|M0) (nz)f0.0 null<1>:ud ce0.0:ud dmask:ud
12527-
// ** If any nomask inst uses bits beyond 16, need the following too **
12528-
// (W) mov (2|M0) f0.0<1>:uw f0.0<0;1,0>:uw
12565+
// if (using ce0)
12566+
// (W) and (16|M0) (nz)f0.0 null<1>:ud ce0.0:ud dmask:ud
12567+
// ** If any nomask inst uses bits beyond 16, need the following too **
12568+
// (W) mov (2|M0) f0.0<1>:uw f0.0<0;1,0>:uw
12569+
// else // using cmp
12570+
// (W) mov (1|M0) f0.0:uw 0:uw // or ud
12571+
// cmp (16|M0) (eq)f0.0 null:uw r0.0<0;1,0>:uw r0.0<0;1,0>:uw
12572+
// if (!useAnyh)
12573+
// (W&f0.0.any16h) mov (1|M0) f0.0 0xFFFF:uw
1252912574
//
1253012575
// (W&f0.0) inst0
1253112576
// ......
1253212577
// (W&f0.0) inst1
1253312578
//
12579+
// If useAnyh, the predicate would be (W & f0.0.any16h). The new flag,
12580+
// either f0.0 (all-one) or f0.0.any16h (useAnyh) is equivalent to noMask.
1253412581

1253512582
// 1. Collect all candidates and check if 32 bit flag is needed
12583+
// and if useAnyh can be set to true.
1253612584
bool need32BitFlag = false;
12585+
useAnyh = enableAnyh; // need to reset for each BB
1253712586
for (auto II = BB->begin(), IE = BB->end(); II != IE; ++II)
1253812587
{
1253912588
G4_INST* I = *II;
@@ -12544,6 +12593,13 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1254412593
{
1254512594
need32BitFlag = true;
1254612595
}
12596+
if (enableAnyh)
12597+
{
12598+
if (I->getExecSize() > simdsize || I->getMaskOffset() != 0)
12599+
{
12600+
useAnyh = false;
12601+
}
12602+
}
1254712603
}
1254812604
}
1254912605
if (NoMaskCandidates.empty())
@@ -12570,7 +12626,7 @@ void Optimizer::replaceNoMaskWithAnyhWA()
1257012626
{
1257112627
// case 1: no predicate, no flagModifier (common case)
1257212628
G4_Predicate* newPred = builder.createPredicate(
12573-
PredState_Plus, flagVarForBB, 0, PRED_DEFAULT);
12629+
PredState_Plus, flagVarForBB, 0, getPredCtrl(useAnyh));
1257412630
newPred->setSameAsNoMask(true);
1257512631
I->setPredicate(newPred);
1257612632

0 commit comments

Comments
 (0)