Skip to content

Commit 040b699

Browse files
bcheng0127igcbot
authored andcommitted
mul/mac src1 read suppression WA
Two issues, please check the attached document in the related document. When the mul/mach have same src1 but different region pattern with the src1 of previous instruction, a dummy instruction is need to be inserted to solve the potential read suppression bug.
1 parent 7f23c84 commit 040b699

File tree

7 files changed

+215
-0
lines changed

7 files changed

+215
-0
lines changed

inc/common/sku_wa_defs.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -637,4 +637,10 @@ SPDX-License-Identifier: MIT
637637
Wa_14020375314,
638638
"Workaround",
639639
WA_BUG_TYPE_UNKNOWN,
640+
WA_BUG_PERF_IMPACT_UNKNOWN, WA_COMPONENT_UNKNOWN)
641+
642+
WA_DECLARE(
643+
Wa_18035690555,
644+
"Workaround",
645+
WA_BUG_TYPE_UNKNOWN,
640646
WA_BUG_PERF_IMPACT_UNKNOWN, WA_COMPONENT_UNKNOWN)

visa/G4_IR.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4780,6 +4780,16 @@ static bool regionHasFixedSubreg(const IR_Builder &builder, G4_Operand *opnd,
47804780

47814781
G4_VarBase *base = opnd->getBase();
47824782

4783+
if (base->isAccReg()) {
4784+
if (opnd->isDstRegRegion()) {
4785+
offset = opnd->asDstRegRegion()->getSubRegOff();
4786+
}
4787+
if (opnd->isSrcRegRegion()) {
4788+
offset = opnd->asSrcRegRegion()->getSubRegOff();
4789+
}
4790+
return true;
4791+
}
4792+
47834793
if (base == NULL || !base->isRegVar() ||
47844794
!base->asRegVar()->getDeclare()->useGRF()) {
47854795
return false;
@@ -6086,6 +6096,44 @@ bool G4_SrcRegRegion::checkGRFAlign(const IR_Builder &builder) {
60866096
return GRF_aligned;
60876097
}
60886098

6099+
// Flat reg region: starting bit position of a channel in a register is not
6100+
// shifted between src and dst, or limited shift of bit position e.g. big
6101+
// int mul/mad upper dword src's or when dst-type is smaller than exec-type
6102+
// and the dst is written to upper parts of a exec channel.
6103+
// Return true if src region is flat region
6104+
bool G4_SrcRegRegion::isFlatRegRegion(
6105+
uint8_t exChannelWidth,
6106+
std::function<bool(uint8_t dstStrideInBytes, uint8_t dstSubRegOffInBytes,
6107+
uint8_t srcStrideInBytes, uint8_t srcSubRegOffInBytes,
6108+
uint8_t exChannelWidth)> checkFlatRegRegionFunc) {
6109+
auto dst = inst->getDst();
6110+
uint32_t dstSubRegOff = 0, srcSubRegOff = 0;
6111+
bool dstHasFixedSubregOffset = false;
6112+
if (dst->isNullReg())
6113+
dstHasFixedSubregOffset = true;
6114+
else
6115+
dstHasFixedSubregOffset =
6116+
dst->hasFixedSubregOffset(inst->getBuilder(), dstSubRegOff);
6117+
6118+
bool srcHasFixedSubregOffset = false;
6119+
if (isNullReg())
6120+
srcHasFixedSubregOffset = true;
6121+
else
6122+
srcHasFixedSubregOffset =
6123+
hasFixedSubregOffset(inst->getBuilder(), srcSubRegOff);
6124+
6125+
if (dstHasFixedSubregOffset && srcHasFixedSubregOffset) {
6126+
uint8_t dstStrideInBytes = (uint8_t)dst->getExecTypeSize();
6127+
uint16_t srcStride = 0;
6128+
getRegion()->isSingleStride(inst->getExecSize(), srcStride);
6129+
uint8_t srcStrideInBytes = (uint8_t)(srcStride * getTypeSize());
6130+
return checkFlatRegRegionFunc(dstStrideInBytes, (uint8_t)dstSubRegOff,
6131+
srcStrideInBytes,
6132+
(uint8_t)srcSubRegOff, exChannelWidth);
6133+
}
6134+
return false;
6135+
}
6136+
60896137
//
60906138
// returns true if this SrcRegRegion has a fixed subreg offset (in bytes).
60916139
// This is true only if

visa/G4_Operand.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -619,6 +619,11 @@ class G4_SrcRegRegion final : public G4_Operand {
619619
bool isDirectA0() const { return acc == Direct && base->isA0(); }
620620
bool isDirectAddress() const { return acc == Direct && base->isAddress(); }
621621
bool isScalar() const;
622+
bool isFlatRegRegion(
623+
uint8_t exChannelWidth,
624+
std::function<bool(uint8_t dstStrideInBytes, uint8_t dstSubRegOffInBytes,
625+
uint8_t srcStrideInBytes, uint8_t srcSubRegOffInBytes,
626+
uint8_t exChannelWidth)> checkFlatRegRegionFunc);
622627

623628
unsigned short ExRegNum(bool &) const;
624629
unsigned short ExSubRegNum(bool &);

visa/HWCaps.inc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -873,4 +873,10 @@ bool alwaysAllowGlobalFlagOpt() const {
873873
// behavior for now.
874874
return getPlatform() <= GENX_TGLLP;
875875
}
876+
bool hasMulMacRSIssue() const {
877+
if (getOption(vISA_hasMulMacRSIssue))
878+
return true;
879+
880+
return VISA_WA_CHECK(getPWaTable(), Wa_18035690555);
881+
}
876882
// end HW capabilities

visa/Optimizer.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ class Optimizer {
239239
void linePlaneWA(G4_INST *inst);
240240
void fixSendSrcRegion(G4_INST *inst);
241241
void clearARFDependencies();
242+
void mulMacRSWA();
242243
void clearSendDependencies();
243244
void loadThreadPayload();
244245
void addFFIDProlog();
@@ -284,6 +285,8 @@ class Optimizer {
284285
// In both case, the dst of inst_with_ip is used to save IP.
285286
void replaceIPWithCall(InstListType &insts, G4_INST *inst_with_ip);
286287

288+
void insertDummyAdd(G4_BB *bb, INST_LIST_ITER inst_it, int imm = 0);
289+
287290
void insertDummyMad(G4_BB *bb, INST_LIST_ITER inst_it);
288291

289292
void insertDummyCsel(G4_BB *bb, INST_LIST_ITER inst_it, bool newBB);

visa/SWWA.cpp

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,19 @@ void Optimizer::insertDummyCompactInst() {
5353
bb->push_back(movInst);
5454
}
5555

56+
// add (1|M0) null<1>:uw null<0;1,0>:uw 0x0:uw
57+
void Optimizer::insertDummyAdd(G4_BB *bb, INST_LIST_ITER inst_it, int imm) {
58+
// Dst
59+
auto nullDst = builder.createNullDst(Type_UW);
60+
auto nullSrc0 = builder.createNullSrc(Type_UW);
61+
auto immSrc1 = builder.createImm(imm, Type_UW);
62+
63+
auto addInst = builder.createBinOp(G4_add, g4::SIMD1, nullDst, nullSrc0,
64+
immSrc1, InstOpt_WriteEnable, false);
65+
66+
bb->insertBefore(inst_it, addInst);
67+
}
68+
5669
// Float and DP share same GRF cache.
5770
// Integer and Math shader same GRF cache.
5871
void Optimizer::insertDummyMad(G4_BB *bb, INST_LIST_ITER inst_it) {
@@ -3504,6 +3517,135 @@ void Optimizer::clearARFDependencies() {
35043517
}
35053518
}
35063519
}
3520+
void Optimizer::mulMacRSWA() {
3521+
auto hasGRFOverlap = [=](G4_Operand *A, G4_Operand *B) {
3522+
if (A->isNullReg() || !A->isGreg())
3523+
return false;
3524+
if (B->isNullReg() || !B->isGreg())
3525+
return false;
3526+
3527+
unsigned LB1 =
3528+
A->getLinearizedStart() / fg.builder->numEltPerGRF<Type_UB>();
3529+
unsigned RB1 = A->getLinearizedEnd() / fg.builder->numEltPerGRF<Type_UB>();
3530+
unsigned LB2 =
3531+
B->getLinearizedStart() / fg.builder->numEltPerGRF<Type_UB>();
3532+
unsigned RB2 = B->getLinearizedEnd() / fg.builder->numEltPerGRF<Type_UB>();
3533+
3534+
return (RB2 >= LB1 && RB1 >= LB2);
3535+
};
3536+
3537+
auto isBothMulClass = [](G4_INST *inst1, G4_INST *inst2) {
3538+
return (inst1->opcode() == G4_mul || inst1->opcode() == G4_mac) &&
3539+
(inst2->opcode() == G4_mul || inst2->opcode() == G4_mac);
3540+
};
3541+
3542+
auto isBothMaclClass = [](G4_INST *inst1, G4_INST *inst2) {
3543+
// In vISA, only G4_mach will be used. IGA will change it G4_macl according
3544+
// to certain conditions.
3545+
return (inst1->opcode() == G4_mach) &&
3546+
(inst2->opcode() == G4_mach);
3547+
};
3548+
3549+
auto checkFlatRegRegionFunc =
3550+
[](uint8_t dstStrideInBytes, uint8_t dstSubRegOffInBytes,
3551+
uint8_t srcStrideInBytes, uint8_t srcSubRegOffInBytes,
3552+
uint8_t exChannelWidth) -> bool {
3553+
return ((dstSubRegOffInBytes == srcSubRegOffInBytes) &&
3554+
(dstStrideInBytes == srcStrideInBytes) &&
3555+
(dstStrideInBytes % exChannelWidth == 0));
3556+
};
3557+
3558+
for (auto bb : fg) {
3559+
G4_INST *prevInst = nullptr;
3560+
INST_LIST_ITER ii = bb->begin();
3561+
3562+
while (ii != bb->end()) {
3563+
G4_INST *inst = *ii;
3564+
3565+
if (inst->getNumSrc() != 2 || inst->tokenHonourInstruction() ||
3566+
inst->isIntrinsic()) {
3567+
prevInst = nullptr;
3568+
ii++;
3569+
continue;
3570+
}
3571+
3572+
if (!prevInst) {
3573+
prevInst = inst;
3574+
ii++;
3575+
continue;
3576+
}
3577+
3578+
uint8_t exChannelWidth = (uint8_t)TypeSize(inst->getExecType());
3579+
3580+
// Issue 1:
3581+
// MUL opcode class = {MUL, MAC}
3582+
// MACL opcode class = {MACL, MACH}
3583+
//
3584+
// Issue is present for MUL opcode class OR MACL opcode class (both
3585+
// prev/current instruction should belong to the same opcode class)
3586+
// 1. prev instructions src1 has REGIONING/SCALAR
3587+
// 2. current instruction src1 is FLAT and shares the same src1 as prev
3588+
//
3589+
// instruction Issue is not present for below cases.
3590+
// 1. prev instruction is FLAT and current instruction has
3591+
// REGIONING/SCALAR
3592+
// 2. prev/current both are FLAT
3593+
// 3. prev/current both has REGIONING/SCALAR
3594+
// 4. One instruction is in MUL opcode class and the other instruction
3595+
// is in MACL opcode class
3596+
if (isBothMulClass(prevInst, inst) || isBothMaclClass(prevInst, inst)) {
3597+
G4_Operand *prevSrc1 = prevInst->getSrc(1);
3598+
G4_Operand *curSrc1 = inst->getSrc(1);
3599+
3600+
if (prevSrc1->isGreg() &&
3601+
prevSrc1->isSrcRegRegion() && curSrc1 != nullptr &&
3602+
curSrc1->isGreg() && curSrc1->isSrcRegRegion()) { // All regions
3603+
3604+
if (!prevSrc1->asSrcRegRegion()->isFlatRegRegion(
3605+
exChannelWidth, checkFlatRegRegionFunc) &&
3606+
curSrc1->asSrcRegRegion()->isFlatRegRegion(
3607+
exChannelWidth, checkFlatRegRegionFunc) &&
3608+
hasGRFOverlap(
3609+
prevSrc1,
3610+
curSrc1)) { // none flat vs flat regions, and overlap
3611+
// WorkAround: Insert dummy instruction that can break src1 RS
3612+
// chain between regioning MUL instruction and FLAT MULK
3613+
// instruction (IMMEDIATE operand can be used for src1 to break
3614+
// the RS chain)
3615+
insertDummyAdd(bb, ii);
3616+
}
3617+
}
3618+
}
3619+
3620+
// Issue 2
3621+
// prev.instruction is non-MUL opcode class instruction AND non-MACL
3622+
// opcode class instruction has(FLAT or Regioning / Scalar) src1 and
3623+
// current Instruction is MACL opcode class
3624+
// instruction AND has FLAT regioning AND shares the same src1 has the
3625+
// prev.instruction,
3626+
if (inst->opcode() == G4_mach) {
3627+
G4_Operand *prevSrc1 = prevInst->getSrc(1);
3628+
G4_Operand *curSrc1 = inst->getSrc(1);
3629+
3630+
if (prevSrc1->isGreg() &&
3631+
prevSrc1->isSrcRegRegion() && curSrc1 != nullptr &&
3632+
curSrc1->isGreg() && curSrc1->isSrcRegRegion()) {
3633+
if (prevInst->opcode() != G4_mach && prevInst->opcode() != G4_mul &&
3634+
prevInst->opcode() != G4_mac) {
3635+
if (curSrc1->asSrcRegRegion()->isFlatRegRegion(
3636+
exChannelWidth, checkFlatRegRegionFunc) &&
3637+
hasGRFOverlap(prevSrc1, curSrc1)) {
3638+
insertDummyAdd(bb, ii, 1);
3639+
}
3640+
}
3641+
}
3642+
}
3643+
3644+
prevInst = inst;
3645+
ii++;
3646+
}
3647+
}
3648+
}
35073649

35083650
// change the send src0 region to be consistent with assembler expectation
35093651
// We do it here instead of HW conformity since they only affect binary encoding
@@ -3778,6 +3920,10 @@ void Optimizer::HWWorkaround() {
37783920
clearSendDependencies();
37793921
}
37803922

3923+
if (builder.hasMulMacRSIssue()) {
3924+
mulMacRSWA();
3925+
}
3926+
37813927
if (builder.needResetA0forVxHA0()) {
37823928
// reset a0 to 0 at the beginning of a shader.
37833929
// The goal of this initialization is to make sure that there is no

visa/include/VISAOptionsDefs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -757,3 +757,4 @@ DEF_VISA_OPTION(vISA_EnableProgrammableOffsetsMessageBitInHeader, ET_BOOL,
757757
NULLSTR, UNUSED, false)
758758
DEF_VISA_OPTION(vISA_staticProfiling, ET_BOOL, "-staticProfiling", UNUSED, true)
759759
DEF_VISA_OPTION(vISA_staticBBProfiling, ET_BOOL, "-staticBBProfiling", UNUSED, false)
760+
DEF_VISA_OPTION(vISA_hasMulMacRSIssue, ET_BOOL, "-hasMulMacRSIssue", UNUSED, false)

0 commit comments

Comments
 (0)