@@ -53,6 +53,19 @@ void Optimizer::insertDummyCompactInst() {
53
53
bb->push_back (movInst);
54
54
}
55
55
56
+ // add (1|M0) null<1>:uw null<0;1,0>:uw 0x0:uw
57
+ void Optimizer::insertDummyAdd (G4_BB *bb, INST_LIST_ITER inst_it, int imm) {
58
+ // Dst
59
+ auto nullDst = builder.createNullDst (Type_UW);
60
+ auto nullSrc0 = builder.createNullSrc (Type_UW);
61
+ auto immSrc1 = builder.createImm (imm, Type_UW);
62
+
63
+ auto addInst = builder.createBinOp (G4_add, g4::SIMD1, nullDst, nullSrc0,
64
+ immSrc1, InstOpt_WriteEnable, false );
65
+
66
+ bb->insertBefore (inst_it, addInst);
67
+ }
68
+
56
69
// Float and DP share same GRF cache.
57
70
// Integer and Math share the same GRF cache.
58
71
void Optimizer::insertDummyMad (G4_BB *bb, INST_LIST_ITER inst_it) {
@@ -3504,6 +3517,135 @@ void Optimizer::clearARFDependencies() {
3504
3517
}
3505
3518
}
3506
3519
}
3520
+ void Optimizer::mulMacRSWA () {
3521
+ auto hasGRFOverlap = [=](G4_Operand *A, G4_Operand *B) {
3522
+ if (A->isNullReg () || !A->isGreg ())
3523
+ return false ;
3524
+ if (B->isNullReg () || !B->isGreg ())
3525
+ return false ;
3526
+
3527
+ unsigned LB1 =
3528
+ A->getLinearizedStart () / fg.builder ->numEltPerGRF <Type_UB>();
3529
+ unsigned RB1 = A->getLinearizedEnd () / fg.builder ->numEltPerGRF <Type_UB>();
3530
+ unsigned LB2 =
3531
+ B->getLinearizedStart () / fg.builder ->numEltPerGRF <Type_UB>();
3532
+ unsigned RB2 = B->getLinearizedEnd () / fg.builder ->numEltPerGRF <Type_UB>();
3533
+
3534
+ return (RB2 >= LB1 && RB1 >= LB2);
3535
+ };
3536
+
3537
+ auto isBothMulClass = [](G4_INST *inst1, G4_INST *inst2) {
3538
+ return (inst1->opcode () == G4_mul || inst1->opcode () == G4_mac) &&
3539
+ (inst2->opcode () == G4_mul || inst2->opcode () == G4_mac);
3540
+ };
3541
+
3542
+ auto isBothMaclClass = [](G4_INST *inst1, G4_INST *inst2) {
3543
+ // In vISA, only G4_mach will be used. IGA will change it G4_macl according
3544
+ // to certain conditions.
3545
+ return (inst1->opcode () == G4_mach) &&
3546
+ (inst2->opcode () == G4_mach);
3547
+ };
3548
+
3549
+ auto checkFlatRegRegionFunc =
3550
+ [](uint8_t dstStrideInBytes, uint8_t dstSubRegOffInBytes,
3551
+ uint8_t srcStrideInBytes, uint8_t srcSubRegOffInBytes,
3552
+ uint8_t exChannelWidth) -> bool {
3553
+ return ((dstSubRegOffInBytes == srcSubRegOffInBytes) &&
3554
+ (dstStrideInBytes == srcStrideInBytes) &&
3555
+ (dstStrideInBytes % exChannelWidth == 0 ));
3556
+ };
3557
+
3558
+ for (auto bb : fg) {
3559
+ G4_INST *prevInst = nullptr ;
3560
+ INST_LIST_ITER ii = bb->begin ();
3561
+
3562
+ while (ii != bb->end ()) {
3563
+ G4_INST *inst = *ii;
3564
+
3565
+ if (inst->getNumSrc () != 2 || inst->tokenHonourInstruction () ||
3566
+ inst->isIntrinsic ()) {
3567
+ prevInst = nullptr ;
3568
+ ii++;
3569
+ continue ;
3570
+ }
3571
+
3572
+ if (!prevInst) {
3573
+ prevInst = inst;
3574
+ ii++;
3575
+ continue ;
3576
+ }
3577
+
3578
+ uint8_t exChannelWidth = (uint8_t )TypeSize (inst->getExecType ());
3579
+
3580
+ // Issue 1:
3581
+ // MUL opcode class = {MUL, MAC}
3582
+ // MACL opcode class = {MACL, MACH}
3583
+ //
3584
+ // Issue is present for MUL opcode class OR MACL opcode class (both
3585
+ // prev/current instruction should belong to the same opcode class)
3586
+ // 1. prev instructions src1 has REGIONING/SCALAR
3587
+ // 2. current instruction src1 is FLAT and shares the same src1 as prev
3588
+ //
3589
+ // instruction Issue is not present for below cases.
3590
+ // 1. prev instruction is FLAT and current instruction has
3591
+ // REGIONING/SCALAR
3592
+ // 2. prev/current both are FLAT
3593
+ // 3. prev/current both has REGIONING/SCALAR
3594
+ // 4. One instruction is in MUL opcode class and the other instruction
3595
+ // is in MACL opcode class
3596
+ if (isBothMulClass (prevInst, inst) || isBothMaclClass (prevInst, inst)) {
3597
+ G4_Operand *prevSrc1 = prevInst->getSrc (1 );
3598
+ G4_Operand *curSrc1 = inst->getSrc (1 );
3599
+
3600
+ if (prevSrc1->isGreg () &&
3601
+ prevSrc1->isSrcRegRegion () && curSrc1 != nullptr &&
3602
+ curSrc1->isGreg () && curSrc1->isSrcRegRegion ()) { // All regions
3603
+
3604
+ if (!prevSrc1->asSrcRegRegion ()->isFlatRegRegion (
3605
+ exChannelWidth, checkFlatRegRegionFunc) &&
3606
+ curSrc1->asSrcRegRegion ()->isFlatRegRegion (
3607
+ exChannelWidth, checkFlatRegRegionFunc) &&
3608
+ hasGRFOverlap (
3609
+ prevSrc1,
3610
+ curSrc1)) { // none flat vs flat regions, and overlap
3611
+ // WorkAround: Insert dummy instruction that can break src1 RS
3612
+ // chain between regioning MUL instruction and FLAT MULK
3613
+ // instruction (IMMEDIATE operand can be used for src1 to break
3614
+ // the RS chain)
3615
+ insertDummyAdd (bb, ii);
3616
+ }
3617
+ }
3618
+ }
3619
+
3620
+ // Issue 2
3621
+ // prev.instruction is non-MUL opcode class instruction AND non-MACL
3622
+ // opcode class instruction has(FLAT or Regioning / Scalar) src1 and
3623
+ // current Instruction is MACL opcode class
3624
+ // instruction AND has FLAT regioning AND shares the same src1 has the
3625
+ // prev.instruction,
3626
+ if (inst->opcode () == G4_mach) {
3627
+ G4_Operand *prevSrc1 = prevInst->getSrc (1 );
3628
+ G4_Operand *curSrc1 = inst->getSrc (1 );
3629
+
3630
+ if (prevSrc1->isGreg () &&
3631
+ prevSrc1->isSrcRegRegion () && curSrc1 != nullptr &&
3632
+ curSrc1->isGreg () && curSrc1->isSrcRegRegion ()) {
3633
+ if (prevInst->opcode () != G4_mach && prevInst->opcode () != G4_mul &&
3634
+ prevInst->opcode () != G4_mac) {
3635
+ if (curSrc1->asSrcRegRegion ()->isFlatRegRegion (
3636
+ exChannelWidth, checkFlatRegRegionFunc) &&
3637
+ hasGRFOverlap (prevSrc1, curSrc1)) {
3638
+ insertDummyAdd (bb, ii, 1 );
3639
+ }
3640
+ }
3641
+ }
3642
+ }
3643
+
3644
+ prevInst = inst;
3645
+ ii++;
3646
+ }
3647
+ }
3648
+ }
3507
3649
3508
3650
// change the send src0 region to be consistent with assembler expectation
3509
3651
// We do it here instead of HW conformity since they only affect binary encoding
@@ -3778,6 +3920,10 @@ void Optimizer::HWWorkaround() {
3778
3920
clearSendDependencies ();
3779
3921
}
3780
3922
3923
+ if (builder.hasMulMacRSIssue ()) {
3924
+ mulMacRSWA ();
3925
+ }
3926
+
3781
3927
if (builder.needResetA0forVxHA0 ()) {
3782
3928
// reset a0 to 0 at the beginning of a shader.
3783
3929
// The goal of this initialization is to make sure that there is no
0 commit comments