@@ -5509,7 +5509,9 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst)
5509
5509
else
5510
5510
{
5511
5511
m_encoder->Add(pTempVar0, pTempVar0, m_currShader->ImmToVariable(blockWidth, ISA_TYPE_UD));
5512
- dstSubReg = dstSubReg + scale * blockHeight;
5512
+ uint32_t subOffset = maxWidth * scale * blockHeight;
5513
+ subOffset /= getGRFSize();
5514
+ dstSubReg = dstSubReg + subOffset;
5513
5515
}
5514
5516
m_encoder->Push();
5515
5517
@@ -5548,6 +5550,8 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst)
5548
5550
{
5549
5551
dstSubReg = 0;
5550
5552
5553
+ uint32_t srcSubReg = 0;
5554
+
5551
5555
// Join data obtained from pass 0 and pass 1 to make
5552
5556
// xOffset contiguous from 0 to 63 bytes (making SIMD 16)
5553
5557
// mov (8) r20.0<1>:ud r28.0<8;8,1>:ud {Align1, Q1}
@@ -5559,16 +5563,63 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst)
5559
5563
// mov (8) r26.0<1>:ud r31.0<8;8,1>:ud {Align1, Q1}
5560
5564
// mov (8) r27.0<1>:ud r35.0<8;8,1>:ud {Align1, Q2}
5561
5565
5562
- for (uint32_t i = 0; i < blockHeight; i++)
5563
- {
5564
- for (uint32_t pass = 0; pass < numPasses; pass++)
5566
+
5567
+ //For 64 bytes GRF, 32 bytes will be extended to
5568
+ //.....
5569
+ // A0....A1
5570
+ // B0....B1
5571
+ // C0....C1
5572
+ // D0....D1
5573
+ // E0....E1
5574
+ // F0....F1
5575
+ // G0....G1
5576
+ // H0....H1
5577
+ //
5578
+ // r20....A0....B0........r30....A1....B1
5579
+ // r21....C0....D0........r31....C1....D1
5580
+ // r22....E0....F0........r32....E1....F1
5581
+ // r23....G0....H0........r33....G1....H1
5582
+ //
5583
+ // r40<--r20,....r30
5584
+ // r41<--r20.8,r30.8
5585
+ // r42<--r21,....r31
5586
+ // r43<--r21.8,r31.8
5587
+ // r44<--r22,....r32
5588
+ // r45<--r22.8,r32.8
5589
+ // r46<--r23,....r33
5590
+ // r47<--r23.8,r33.8
5591
+ //
5592
+ //mov (8) r40.0<1>:ud r20.0<8;8,1>:ud {Align1, Q1}
5593
+ //mov (8) r40.8<1>:ud r30.0<8;8,1>:ud {Align1, Q1}
5594
+ //mov (8) r41<1>:ud r20.8<8;8,1>:ud {Align1, Q1}
5595
+ //mov (8) r41.8<1>:ud r30.8<8;8,1>:ud {Align1, Q1}
5596
+
5597
+ for (uint32_t i = 0; i < blockHeight; i++) //Height
5598
+ {
5599
+ uint32_t dstSubRegOffset = 0;
5600
+ uint32_t srcSubRegOffset = 0;
5601
+
5602
+ for (uint32_t pass = 0; pass < numPasses; pass++) //Width
5565
5603
{
5566
5604
SIMDMode mode = typeSizeInBytes == 8 && blockWidth != 64 ? SIMDMode::SIMD4 : SIMDMode::SIMD8;
5567
5605
m_encoder->SetSimdSize(mode);
5568
5606
m_encoder->SetNoMask();
5569
- m_encoder->SetSrcSubVar(0, scale * (i + (blockHeight * pass)));
5607
+
5608
+ srcSubReg = (scale * (i + (blockHeight * pass)) * maxWidth) / getGRFSize();
5609
+ srcSubRegOffset = (i * maxWidth) % getGRFSize();
5610
+
5611
+ m_encoder->SetSrcSubVar(0, srcSubReg);
5612
+ m_encoder->SetSrcSubReg(0, srcSubRegOffset / typeSizeInBytes);
5613
+
5570
5614
m_encoder->SetDstSubVar(dstSubReg);
5571
- dstSubReg += scale;
5615
+ m_encoder->SetDstSubReg(dstSubRegOffset / typeSizeInBytes);
5616
+
5617
+ dstSubRegOffset = ((pass + 1) * maxWidth) % getGRFSize();
5618
+ if (dstSubRegOffset == 0)
5619
+ {
5620
+ dstSubReg += scale;
5621
+ }
5622
+
5572
5623
m_encoder->Copy(m_destination, pTempDest);
5573
5624
m_encoder->Push();
5574
5625
}
@@ -5641,8 +5692,10 @@ void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst)
5641
5692
int scale = (blockWidth == 64) ? 2 : 1;
5642
5693
for (pass = 0; pass < numPasses; pass++)
5643
5694
{
5644
- uint32_t srcSubVar = pass * scale;
5695
+ uint32_t srcSubVar = pass * scale * maxWidth / getGRFSize() ;
5645
5696
uint32_t dstSubVar = 0;
5697
+ uint32_t srcSubRegOffset = (pass * maxWidth) % getGRFSize();
5698
+ uint32_t dstSubRegOffset = 0;
5646
5699
5647
5700
CVariable* tempdst = nullptr;
5648
5701
tempdst = m_currShader->GetNewVariable(
@@ -5655,17 +5708,52 @@ void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst)
5655
5708
// mov (8) r23.0<1>:d r16.0<8;8,1>:d {Align1, Q1, Compacted}
5656
5709
// mov (8) r24.0<1>:d r18.0<8;8,1>:d {Align1, Q1, Compacted}
5657
5710
// mov (8) r25.0<1>:d r20.0<8;8,1>:d {Align1, Q1, Compacted}
5711
+
5712
+ //FOR 64 bytes GRF:
5713
+ // A0....A1....A2....A3........r60....r60.8....r61....r61.8
5714
+ // B0....B1....B2....B3........r62....r62.8....r63....r63.8
5715
+ // C0....C1....C2....C3........r64....r64.8....r65....r65.8
5716
+ // D0....D1....D2....D3........r66....r66.8....r67....r67.8
5717
+ // E0....E1....E2....E3........r68....r68.8....r69....r69.8
5718
+ // F0....F1....F2....F3........r70....r70.8....r71....r71.8
5719
+ // G0....G1....G2....G3........r72....r72.8....r73....r73.8
5720
+ // H0....H1....H2....H3........r74....r74.8....r75....r75.8
5721
+ //
5722
+ // block 0
5723
+ // mov (8) r20.0<1>:d r60.0<8;8,1>:d {Align1, Q1, Compacted}
5724
+ // mov (8) r20.8<1>:d r62.0<8;8,1>:d {Align1, Q1, Compacted}
5725
+ // mov (8) r21.0<1>:d r64.0<8;8,1>:d {Align1, Q1, Compacted}
5726
+ // mov (8) r21.8<1>:d r66.0<8;8,1>:d {Align1, Q1, Compacted}
5727
+ // ...
5728
+ //block 1
5729
+ // mov (8) r30.0<1>:d r60.8<8;8,1>:d {Align1, Q1, Compacted}
5730
+ // mov (8) r30.8<1>:d r62.8<8;8,1>:d {Align1, Q1, Compacted}
5731
+ // mov (8) r31.0<1>:d r64.8<8;8,1>:d {Align1, Q1, Compacted}
5732
+ // mov (8) r31.8<1>:d r66.8<8;8,1>:d {Align1, Q1, Compacted}
5733
+ //...
5734
+
5658
5735
if (numPasses > 1)
5659
5736
{
5660
5737
for (uint i = 0; i < nbElements; ++i)
5661
5738
{
5662
5739
SIMDMode mode = (typeSizeInBytes == 8 && blockWidth != 64) ? SIMDMode::SIMD4 : SIMDMode::SIMD8;
5663
5740
m_encoder->SetSimdSize(mode);
5664
5741
m_encoder->SetNoMask();
5742
+
5743
+ //Src
5665
5744
m_encoder->SetSrcSubVar(0, srcSubVar);
5745
+ m_encoder->SetSrcSubReg(0, srcSubRegOffset / typeSizeInBytes);
5746
+ //Dst
5666
5747
m_encoder->SetDstSubVar(dstSubVar);
5667
- dstSubVar += scale;
5668
- srcSubVar = srcSubVar + scale * numPasses;
5748
+ m_encoder->SetDstSubReg(dstSubRegOffset / typeSizeInBytes);
5749
+ //Strides for dst and src
5750
+ dstSubRegOffset = ((i + 1) * maxWidth) % getGRFSize();
5751
+ if (dstSubRegOffset == 0)
5752
+ {
5753
+ dstSubVar += scale;
5754
+ }
5755
+ srcSubVar = srcSubVar + (scale * numPasses * blockWidth / getGRFSize());
5756
+
5669
5757
m_encoder->Copy(tempdst, data);
5670
5758
m_encoder->Push();
5671
5759
}
0 commit comments