@@ -192,8 +192,7 @@ uint EmitPass::DecideInstanceAndSlice(llvm::BasicBlock& blk, SDAG& sdag, bool& s
192
192
if (StoreInst * ST = dyn_cast<StoreInst>(sdag.m_root))
193
193
{
194
194
// Limit to OpenCL so far as it has uniform load/store support.
195
- if (m_currShader->GetShaderType() == ShaderType::OPENCL_SHADER &&
196
- isUniformStoreOCL(ST))
195
+ if (isUniformStoreOCL(ST))
197
196
numInstance = 1;
198
197
slicing = false;
199
198
}
@@ -14338,12 +14337,14 @@ void EmitPass::emitftoi(llvm::GenIntrinsicInst* inst)
14338
14337
// Return true if this store will be emitted as a uniform store
14339
14338
bool EmitPass::isUniformStoreOCL(llvm::StoreInst* SI)
14340
14339
{
14341
- if (!m_currShader->GetIsUniform(SI->getPointerOperand()))
14340
+ if (m_currShader->GetShaderType() != ShaderType::OPENCL_SHADER ||
14341
+ !m_currShader->GetIsUniform(SI->getPointerOperand()))
14342
14342
{
14343
14343
return false;
14344
14344
}
14345
14345
14346
- Type* Ty = SI->getValueOperand()->getType();
14346
+ Value* storeVal = SI->getValueOperand();
14347
+ Type* Ty = storeVal->getType();
14347
14348
VectorType* VTy = dyn_cast<VectorType>(Ty);
14348
14349
uint32_t elts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
14349
14350
Type* eltTy = VTy ? VTy->getElementType() : Ty;
@@ -14354,8 +14355,8 @@ bool EmitPass::isUniformStoreOCL(llvm::StoreInst* SI)
14354
14355
// Note that when elts > 1, VectorProcess makes sure that its element
14355
14356
// size must be 4 or 8. Also, note that if totalBytes = 4, elts must be 1.
14356
14357
bool doUniformStore = (elts == 1 ||
14357
- (m_currShader->GetIsUniform(SI->getValueOperand() ) &&
14358
- (totalBytes == 8 || totalBytes == 12 || totalBytes == 16)));
14358
+ (m_currShader->GetIsUniform(storeVal ) &&
14359
+ (totalBytes == 8 || totalBytes == 12 || totalBytes == 16)));
14359
14360
return doUniformStore;
14360
14361
}
14361
14362
@@ -15685,6 +15686,215 @@ void EmitPass::emitVectorStore(StoreInst* inst, Value* offset, ConstantInt* immO
15685
15686
}
15686
15687
}
15687
15688
15689
+ // prepareAddressForUniform(): for both load and store
15690
+ // prepareDataForUniform(): for store only
15691
+ // Unaligned (less than 4 bytes) uniform load/store. One for address payload,
15692
+ // and the other for data payload.
15693
+ //
15694
+ // Example 1: "store <4xi32> V, <4xi32>* P, align 2"
15695
+ // A new pointer pVar is created with 4 elements.
15696
+ //
15697
+ // add (4|M0_NM) pVar<1>:ud P<0;1,0>:UD 0xC840:UV
15698
+ // send (4|M0_NM) pVar V
15699
+ //
15700
+ // prepareAddressForUniform() : create pVar
15701
+ // prepareDataForUniform() : return V (assuming V can be used directly)
15702
+ //
15703
+ // Example 2: "store <3xi32> V, <3xi32>* P, align 2"
15704
+ // Non-power of 2 vector size is rounded up to the next power of 2.
15705
+ // Additional elements are duplicated with the first vector element.
15706
+
15707
+ // add (4|M0_NM) pVar<1>:ud P<0;1,0>:UD 0x0840:UV
15708
+ // mov (4|M0_NM) vVar<1>:ud V<0;1,0>:ud
15709
+ // mov (2|M0_NM) vVar<1>:ud V<1;1,0>:ud
15710
+ // mov (1|M0_NM) vVar.2<1>:ud V.2<1;1,0>:ud
15711
+ // send (4|M0_NM) vVar pVar
15712
+ //
15713
+ // prepareAddressForUniform() : create pVar
15714
+ // prepareDataForUniform() : return vVar
15715
+ //
15716
+ // This function handles vector size up to 8. It also handles QW element size.
15717
+ // When vector size > 4, it uses 0x76543210, left-shifted by 2 (DW) or 3 (QW)
15718
+ // as an immediate to be added to 'AddrVar' to form a new address var.
15719
+ //
15720
+ // In addition, if 64bit add is not supported, emitAddPair() will be used to
15721
+ // use 32bit add/addc to emulate 64bit add.
15722
+ //
15723
+ // Note that argument 'AddrVar' in prepareAddressForUniform() is uniform, so is its return var.
15724
+ // The argument 'DataVar' in prepareDataForUniform() is uniform, so is its return var.
15725
+ //
15726
+ CVariable* EmitPass::prepareAddressForUniform(
15727
+ CVariable* AddrVar, uint32_t EltBytes, uint32_t NElts, uint32_t ExecSz, e_alignment Align)
15728
+ {
15729
+ IGC_ASSERT(NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
15730
+ if (ExecSz == 1 && AddrVar->IsGRFAligned(Align))
15731
+ {
15732
+ return AddrVar;
15733
+ }
15734
+ bool isA64 = (AddrVar->GetElemSize() == 8);
15735
+ SIMDMode simdmode = lanesToSIMDMode(ExecSz);
15736
+ CVariable* newVar = m_currShader->GetNewVariable(ExecSz, AddrVar->GetType(), Align, true, CName::NONE);
15737
+
15738
+ CVariable* off;
15739
+ uint32_t incImm = (0x76543210 & maskTrailingOnes<uint32_t>(NElts * 4));
15740
+ if ((ExecSz <= 4 && EltBytes == 4) || (ExecSz <= 2 && EltBytes == 8))
15741
+ {
15742
+ // This case needs a single UV immediate
15743
+ incImm = incImm << (EltBytes == 4 ? 2 : 3);
15744
+ off = m_currShader->ImmToVariable(incImm, ISA_TYPE_UV);
15745
+ }
15746
+ else
15747
+ {
15748
+ // Need a temporary var to calculate offsets
15749
+ off = m_currShader->GetNewVariable(ExecSz, ISA_TYPE_UD, EALIGN_DWORD, false, CName::NONE);
15750
+
15751
+ // actualES is the actual execsize used for computing offsets.
15752
+ uint32_t actualES = (uint32_t)PowerOf2Ceil(NElts);
15753
+
15754
+ // incImm is UV type and can be used in execsize <= 8 only. If ExecSz is greater
15755
+ // than the actual number of lanes (for example, 4GRF alignment case), the upper lanes
15756
+ // beyond need to be zero'ed.
15757
+ if (ExecSz > actualES)
15758
+ {
15759
+ // Need to zero the upper lanes.
15760
+ m_encoder->SetNoMask();
15761
+ m_encoder->SetSimdSize(simdmode);
15762
+ m_encoder->Copy(off, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
15763
+ m_encoder->Push();
15764
+ }
15765
+
15766
+ SIMDMode sm = lanesToSIMDMode(actualES);
15767
+ if (incImm > 0 &&
15768
+ ((actualES <= 4 && EltBytes == 4) || (actualES <= 2 && EltBytes == 8)))
15769
+ {
15770
+ // This case needs a single UV immediate
15771
+ incImm = incImm << (EltBytes == 4 ? 2 : 3);
15772
+
15773
+ m_encoder->SetNoMask();
15774
+ m_encoder->SetSimdSize(sm);
15775
+ m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
15776
+ m_encoder->Push();
15777
+ }
15778
+ else if (incImm > 0)
15779
+ {
15780
+ // Need a mov and mul
15781
+ m_encoder->SetNoMask();
15782
+ m_encoder->SetSimdSize(sm);
15783
+ m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
15784
+ m_encoder->Push();
15785
+
15786
+ m_encoder->SetNoMask();
15787
+ m_encoder->SetSimdSize(sm);
15788
+ m_encoder->SetSrcRegion(0, 1, 1, 0);
15789
+ m_encoder->SetSrcRegion(1, 0, 1, 0);
15790
+ m_encoder->Mul(off, off, m_currShader->ImmToVariable(EltBytes, ISA_TYPE_UW));
15791
+ m_encoder->Push();
15792
+ }
15793
+ }
15794
+
15795
+ // May need splitting for A64
15796
+ bool needSplit = (newVar->GetSize() > (2 * (uint32_t)getGRFSize()));
15797
+ if (needSplit)
15798
+ {
15799
+ IGC_ASSERT(!off->IsImmediate());
15800
+ uint32_t bytes1 = (ExecSz / 2) * newVar->GetElemSize();
15801
+ uint32_t bytes2 = (ExecSz / 2) * off->GetElemSize();
15802
+ CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, ExecSz / 2);
15803
+ CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes1, ExecSz / 2);
15804
+ CVariable* offHi = m_currShader->GetNewAlias(off, off->GetType(), 0, ExecSz / 2);
15805
+ CVariable* offLo = m_currShader->GetNewAlias(off, off->GetType(), bytes2, ExecSz / 2);
15806
+
15807
+ if (m_currShader->m_Platform->hasNoInt64Inst())
15808
+ {
15809
+ emitAddPair(newVarHi, AddrVar, offHi);
15810
+ emitAddPair(newVarLo, AddrVar, offLo);
15811
+ }
15812
+ else
15813
+ {
15814
+ SIMDMode sm = lanesToSIMDMode(ExecSz / 2);
15815
+ m_encoder->SetNoMask();
15816
+ m_encoder->SetUniformSIMDSize(sm);
15817
+ m_encoder->SetSrcRegion(0, 0, 1, 0);
15818
+ m_encoder->SetSrcRegion(1, 1, 1, 0);
15819
+ m_encoder->Add(newVarHi, AddrVar, offHi);
15820
+ m_encoder->Push();
15821
+
15822
+ m_encoder->SetNoMask();
15823
+ m_encoder->SetUniformSIMDSize(sm);
15824
+ m_encoder->SetSrcRegion(0, 0, 1, 0);
15825
+ m_encoder->SetSrcRegion(1, 1, 1, 0);
15826
+ m_encoder->Add(newVarLo, AddrVar, offLo);
15827
+ m_encoder->Push();
15828
+ }
15829
+ }
15830
+ else if (isA64 && m_currShader->m_Platform->hasNoInt64Inst())
15831
+ {
15832
+ emitAddPair(newVar, AddrVar, off);
15833
+ }
15834
+ else
15835
+ {
15836
+ m_encoder->SetNoMask();
15837
+ m_encoder->SetUniformSIMDSize(simdmode);
15838
+ m_encoder->SetSrcRegion(0, 0, 1, 0);
15839
+ m_encoder->SetSrcRegion(1, 1, 1, 0);
15840
+ m_encoder->Add(newVar, AddrVar, off);
15841
+ m_encoder->Push();
15842
+ }
15843
+ return newVar;
15844
+ }
15845
+
15846
+ CVariable* EmitPass::prepareDataForUniform(
15847
+ CVariable* DataVar, uint32_t ExecSz, e_alignment Align)
15848
+ {
15849
+ uint32_t NElts = DataVar->GetNumberElement();
15850
+ uint32_t EltBytes = DataVar->GetElemSize();
15851
+ IGC_ASSERT(ExecSz >= NElts && NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
15852
+ if (NElts == ExecSz && !DataVar->IsImmediate() && DataVar->IsGRFAligned(Align))
15853
+ {
15854
+ return DataVar;
15855
+ }
15856
+ CVariable* newVar = m_currShader->GetNewVariable(ExecSz, DataVar->GetType(), Align, true, CName::NONE);
15857
+
15858
+ // Initialize to DataVar's first element (set Elts from NElts and up to the first element).
15859
+ bool needSplit = (newVar->GetSize() > (2 * (uint32_t)getGRFSize()));
15860
+ if (needSplit)
15861
+ {
15862
+ uint32_t esz = ExecSz / 2;
15863
+ uint32_t bytes = esz * newVar->GetElemSize();
15864
+ CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, esz);
15865
+ CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes, esz);
15866
+
15867
+ m_encoder->SetNoMask();
15868
+ m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
15869
+ m_encoder->SetSrcRegion(0, 0, 1, 0);
15870
+ m_encoder->Copy(newVarHi, DataVar);
15871
+ m_encoder->Push();
15872
+
15873
+ m_encoder->SetNoMask();
15874
+ m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
15875
+ m_encoder->SetSrcRegion(0, 0, 1, 0);
15876
+ m_encoder->Copy(newVarLo, DataVar);
15877
+ m_encoder->Push();
15878
+ }
15879
+ else
15880
+ {
15881
+
15882
+ m_encoder->SetNoMask();
15883
+ m_encoder->SetUniformSIMDSize(lanesToSIMDMode(ExecSz));
15884
+ m_encoder->SetSrcRegion(0, 0, 1, 0);
15885
+ m_encoder->Copy(newVar, DataVar);
15886
+ m_encoder->Push();
15887
+ }
15888
+
15889
+ if (!DataVar->IsImmediate() && NElts > 1)
15890
+ {
15891
+ // Copy values over, the elements from NElts to ExecSz-1 are set to the first element
15892
+ // in the initialization above.
15893
+ emitVectorCopy(newVar, DataVar, NElts);
15894
+ }
15895
+ return newVar;
15896
+ }
15897
+
15688
15898
15689
15899
void EmitPass::emitVectorCopy(CVariable* Dst, CVariable* Src, uint32_t nElts,
15690
15900
uint32_t DstSubRegOffset, uint32_t SrcSubRegOffset)
0 commit comments