Skip to content

Commit 480c168

Browse files
jgu222sys_zuul
authored and committed
Internal feature and fix.
Change-Id: Ie8fb87b4ba74c41f9eaabac43de50e9f0bc6501c
1 parent 854165c commit 480c168

File tree

3 files changed

+230
-17
lines changed

3 files changed

+230
-17
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 216 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,7 @@ uint EmitPass::DecideInstanceAndSlice(llvm::BasicBlock& blk, SDAG& sdag, bool& s
192192
if (StoreInst * ST = dyn_cast<StoreInst>(sdag.m_root))
193193
{
194194
// Limit to OpenCL so far as it has uniform load/store support.
195-
if (m_currShader->GetShaderType() == ShaderType::OPENCL_SHADER &&
196-
isUniformStoreOCL(ST))
195+
if (isUniformStoreOCL(ST))
197196
numInstance = 1;
198197
slicing = false;
199198
}
@@ -14338,12 +14337,14 @@ void EmitPass::emitftoi(llvm::GenIntrinsicInst* inst)
1433814337
// Return true if this store will be emitted as a uniform store
1433914338
bool EmitPass::isUniformStoreOCL(llvm::StoreInst* SI)
1434014339
{
14341-
if (!m_currShader->GetIsUniform(SI->getPointerOperand()))
14340+
if (m_currShader->GetShaderType() != ShaderType::OPENCL_SHADER ||
14341+
!m_currShader->GetIsUniform(SI->getPointerOperand()))
1434214342
{
1434314343
return false;
1434414344
}
1434514345

14346-
Type* Ty = SI->getValueOperand()->getType();
14346+
Value* storeVal = SI->getValueOperand();
14347+
Type* Ty = storeVal->getType();
1434714348
VectorType* VTy = dyn_cast<VectorType>(Ty);
1434814349
uint32_t elts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
1434914350
Type* eltTy = VTy ? VTy->getElementType() : Ty;
@@ -14354,8 +14355,8 @@ bool EmitPass::isUniformStoreOCL(llvm::StoreInst* SI)
1435414355
// Note that when elts > 1, VectorProcess makes sure that its element
1435514356
// size must be 4 or 8. Also, note that if totalBytes = 4, elts must be 1.
1435614357
bool doUniformStore = (elts == 1 ||
14357-
(m_currShader->GetIsUniform(SI->getValueOperand()) &&
14358-
(totalBytes == 8 || totalBytes == 12 || totalBytes == 16)));
14358+
(m_currShader->GetIsUniform(storeVal) &&
14359+
(totalBytes == 8 || totalBytes == 12 || totalBytes == 16)));
1435914360
return doUniformStore;
1436014361
}
1436114362

@@ -15685,6 +15686,215 @@ void EmitPass::emitVectorStore(StoreInst* inst, Value* offset, ConstantInt* immO
1568515686
}
1568615687
}
1568715688

15689+
// prepareAddressForUniform(): for both load and store
15690+
// prepareDataForUniform(): for store only
15691+
// Helpers for unaligned (alignment less than 4 bytes) uniform load/store: one prepares
15692+
// the address payload, and the other prepares the data payload.
15693+
//
15694+
// Example 1: "store <4xi32> V, <4xi32>* P, align 2"
15695+
// A new pointer pVar is created with 4 elements.
15696+
//
15697+
// add (4|M0_NM) pVar<1>:ud P<0;1,0>:UD 0xC840:UV
15698+
// send (4|M0_NM) pVar V
15699+
//
15700+
// prepareAddressForUniform() : create pVar
15701+
// prepareDataForUniform() : return V (assuming V can be used directly)
15702+
//
15703+
// Example 2: "store <3xi32> V, <3xi32>* P, align 2"
15704+
// Non-power of 2 vector size is rounded up to the next power of 2.
15705+
// The additional (padding) elements are filled with copies of the first vector element.
15706+
15707+
// add (4|M0_NM) pVar<1>:ud P<0;1,0>:UD 0x0840:UV
15708+
// mov (4|M0_NM) vVar<1>:ud V<0;1,0>:ud
15709+
// mov (2|M0_NM) vVar<1>:ud V<1;1,0>:ud
15710+
// mov (1|M0_NM) vVar.2<1>:ud V.2<1;1,0>:ud
15711+
// send (4|M0_NM) vVar pVar
15712+
//
15713+
// prepareAddressForUniform() : create pVar
15714+
// prepareDataForUniform() : return vVar
15715+
//
15716+
// Both functions handle vector sizes up to 8 and element sizes of DW or QW.
15717+
// When vector size > 4, it uses 0x76543210, left-shifted by 2 (DW) or 3 (QW)
15718+
// as an immediate to be added to 'AddrVar' to form a new address var.
15719+
//
15720+
// In addition, if 64bit add is not supported, emitAddPair() is used to
15721+
// emulate the 64bit add with 32bit add/addc.
15722+
//
15723+
// Note that argument 'AddrVar' in prepareAddressForUniform() is uniform, so is its return var.
15724+
// The argument 'DataVar' in prepareDataForUniform() is uniform, so is its return var.
15725+
//
15726+
// Builds the address payload used to load/store a uniform value.
//
// AddrVar : the (uniform) base address; an element size of 8 bytes is treated as A64.
// EltBytes: size in bytes of one data element; asserted to be 4 or 8.
// NElts   : number of data elements; asserted to be <= 8.
// ExecSz  : number of lanes of the returned address variable.
// Align   : alignment requested for the returned variable.
//
// Returns AddrVar unchanged when ExecSz == 1 and AddrVar already satisfies Align.
// Otherwise returns a new ExecSz-lane variable of AddrVar's type computed as
// AddrVar + off, where off holds lane_index * EltBytes for the first NElts lanes
// and 0 for the remaining lanes.
// NOTE(review): the code appears to assume ExecSz >= NElts; this is not asserted here.
CVariable* EmitPass::prepareAddressForUniform(
15727+
CVariable* AddrVar, uint32_t EltBytes, uint32_t NElts, uint32_t ExecSz, e_alignment Align)
15728+
{
15729+
IGC_ASSERT(NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
// Single-lane, already suitably aligned address: nothing to prepare.
15730+
if (ExecSz == 1 && AddrVar->IsGRFAligned(Align))
15731+
{
15732+
return AddrVar;
15733+
}
15734+
bool isA64 = (AddrVar->GetElemSize() == 8);
15735+
SIMDMode simdmode = lanesToSIMDMode(ExecSz);
15736+
CVariable* newVar = m_currShader->GetNewVariable(ExecSz, AddrVar->GetType(), Align, true, CName::NONE);
15737+
15738+
// 'off' is the per-lane byte offset operand: either a UV immediate or a temporary UD variable.
CVariable* off;
15739+
// One 4-bit lane index (0..7) per nibble; nibbles at positions >= NElts are masked to 0.
uint32_t incImm = (0x76543210 & maskTrailingOnes<uint32_t>(NElts * 4));
15740+
if ((ExecSz <= 4 && EltBytes == 4) || (ExecSz <= 2 && EltBytes == 8))
15741+
{
15742+
// This case needs a single UV immediate
// (shifting left by 2 or 3 turns each lane index into a byte offset; presumably these
// limits are chosen so that every scaled offset still fits its 4-bit UV lane - TODO confirm).
15743+
incImm = incImm << (EltBytes == 4 ? 2 : 3);
15744+
off = m_currShader->ImmToVariable(incImm, ISA_TYPE_UV);
15745+
}
15746+
else
15747+
{
15748+
// Need a temporary var to calculate offsets
15749+
off = m_currShader->GetNewVariable(ExecSz, ISA_TYPE_UD, EALIGN_DWORD, false, CName::NONE);
15750+
15751+
// actualES is the actual execsize used for computing offsets.
15752+
uint32_t actualES = (uint32_t)PowerOf2Ceil(NElts);
15753+
15754+
// incImm is UV type and can be used in execsize <= 8 only. If ExecSz is greater
15755+
// than the number of lanes actually written below (for example, the 4GRF alignment
15756+
// case), the lanes beyond actualES need to be zeroed.
15757+
if (ExecSz > actualES)
15758+
{
15759+
// Zero all ExecSz lanes first; the lower actualES lanes are overwritten below.
15760+
m_encoder->SetNoMask();
15761+
m_encoder->SetSimdSize(simdmode);
15762+
m_encoder->Copy(off, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
15763+
m_encoder->Push();
15764+
}
15765+
15766+
SIMDMode sm = lanesToSIMDMode(actualES);
// incImm == 0 means NElts == 1: every offset is 0, so neither branch below emits code.
15767+
if (incImm > 0 &&
15768+
((actualES <= 4 && EltBytes == 4) || (actualES <= 2 && EltBytes == 8)))
15769+
{
15770+
// This case needs a single UV immediate
15771+
incImm = incImm << (EltBytes == 4 ? 2 : 3);
15772+
15773+
m_encoder->SetNoMask();
15774+
m_encoder->SetSimdSize(sm);
15775+
m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
15776+
m_encoder->Push();
15777+
}
15778+
else if (incImm > 0)
15779+
{
15780+
// Need a mov and mul: load the unscaled lane indices, then multiply by EltBytes.
15781+
m_encoder->SetNoMask();
15782+
m_encoder->SetSimdSize(sm);
15783+
m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
15784+
m_encoder->Push();
15785+
15786+
m_encoder->SetNoMask();
15787+
m_encoder->SetSimdSize(sm);
15788+
m_encoder->SetSrcRegion(0, 1, 1, 0);
15789+
m_encoder->SetSrcRegion(1, 0, 1, 0);
15790+
m_encoder->Mul(off, off, m_currShader->ImmToVariable(EltBytes, ISA_TYPE_UW));
15791+
m_encoder->Push();
15792+
}
15793+
}
15794+
15795+
// May need splitting for A64: the add is done in two halves when the result
// variable is larger than two GRFs.
15796+
bool needSplit = (newVar->GetSize() > (2 * (uint32_t)getGRFSize()));
15797+
if (needSplit)
15798+
{
15799+
IGC_ASSERT(!off->IsImmediate());
15800+
uint32_t bytes1 = (ExecSz / 2) * newVar->GetElemSize();
15801+
uint32_t bytes2 = (ExecSz / 2) * off->GetElemSize();
// NOTE(review): assuming the third GetNewAlias argument is a byte offset, the "Hi"
// aliases start at offset 0 (first half) and the "Lo" aliases cover the second half,
// i.e. the names look swapped; each half is still paired consistently - confirm.
15802+
CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, ExecSz / 2);
15803+
CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes1, ExecSz / 2);
15804+
CVariable* offHi = m_currShader->GetNewAlias(off, off->GetType(), 0, ExecSz / 2);
15805+
CVariable* offLo = m_currShader->GetNewAlias(off, off->GetType(), bytes2, ExecSz / 2);
15806+
15807+
if (m_currShader->m_Platform->hasNoInt64Inst())
15808+
{
// No native 64bit integer instructions: use emitAddPair() for each half.
15809+
emitAddPair(newVarHi, AddrVar, offHi);
15810+
emitAddPair(newVarLo, AddrVar, offLo);
15811+
}
15812+
else
15813+
{
15814+
SIMDMode sm = lanesToSIMDMode(ExecSz / 2);
15815+
m_encoder->SetNoMask();
15816+
m_encoder->SetUniformSIMDSize(sm);
15817+
m_encoder->SetSrcRegion(0, 0, 1, 0);
15818+
m_encoder->SetSrcRegion(1, 1, 1, 0);
15819+
m_encoder->Add(newVarHi, AddrVar, offHi);
15820+
m_encoder->Push();
15821+
15822+
m_encoder->SetNoMask();
15823+
m_encoder->SetUniformSIMDSize(sm);
15824+
m_encoder->SetSrcRegion(0, 0, 1, 0);
15825+
m_encoder->SetSrcRegion(1, 1, 1, 0);
15826+
m_encoder->Add(newVarLo, AddrVar, offLo);
15827+
m_encoder->Push();
15828+
}
15829+
}
15830+
else if (isA64 && m_currShader->m_Platform->hasNoInt64Inst())
15831+
{
// 64bit address without native 64bit add: use emitAddPair() on the whole variable.
15832+
emitAddPair(newVar, AddrVar, off);
15833+
}
15834+
else
15835+
{
// Single add: source 0 (AddrVar) uses region <0;1,0>, source 1 (off) uses <1;1,0>.
15836+
m_encoder->SetNoMask();
15837+
m_encoder->SetUniformSIMDSize(simdmode);
15838+
m_encoder->SetSrcRegion(0, 0, 1, 0);
15839+
m_encoder->SetSrcRegion(1, 1, 1, 0);
15840+
m_encoder->Add(newVar, AddrVar, off);
15841+
m_encoder->Push();
15842+
}
15843+
return newVar;
15844+
}
15845+
15846+
// Builds the data payload used to store a uniform value.
//
// DataVar: the (uniform) value to store; its element count must be <= 8 and <= ExecSz,
//          and its element size must be 4 or 8 bytes (asserted below).
// ExecSz : number of lanes of the returned data variable.
// Align  : alignment requested for the returned variable.
//
// Returns DataVar unchanged when it already has exactly ExecSz elements, is not an
// immediate, and satisfies Align. Otherwise returns a new ExecSz-lane variable of
// DataVar's type in which every lane is first set to DataVar's first element and then,
// for a non-immediate DataVar with more than one element, the first NElts lanes are
// overwritten with DataVar's elements via emitVectorCopy().
CVariable* EmitPass::prepareDataForUniform(
15847+
CVariable* DataVar, uint32_t ExecSz, e_alignment Align)
15848+
{
15849+
uint32_t NElts = DataVar->GetNumberElement();
15850+
uint32_t EltBytes = DataVar->GetElemSize();
15851+
IGC_ASSERT(ExecSz >= NElts && NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
// DataVar can be used directly as the payload: no copy needed.
15852+
if (NElts == ExecSz && !DataVar->IsImmediate() && DataVar->IsGRFAligned(Align))
15853+
{
15854+
return DataVar;
15855+
}
15856+
CVariable* newVar = m_currShader->GetNewVariable(ExecSz, DataVar->GetType(), Align, true, CName::NONE);
15857+
15858+
// Initialize every lane to DataVar's first element, so that lanes NElts..ExecSz-1
// end up holding a copy of the first element.
// The copy is done in two halves when newVar is larger than two GRFs.
15859+
bool needSplit = (newVar->GetSize() > (2 * (uint32_t)getGRFSize()));
15860+
if (needSplit)
15861+
{
15862+
uint32_t esz = ExecSz / 2;
15863+
uint32_t bytes = esz * newVar->GetElemSize();
// NOTE(review): assuming the third GetNewAlias argument is a byte offset, "Hi" is the
// first half (offset 0) and "Lo" is the second half; both receive the same value.
15864+
CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, esz);
15865+
CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes, esz);
15866+
15867+
m_encoder->SetNoMask();
15868+
m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
15869+
m_encoder->SetSrcRegion(0, 0, 1, 0);
15870+
m_encoder->Copy(newVarHi, DataVar);
15871+
m_encoder->Push();
15872+
15873+
m_encoder->SetNoMask();
15874+
m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
15875+
m_encoder->SetSrcRegion(0, 0, 1, 0);
15876+
m_encoder->Copy(newVarLo, DataVar);
15877+
m_encoder->Push();
15878+
}
15879+
else
15880+
{
15881+
15882+
m_encoder->SetNoMask();
15883+
m_encoder->SetUniformSIMDSize(lanesToSIMDMode(ExecSz));
15884+
m_encoder->SetSrcRegion(0, 0, 1, 0);
15885+
m_encoder->Copy(newVar, DataVar);
15886+
m_encoder->Push();
15887+
}
15888+
15889+
if (!DataVar->IsImmediate() && NElts > 1)
15890+
{
15891+
// Copy the NElts values over; the elements from NElts to ExecSz-1 keep the first
15892+
// element's value set by the initialization above.
15893+
emitVectorCopy(newVar, DataVar, NElts);
15894+
}
15895+
return newVar;
15896+
}
15897+
1568815898

1568915899
void EmitPass::emitVectorCopy(CVariable* Dst, CVariable* Src, uint32_t nElts,
1569015900
uint32_t DstSubRegOffset, uint32_t SrcSubRegOffset)

IGC/Compiler/CISACodeGen/EmitVISAPass.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,11 @@ class EmitPass : public llvm::FunctionPass
672672
bool isHalfGRFReturn(CVariable* dst, SIMDMode simdMode);
673673

674674
void emitFeedbackEnable();
675+
676+
// used for loading/storing uniform value using scatter/gather messages.
677+
CVariable* prepareAddressForUniform(
678+
CVariable* AddrVar, uint32_t EltBytes, uint32_t NElts, uint32_t ExecSz, e_alignment Align);
679+
CVariable* prepareDataForUniform(CVariable* DataVar, uint32_t ExecSz, e_alignment Align);
675680
};
676681

677682
} // namespace IGC

IGC/Compiler/CISACodeGen/VectorProcess.cpp

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,6 @@ bool VectorProcess::reLayoutLoadStore(Instruction* Inst)
201201

202202
Value* Ptr = nullptr;
203203
Type* Ty = nullptr;
204-
205204
if (nullptr != LI)
206205
{
207206
Ptr = LI->getPointerOperand();
@@ -254,8 +253,8 @@ bool VectorProcess::reLayoutLoadStore(Instruction* Inst)
254253

255254
//
256255
// Assumption:
257-
// 1. if vector size < 4 bytes, it must be 1 or 2 bytes (never 3);
258-
// 2. if vector size >= 4 bytes, it must be multiple of DW
256+
// 1. if the size of vector < 4 bytes, it must be 1 or 2 bytes (never 3);
257+
// 2. if the size of vector >= 4 bytes, it must be multiple of DW
259258
// These two assumptions are guaranteed by VectorPreProcess.
260259
//
261260
// So far, we are using A32 untyped and byte scattered messages,
@@ -301,15 +300,14 @@ bool VectorProcess::reLayoutLoadStore(Instruction* Inst)
301300
{
302301
align = LI->getAlignment();
303302
}
303+
else if (SI)
304+
{
305+
align = SI->getAlignment();
306+
}
304307
else
305-
if (SI)
306-
{
307-
align = SI->getAlignment();
308-
}
309-
else
310-
{
311-
align = 1;
312-
}
308+
{
309+
align = 1;
310+
}
313311

314312
bool useQW = useA64 && ((TBytes % 8) == 0) &&
315313
((has_8Byte_A64_BS && align < 4) || (eTyBytes == 8U && align >= 8U));

0 commit comments

Comments
 (0)