Skip to content

Commit f6b3af5

Browse files
traoux1gfxbot
traoux1
authored and committed
Emit uniform private memory accesses when the alloca is detected as uniform.
Change-Id: I49ab634415e22f3b193f5f53196ed77b253d8b57
1 parent 93058a1 commit f6b3af5

File tree

2 files changed

+20
-11
lines changed

2 files changed

+20
-11
lines changed

IGC/Compiler/CISACodeGen/LowerGEPForPrivMem.cpp

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -199,6 +199,14 @@ unsigned int LowerGEPForPrivMem::extractAllocaSize(llvm::AllocaInst* pAlloca)
199199

200200
bool LowerGEPForPrivMem::CheckIfAllocaPromotable(llvm::AllocaInst* pAlloca)
201201
{
202+
auto WI = &getAnalysis<WIAnalysis>();
203+
bool isUniformAlloca = WI->whichDepend(pAlloca) == WIAnalysis::UNIFORM;
204+
if(isUniformAlloca)
205+
{
206+
IRBuilder<> builder(pAlloca);
207+
MDNode* node = MDNode::get(pAlloca->getContext(), ConstantAsMetadata::get(builder.getInt1(true)));
208+
pAlloca->setMetadata("uniform", node);
209+
}
202210
unsigned int allocaSize = extractAllocaSize(pAlloca);
203211
unsigned int allowedAllocaSizeInBytes = MAX_ALLOCA_PROMOTE_GRF_NUM * 4;
204212

@@ -219,8 +227,6 @@ bool LowerGEPForPrivMem::CheckIfAllocaPromotable(llvm::AllocaInst* pAlloca)
219227
{
220228
return false;
221229
}
222-
auto WI = &getAnalysis<WIAnalysis>();
223-
bool isUniformAlloca = WI->whichDepend(pAlloca) == WIAnalysis::UNIFORM;
224230
if(isUniformAlloca)
225231
{
226232
// Heuristic: for uniform alloca we divide the size by 8 to adjust the pressure

IGC/Compiler/Optimizer/OpenCLPasses/PrivateMemory/PrivateMemoryResolution.cpp

Lines changed: 12 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -750,12 +750,12 @@ bool PrivateMemoryResolution::resolveAllocaInstuctions(bool stackCall)
750750
// Creates intrinsics that will be lowered in the CodeGen and will handle the stack-pointer
751751
Function *stackAllocaFunc = GenISAIntrinsic::getDeclaration(m_currFunction->getParent(), GenISAIntrinsic::GenISA_StackAlloca);
752752
Instruction *simdLaneId16 = CallInst::Create(simdLaneIdFunc, VALUE_NAME("simdLaneId16"), pEntryPoint);
753-
Instruction *simdLaneId = ZExtInst::CreateIntegerCast(simdLaneId16, typeInt32, false, VALUE_NAME("simdLaneId"), pEntryPoint);
753+
Value *simdLaneId = ZExtInst::CreateIntegerCast(simdLaneId16, typeInt32, false, VALUE_NAME("simdLaneId"), pEntryPoint);
754754
Instruction *simdSize = CallInst::Create(simdSizeFunc, VALUE_NAME("simdSize"), pEntryPoint);
755755
for (auto pAI : allocaInsts)
756756
{
757757
assert(!pAI->use_empty() && "Should not reach here with alloca instruction that has no usage!");
758-
758+
bool isUniform = pAI->getMetadata("uniform") != nullptr;
759759
llvm::IRBuilder<> builder(pAI);
760760
IF_DEBUG_INFO(builder.SetCurrentDebugLocation(emptyDebugLoc));
761761

@@ -764,7 +764,8 @@ bool PrivateMemoryResolution::resolveAllocaInstuctions(bool stackCall)
764764
unsigned int bufferSize = m_ModAllocaInfo->getBufferSize(pAI);
765765

766766
Value* bufferOffset = builder.CreateMul(simdSize, ConstantInt::get(typeInt32, scalarBufferOffset), VALUE_NAME(pAI->getName() + ".SIMDBufferOffset"));
767-
Value* perLaneOffset = builder.CreateMul(simdLaneId, ConstantInt::get(typeInt32, bufferSize), VALUE_NAME("perLaneOffset"));
767+
Value* increment = isUniform ? builder.getInt32(0) : simdLaneId;
768+
Value* perLaneOffset = builder.CreateMul(increment, ConstantInt::get(typeInt32, bufferSize), VALUE_NAME("perLaneOffset"));
768769
Value* totalOffset = builder.CreateAdd(bufferOffset, perLaneOffset, VALUE_NAME(pAI->getName() + ".totalOffset"));
769770
Value* stackAlloca = builder.CreateCall(stackAllocaFunc, totalOffset, VALUE_NAME("stackAlloca"));
770771
Value* privateBuffer = builder.CreatePointerCast(stackAlloca, pAI->getType(), VALUE_NAME(pAI->getName() + ".privateBuffer"));
@@ -786,7 +787,7 @@ bool PrivateMemoryResolution::resolveAllocaInstuctions(bool stackCall)
786787
// PrivateMemoryUsageAnalysis pass, no need to run AddImplicitArgs pass.
787788

788789
Instruction *simdLaneId16 = CallInst::Create(simdLaneIdFunc, VALUE_NAME("simdLaneId16"), pEntryPoint);
789-
Instruction *simdLaneId = ZExtInst::CreateIntegerCast(simdLaneId16, typeInt32, false, VALUE_NAME("simdLaneId"), pEntryPoint);
790+
Value *simdLaneId = ZExtInst::CreateIntegerCast(simdLaneId16, typeInt32, false, VALUE_NAME("simdLaneId"), pEntryPoint);
790791
Instruction *simdSize = CallInst::Create(simdSizeFunc, VALUE_NAME("simdSize"), pEntryPoint);
791792

792793
Argument* r0Arg = implicitArgs.getArgInFunc(*m_currFunction, ImplicitArg::R0);
@@ -797,7 +798,7 @@ bool PrivateMemoryResolution::resolveAllocaInstuctions(bool stackCall)
797798
for (auto pAI : allocaInsts)
798799
{
799800
assert(!pAI->use_empty() && "Should not reach here with alloca instruction that has no usage!");
800-
801+
bool isUniform = pAI->getMetadata("uniform") != nullptr;
801802
llvm::IRBuilder<> builder(pAI);
802803
// Post upgrade to LLVM 3.5.1, it was found that inliner propagates debug info of callee
803804
// in to the alloca. Further, those allocas are somehow hoisted to the top of program.
@@ -857,7 +858,8 @@ bool PrivateMemoryResolution::resolveAllocaInstuctions(bool stackCall)
857858

858859

859860
Value* bufferOffset = builder.CreateMul(simdSize, ConstantInt::get(typeInt32, scalarBufferOffset), VALUE_NAME(pAI->getName() + ".SIMDBufferOffset"));
860-
Value* perLaneOffset = builder.CreateMul(simdLaneId, ConstantInt::get(typeInt32, bufferSize), VALUE_NAME("perLaneOffset"));
861+
Value* perLaneOffset = isUniform ? builder.getInt32(0) : simdLaneId;
862+
perLaneOffset = builder.CreateMul(perLaneOffset, ConstantInt::get(typeInt32, bufferSize), VALUE_NAME("perLaneOffset"));
861863
Value* totalOffset = builder.CreateAdd(bufferOffset, perLaneOffset, VALUE_NAME(pAI->getName() + ".totalOffset"));
862864
Value* threadOffset = builder.CreateAdd(privateBase, totalOffset, VALUE_NAME(pAI->getName() + ".threadOffset"));
863865
Value* privateBufferPTR = builder.CreateIntToPtr(threadOffset, Type::getInt8Ty(C)->getPointerTo(ADDRESS_SPACE_PRIVATE), VALUE_NAME(pAI->getName() + ".privateBufferPTR"));
@@ -903,7 +905,7 @@ bool PrivateMemoryResolution::resolveAllocaInstuctions(bool stackCall)
903905
ConstantInt *totalPrivateMemPerWIValue = ConstantInt::get(typeInt32, totalPrivateMemPerWI);
904906

905907
Instruction *simdLaneId16 = CallInst::Create(simdLaneIdFunc, VALUE_NAME("simdLaneId16"), pEntryPoint);
906-
Instruction *simdLaneId = ZExtInst::CreateIntegerCast(simdLaneId16, typeInt32, false, VALUE_NAME("simdLaneId"), pEntryPoint);
908+
Value* simdLaneId = ZExtInst::CreateIntegerCast(simdLaneId16, typeInt32, false, VALUE_NAME("simdLaneId"), pEntryPoint);
907909
Instruction *simdSize = CallInst::Create(simdSizeFunc, VALUE_NAME("simdSize"), pEntryPoint);
908910
BinaryOperator* totalPrivateMemPerThread = BinaryOperator::CreateMul(simdSize, totalPrivateMemPerWIValue, VALUE_NAME("totalPrivateMemPerThread"), pEntryPoint);
909911
ExtractElementInst* r0_5 = ExtractElementInst::Create(r0Arg, ConstantInt::get(typeInt32, 5), VALUE_NAME("r0.5"), pEntryPoint);
@@ -936,14 +938,15 @@ bool PrivateMemoryResolution::resolveAllocaInstuctions(bool stackCall)
936938

937939
llvm::IRBuilder<> builder(pAI);
938940
IF_DEBUG_INFO(builder.SetCurrentDebugLocation(emptyDebugLoc));
939-
941+
bool isUniform = pAI->getMetadata("uniform") != nullptr;
940942
// Get buffer information from the analysis
941943
unsigned int scalarBufferOffset = m_ModAllocaInfo->getBufferOffset(pAI);
942944
unsigned int bufferSize = m_ModAllocaInfo->getBufferSize(pAI);
943945

944946
Value* bufferOffset = builder.CreateMul(simdSize, ConstantInt::get(typeInt32, scalarBufferOffset), VALUE_NAME(pAI->getName() + ".SIMDBufferOffset"));
945947
Value* bufferOffsetForThread = builder.CreateAdd(perThreadOffset, bufferOffset, VALUE_NAME(pAI->getName() + ".bufferOffsetForThread"));
946-
Value* perLaneOffset = builder.CreateMul(simdLaneId, ConstantInt::get(typeInt32, bufferSize), VALUE_NAME("perLaneOffset"));
948+
Value* perLaneOffset = isUniform ? builder.getInt32(0) : simdLaneId;
949+
// Scale the increment selected above (0 for a uniform alloca, the SIMD lane id otherwise) by the buffer size; multiplying simdLaneId here would discard the uniform selection.
perLaneOffset = builder.CreateMul(perLaneOffset, ConstantInt::get(typeInt32, bufferSize), VALUE_NAME("perLaneOffset"));
947950
Value* totalOffset = builder.CreateAdd(bufferOffsetForThread, perLaneOffset, VALUE_NAME(pAI->getName() + ".totalOffset"));
948951
Value* privateBufferGEP = builder.CreateGEP(privateMemArg, totalOffset, VALUE_NAME(pAI->getName() + ".privateBufferGEP"));
949952
Value* privateBuffer = builder.CreatePointerCast(privateBufferGEP, pAI->getType(), VALUE_NAME(pAI->getName() + ".privateBuffer"));

0 commit comments

Comments
 (0)