Skip to content

Commit f1624b5

Browse files
Gang Y Chen and igcbot
authored and committed
Try to find the cases where scalarizing phi nodes negatively
affects performance.
1 parent b427d8f commit f1624b5

File tree

5 files changed

+162
-180
lines changed

5 files changed

+162
-180
lines changed

IGC/AdaptorOCL/UnifyIROCL.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -510,10 +510,10 @@ static void CommonOCLBasedPasses(
510510
mpm.add(createSROAPass());
511511
mpm.add(createIGCInstructionCombiningPass());
512512

513-
// "false" to createScalarizerPass() means that vector load/stores are NOT scalarized
513+
// true means selective scalarization
514514
if (IGC_IS_FLAG_ENABLED(DisableScalarizerGPGPU) == false)
515515
{
516-
mpm.add(createScalarizerPass(false));
516+
mpm.add(createScalarizerPass(true));
517517
}
518518

519519
// Create a dummy kernel to attach the symbol table if necessary

IGC/Compiler/Optimizer/Scalarizer.cpp

Lines changed: 124 additions & 163 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,12 @@ IGC_INITIALIZE_PASS_END(ScalarizeFunction, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG
6969

7070
char ScalarizeFunction::ID = 0;
7171

72-
ScalarizeFunction::ScalarizeFunction(bool scalarizingVectorLDSTType) : FunctionPass(ID)
72+
ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : FunctionPass(ID)
7373
{
7474
initializeScalarizeFunctionPass(*PassRegistry::getPassRegistry());
7575

7676
for (int i = 0; i < Instruction::OtherOpsEnd; i++) m_transposeCtr[i] = 0;
77-
m_ScalarizingVectorLDSTType = scalarizingVectorLDSTType;
77+
m_SelectiveScalarization = selectiveScalarization;
7878

7979
// Initialize SCM buffers and allocation
8080
m_SCMAllocationArray = new SCMEntry[ESTIMATED_INST_NUM];
@@ -121,6 +121,13 @@ bool ScalarizeFunction::runOnFunction(Function& F)
121121
m_SCM.clear();
122122
releaseAllSCMEntries();
123123
m_DRL.clear();
124+
m_Excludes.clear();
125+
126+
// collecting instructions that we want to avoid scalarization
127+
if (m_SelectiveScalarization)
128+
{
129+
buildExclusiveSet();
130+
}
124131

125132
// Scalarization. Iterate over all the instructions
126133
// Always hold the iterator at the instruction following the one being scalarized (so the
@@ -132,7 +139,14 @@ bool ScalarizeFunction::runOnFunction(Function& F)
132139
Instruction* currInst = &*sI;
133140
// Move iterator to next instruction BEFORE scalarizing current instruction
134141
++sI;
135-
dispatchInstructionToScalarize(currInst);
142+
if (m_Excludes.count(currInst))
143+
{
144+
recoverNonScalarizableInst(currInst);
145+
}
146+
else
147+
{
148+
dispatchInstructionToScalarize(currInst);
149+
}
136150
}
137151

138152
resolveVectorValues();
@@ -161,6 +175,111 @@ bool ScalarizeFunction::runOnFunction(Function& F)
161175
return true;
162176
}
163177

178+
void ScalarizeFunction::buildExclusiveSet()
179+
{
180+
inst_iterator sI = inst_begin(m_currFunc);
181+
inst_iterator sE = inst_end(m_currFunc);
182+
std::vector<llvm::Value*> workset;
183+
while (sI != sE)
184+
{
185+
Instruction* currInst = &*sI;
186+
++sI;
187+
if (CallInst* CI = dyn_cast<CallInst>(currInst))
188+
{
189+
unsigned numOperands = CI->getNumArgOperands();
190+
for (unsigned i = 0; i < numOperands; i++)
191+
{
192+
Value* operand = CI->getArgOperand(i);
193+
if (isa<VectorType>(operand->getType()))
194+
{
195+
workset.push_back(operand);
196+
}
197+
}
198+
}
199+
else if (auto IEI = dyn_cast<InsertElementInst>(currInst))
200+
{
201+
Value* scalarIndexVal = IEI->getOperand(2);
202+
// If the index is not a constant - we cannot statically remove this inst
203+
if (!isa<ConstantInt>(scalarIndexVal)) {
204+
workset.push_back(IEI);
205+
}
206+
}
207+
else if (auto EEI = dyn_cast<ExtractElementInst>(currInst))
208+
{
209+
Value* scalarIndexVal = EEI->getOperand(1);
210+
// If the index is not a constant - we cannot statically remove this inst
211+
if (!isa<ConstantInt>(scalarIndexVal)) {
212+
workset.push_back(EEI->getOperand(0));
213+
}
214+
}
215+
}
216+
while (!workset.empty())
217+
{
218+
auto Def = workset.back();
219+
workset.pop_back();
220+
if (m_Excludes.count(Def))
221+
{
222+
continue;
223+
}
224+
if (auto IEI = dyn_cast<InsertElementInst>(Def))
225+
{
226+
m_Excludes.insert(IEI);
227+
if (!m_Excludes.count(IEI->getOperand(0)) &&
228+
(isa<PHINode>(IEI->getOperand(0)) ||
229+
isa<ShuffleVectorInst>(IEI->getOperand(0)) ||
230+
isa<InsertElementInst>(IEI->getOperand(0))))
231+
{
232+
workset.push_back(IEI->getOperand(0));
233+
}
234+
}
235+
else if (auto SVI = dyn_cast<ShuffleVectorInst>(Def))
236+
{
237+
m_Excludes.insert(SVI);
238+
if (!m_Excludes.count(SVI->getOperand(0)) &&
239+
(isa<PHINode>(SVI->getOperand(0)) ||
240+
isa<ShuffleVectorInst>(SVI->getOperand(0)) ||
241+
isa<InsertElementInst>(SVI->getOperand(0))))
242+
{
243+
workset.push_back(SVI->getOperand(0));
244+
}
245+
if (!m_Excludes.count(SVI->getOperand(1)) &&
246+
(isa<PHINode>(SVI->getOperand(1)) ||
247+
isa<ShuffleVectorInst>(SVI->getOperand(1)) ||
248+
isa<InsertElementInst>(SVI->getOperand(1))))
249+
{
250+
workset.push_back(SVI->getOperand(1));
251+
}
252+
}
253+
else if (auto PHI = dyn_cast<PHINode>(Def))
254+
{
255+
m_Excludes.insert(PHI);
256+
for (int i = 0, n = PHI->getNumOperands(); i < n; ++i)
257+
if (!m_Excludes.count(PHI->getOperand(i)) &&
258+
(isa<PHINode>(PHI->getOperand(i)) ||
259+
isa<ShuffleVectorInst>(PHI->getOperand(i)) ||
260+
isa<InsertElementInst>(PHI->getOperand(i))))
261+
{
262+
workset.push_back(PHI->getOperand(i));
263+
}
264+
}
265+
else
266+
{
267+
continue;
268+
}
269+
// check use
270+
for (auto U : Def->users())
271+
{
272+
if (!m_Excludes.count(U) &&
273+
(isa<PHINode>(U) ||
274+
isa<ShuffleVectorInst>(U) ||
275+
isa<InsertElementInst>(U)))
276+
{
277+
workset.push_back(U);
278+
}
279+
}
280+
}
281+
}
282+
164283
void ScalarizeFunction::dispatchInstructionToScalarize(Instruction* I)
165284
{
166285
V_PRINT(scalarizer, "\tScalarizing Instruction: " << *I << "\n");
@@ -235,13 +354,6 @@ void ScalarizeFunction::dispatchInstructionToScalarize(Instruction* I)
235354
case Instruction::GetElementPtr:
236355
scalarizeInstruction(dyn_cast<GetElementPtrInst>(I));
237356
break;
238-
case Instruction::Load:
239-
scalarizeInstruction(dyn_cast<LoadInst>(I));
240-
break;
241-
case Instruction::Store:
242-
scalarizeInstruction(dyn_cast<StoreInst>(I));
243-
break;
244-
245357
// The remaining instructions are not supported for scalarization. Keep "as is"
246358
default:
247359
recoverNonScalarizableInst(I);
@@ -892,149 +1004,6 @@ void ScalarizeFunction::scalarizeInstruction(GetElementPtrInst* GI)
8921004
m_removedInsts.insert(GI);
8931005
}
8941006

895-
void ScalarizeFunction::scalarizeInstruction(LoadInst* LI)
896-
{
897-
V_PRINT(scalarizer, "\t\tLoad instruction\n");
898-
IGC_ASSERT_MESSAGE(LI, "instruction type dynamic cast failed");
899-
900-
VectorType* dataType = dyn_cast<VectorType>(LI->getType());
901-
if (isScalarizableLoadStoreType(dataType) && m_pDL)
902-
{
903-
// Prepare empty SCM entry for the instruction
904-
SCMEntry* newEntry = getSCMEntry(LI);
905-
906-
// Get additional info from instruction
907-
unsigned int vectorSize = int_cast<unsigned int>(m_pDL->getTypeAllocSize(dataType));
908-
unsigned int elementSize = int_cast<unsigned int>(m_pDL->getTypeSizeInBits(dataType->getElementType()) / 8);
909-
IGC_ASSERT(elementSize);
910-
IGC_ASSERT_MESSAGE((vectorSize / elementSize > 0), "vector size should be a multiply of element size");
911-
IGC_ASSERT_MESSAGE((vectorSize % elementSize == 0), "vector size should be a multiply of element size");
912-
unsigned numDupElements = int_cast<unsigned>(dataType->getNumElements());
913-
914-
// Obtain scalarized arguments
915-
// 1 - to allow scalarizing Load with any pointer type
916-
// 0 - to limit scalarizing to special case where packetizer benifit from the scalarizing
917-
#if 1
918-
// Apply the bit-cast on the GEP base and add base-offset then fix the index by multiply it with numElements. (assuming one index only).
919-
Value * GepPtr = LI->getOperand(0);
920-
PointerType* GepPtrType = cast<PointerType>(GepPtr->getType());
921-
Value* operandBase = BitCastInst::CreatePointerCast(GepPtr, dataType->getScalarType()->getPointerTo(GepPtrType->getAddressSpace()), "ptrVec2ptrScl", LI);
922-
Type* indexType = Type::getInt32Ty(*m_moduleContext);
923-
// Generate new (scalar) instructions
924-
SmallVector<Value*, MAX_INPUT_VECTOR_WIDTH>newScalarizedInsts;
925-
newScalarizedInsts.resize(numDupElements);
926-
for (unsigned dup = 0; dup < numDupElements; dup++)
927-
{
928-
Constant* laneVal = ConstantInt::get(indexType, dup);
929-
Value* pGEP = GetElementPtrInst::Create(nullptr, operandBase, laneVal, "GEP_lane", LI);
930-
newScalarizedInsts[dup] = new LoadInst(pGEP->getType()->getPointerElementType(), pGEP, LI->getName(), LI);
931-
}
932-
#else
933-
GetElementPtrInst* operand = dyn_cast<GetElementPtrInst>(LI->getOperand(0));
934-
if (!operand || operand->getNumIndices() != 1)
935-
{
936-
return recoverNonScalarizableInst(LI);
937-
}
938-
// Apply the bit-cast on the GEP base and add base-offset then fix the index by multiply it with numElements. (assuming one index only).
939-
Value* GepPtr = operand->getPointerOperand();
940-
PointerType* GepPtrType = cast<PointerType>(GepPtr->getType());
941-
Value* operandBase = BitCastInst::CreatePointerCast(GepPtr, dataType->getScalarType()->getPointerTo(GepPtrType->getAddressSpace()), "ptrVec2ptrScl", LI);
942-
Type* indexType = operand->getOperand(1)->getType();
943-
// Generate new (scalar) instructions
944-
Value* newScalarizedInsts[MAX_INPUT_VECTOR_WIDTH];
945-
Constant* elementNumVal = ConstantInt::get(indexType, numElements);
946-
for (unsigned dup = 0; dup < numDupElements; dup++)
947-
{
948-
Constant* laneVal = ConstantInt::get(indexType, dup);
949-
Value* pGEP = GetElementPtrInst::Create(operandBase, laneVal, "GEP_lane", LI);
950-
Value* pIndex = BinaryOperator::CreateMul(operand->getOperand(1), elementNumVal, "GEPIndex_s", LI);
951-
pGEP = GetElementPtrInst::Create(pGEP, pIndex, "GEP_s", LI);
952-
newScalarizedInsts[dup] = new LoadInst(pGEP, LI->getName(), LI);
953-
}
954-
#endif
955-
// Add new value/s to SCM
956-
updateSCMEntryWithValues(newEntry, &(newScalarizedInsts[0]), LI, true);
957-
958-
// Remove original instruction
959-
m_removedInsts.insert(LI);
960-
return;
961-
}
962-
return recoverNonScalarizableInst(LI);
963-
}
964-
965-
void ScalarizeFunction::scalarizeInstruction(StoreInst* SI)
966-
{
967-
V_PRINT(scalarizer, "\t\tStore instruction\n");
968-
IGC_ASSERT_MESSAGE(SI, "instruction type dynamic cast failed");
969-
970-
int indexPtr = SI->getPointerOperandIndex();
971-
int indexData = 1 - indexPtr;
972-
VectorType* dataType = dyn_cast<VectorType>(SI->getOperand(indexData)->getType());
973-
if (isScalarizableLoadStoreType(dataType) && m_pDL)
974-
{
975-
// Get additional info from instruction
976-
unsigned int vectorSize = int_cast<unsigned int>(m_pDL->getTypeAllocSize(dataType));
977-
unsigned int elementSize = int_cast<unsigned int>(m_pDL->getTypeSizeInBits(dataType->getElementType()) / 8);
978-
IGC_ASSERT(elementSize);
979-
IGC_ASSERT_MESSAGE((vectorSize / elementSize > 0), "vector size should be a multiply of element size");
980-
IGC_ASSERT_MESSAGE((vectorSize % elementSize == 0), "vector size should be a multiply of element size");
981-
982-
unsigned numDupElements = int_cast<unsigned>(dataType->getNumElements());
983-
984-
// Obtain scalarized arguments
985-
// 1 - to allow scalarizing Load with any pointer type
986-
// 0 - to limit scalarizing to special case where packetizer benifit from the scalarizing
987-
#if 1
988-
SmallVector<Value*, MAX_INPUT_VECTOR_WIDTH>operand0;
989-
990-
bool opIsConst;
991-
obtainScalarizedValues(operand0, &opIsConst, SI->getOperand(indexData), SI);
992-
993-
// Apply the bit-cast on the GEP base and add base-offset then fix the index by multiply it with numElements. (assuming one index only).
994-
Value* GepPtr = SI->getOperand(indexPtr);
995-
PointerType* GepPtrType = cast<PointerType>(GepPtr->getType());
996-
Value* operandBase = BitCastInst::CreatePointerCast(GepPtr, dataType->getScalarType()->getPointerTo(GepPtrType->getAddressSpace()), "ptrVec2ptrScl", SI);
997-
Type* indexType = Type::getInt32Ty(*m_moduleContext);
998-
// Generate new (scalar) instructions
999-
for (unsigned dup = 0; dup < numDupElements; dup++)
1000-
{
1001-
Constant* laneVal = ConstantInt::get(indexType, dup);
1002-
Value* pGEP = GetElementPtrInst::Create(nullptr, operandBase, laneVal, "GEP_lane", SI);
1003-
new StoreInst(operand0[dup], pGEP, SI);
1004-
}
1005-
#else
1006-
GetElementPtrInst* operand1 = dyn_cast<GetElementPtrInst>(SI->getOperand(indexPtr));
1007-
if (!operand1 || operand1->getNumIndices() != 1)
1008-
{
1009-
return recoverNonScalarizableInst(SI);
1010-
}
1011-
Value* operand0[MAX_INPUT_VECTOR_WIDTH];
1012-
bool opIsConst;
1013-
obtainScalarizedValues(operand0, &opIsConst, SI->getOperand(indexData), SI);
1014-
1015-
// Apply the bit-cast on the GEP base and add base-offset then fix the index by multiply it with numElements. (assuming one index only).
1016-
Value* GepPtr = operand1->getPointerOperand();
1017-
PointerType* GepPtrType = cast<PointerType>(GepPtr->getType());
1018-
Value* operandBase = BitCastInst::CreatePointerCast(GepPtr, dataType->getScalarType()->getPointerTo(GepPtrType->getAddressSpace()), "ptrVec2ptrScl", SI);
1019-
Type* indexType = operand1->getOperand(1)->getType();
1020-
// Generate new (scalar) instructions
1021-
Constant* elementNumVal = ConstantInt::get(indexType, numElements);
1022-
for (unsigned dup = 0; dup < numDupElements; dup++)
1023-
{
1024-
Constant* laneVal = ConstantInt::get(indexType, dup);
1025-
Value* pGEP = GetElementPtrInst::Create(operandBase, laneVal, "GEP_lane", SI);
1026-
Value* pIndex = BinaryOperator::CreateMul(operand1->getOperand(1), elementNumVal, "GEPIndex_s", SI);
1027-
pGEP = GetElementPtrInst::Create(pGEP, pIndex, "GEP_s", SI);
1028-
new StoreInst(operand0[dup], pGEP, SI);
1029-
}
1030-
#endif
1031-
// Remove original instruction
1032-
m_removedInsts.insert(SI);
1033-
return;
1034-
}
1035-
return recoverNonScalarizableInst(SI);
1036-
}
1037-
10381007
void ScalarizeFunction::obtainScalarizedValues(SmallVectorImpl<Value*>& retValues, bool* retIsConstant,
10391008
Value* origValue, Instruction* origInst, int destIdx)
10401009
{
@@ -1411,17 +1380,9 @@ void ScalarizeFunction::resolveDeferredInstructions()
14111380
m_DRL.clear();
14121381
}
14131382

1414-
bool ScalarizeFunction::isScalarizableLoadStoreType(VectorType* type)
1415-
{
1416-
// Scalarize Load/Store worth doing only if:
1417-
// 1. Gather/Scatter are supported
1418-
// 2. Load/Store type is a vector
1419-
return (m_ScalarizingVectorLDSTType && (NULL != type));
1420-
}
1421-
1422-
extern "C" FunctionPass* createScalarizerPass(bool scalarizingVectorLDSTType)
1383+
extern "C" FunctionPass* createScalarizerPass(bool selectiveScalarization)
14231384
{
1424-
return new ScalarizeFunction(scalarizingVectorLDSTType);
1385+
return new ScalarizeFunction(selectiveScalarization);
14251386
}
14261387

14271388

0 commit comments

Comments
 (0)