
Commit cfc2144

Gang Y Chen authored and igcbot committed
Selective scalarization to reduce unnecessary insert/extract;
keep vector PHIs for vector coalescing.
1 parent 21dd835 commit cfc2144

File tree

3 files changed: +142 -180 lines changed

IGC/AdaptorOCL/UnifyIROCL.cpp

Lines changed: 2 additions & 2 deletions

@@ -510,10 +510,10 @@ static void CommonOCLBasedPasses(
     mpm.add(createSROAPass());
     mpm.add(createIGCInstructionCombiningPass());

-    // "false" to createScalarizerPass() means that vector load/stores are NOT scalarized
+    // true means selective scalarization
     if (IGC_IS_FLAG_ENABLED(DisableScalarizerGPGPU) == false)
     {
-        mpm.add(createScalarizerPass(false));
+        mpm.add(createScalarizerPass(true));
     }

     // Create a dummy kernel to attach the symbol table if necessary

IGC/Compiler/Optimizer/Scalarizer.cpp

Lines changed: 128 additions & 163 deletions

@@ -69,12 +69,12 @@ IGC_INITIALIZE_PASS_END(ScalarizeFunction, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG

 char ScalarizeFunction::ID = 0;

-ScalarizeFunction::ScalarizeFunction(bool scalarizingVectorLDSTType) : FunctionPass(ID)
+ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : FunctionPass(ID)
 {
     initializeScalarizeFunctionPass(*PassRegistry::getPassRegistry());

     for (int i = 0; i < Instruction::OtherOpsEnd; i++) m_transposeCtr[i] = 0;
-    m_ScalarizingVectorLDSTType = scalarizingVectorLDSTType;
+    m_SelectiveScalarization = selectiveScalarization;

     // Initialize SCM buffers and allocation
     m_SCMAllocationArray = new SCMEntry[ESTIMATED_INST_NUM];
@@ -121,6 +121,13 @@ bool ScalarizeFunction::runOnFunction(Function& F)
     m_SCM.clear();
     releaseAllSCMEntries();
     m_DRL.clear();
+    m_Excludes.clear();
+
+    // collecting instructions that we want to avoid scalarization
+    if (m_SelectiveScalarization)
+    {
+        buildExclusiveSet();
+    }

     // Scalarization. Iterate over all the instructions
     // Always hold the iterator at the instruction following the one being scalarized (so the
@@ -132,7 +139,10 @@ bool ScalarizeFunction::runOnFunction(Function& F)
         Instruction* currInst = &*sI;
         // Move iterator to next instruction BEFORE scalarizing current instruction
         ++sI;
-        dispatchInstructionToScalarize(currInst);
+        if (!m_Excludes.count(currInst))
+        {
+            dispatchInstructionToScalarize(currInst);
+        }
     }

     resolveVectorValues();
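
Two details in the loop above are worth calling out: the iterator is advanced before the current instruction is dispatched (scalarization may erase it from the function), and anything recorded in m_Excludes is skipped entirely, so it survives in vector form. A minimal standalone sketch of that iteration pattern, using a toy instruction list rather than LLVM's (nothing below is IGC or LLVM API):

#include <list>
#include <set>
#include <string>

using ToyInst = std::string;

void runOnToyFunction(std::list<ToyInst>& insts, const std::set<ToyInst>& excludes)
{
    for (auto it = insts.begin(); it != insts.end(); )
    {
        ToyInst* curr = &*it;
        ++it;                        // advance BEFORE curr may be erased
        if (excludes.count(*curr))
        {
            continue;                // excluded: keep the vector form as-is
        }
        // scalarization of *curr would go here; it may remove *curr from the
        // list, but the iterator already points past it, so it stays valid
    }
}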
@@ -161,6 +171,119 @@ bool ScalarizeFunction::runOnFunction(Function& F)
     return true;
 }

+void ScalarizeFunction::buildExclusiveSet()
+{
+    inst_iterator sI = inst_begin(m_currFunc);
+    inst_iterator sE = inst_end(m_currFunc);
+    std::vector<llvm::Value*> workset;
+    while (sI != sE)
+    {
+        Instruction* currInst = &*sI;
+        ++sI;
+        if (CallInst* CI = dyn_cast<CallInst>(currInst))
+        {
+            unsigned numOperands = CI->getNumArgOperands();
+            for (unsigned i = 0; i < numOperands; i++)
+            {
+                Value* operand = CI->getArgOperand(i);
+                if (isa<VectorType>(operand->getType()))
+                {
+                    workset.push_back(operand);
+                }
+            }
+        }
+        else if (auto IEI = dyn_cast<InsertElementInst>(currInst))
+        {
+            Value* scalarIndexVal = IEI->getOperand(2);
+            // If the index is not a constant - we cannot statically remove this inst
+            if (!isa<ConstantInt>(scalarIndexVal)) {
+                workset.push_back(IEI);
+            }
+        }
+        else if (auto EEI = dyn_cast<ExtractElementInst>(currInst))
+        {
+            Value* scalarIndexVal = EEI->getOperand(1);
+            // If the index is not a constant - we cannot statically remove this inst
+            if (!isa<ConstantInt>(scalarIndexVal)) {
+                workset.push_back(EEI->getOperand(0));
+            }
+        }
+        else if (auto STI = dyn_cast<StoreInst>(currInst))
+        {
+            auto V = STI->getValueOperand();
+            if (V->getType()->isVectorTy())
+            {
+                workset.push_back(V);
+            }
+        }
+    }
+    while (!workset.empty())
+    {
+        auto Def = workset.back();
+        workset.pop_back();
+        if (m_Excludes.count(Def))
+        {
+            continue;
+        }
+        if (auto IEI = dyn_cast<InsertElementInst>(Def))
+        {
+            m_Excludes.insert(IEI);
+            if (!m_Excludes.count(IEI->getOperand(0)) &&
+                (isa<PHINode>(IEI->getOperand(0)) ||
+                 isa<ShuffleVectorInst>(IEI->getOperand(0)) ||
+                 isa<InsertElementInst>(IEI->getOperand(0))))
+            {
+                workset.push_back(IEI->getOperand(0));
+            }
+        }
+        else if (auto SVI = dyn_cast<ShuffleVectorInst>(Def))
+        {
+            m_Excludes.insert(SVI);
+            if (!m_Excludes.count(SVI->getOperand(0)) &&
+                (isa<PHINode>(SVI->getOperand(0)) ||
+                 isa<ShuffleVectorInst>(SVI->getOperand(0)) ||
+                 isa<InsertElementInst>(SVI->getOperand(0))))
+            {
+                workset.push_back(SVI->getOperand(0));
+            }
+            if (!m_Excludes.count(SVI->getOperand(1)) &&
+                (isa<PHINode>(SVI->getOperand(1)) ||
+                 isa<ShuffleVectorInst>(SVI->getOperand(1)) ||
+                 isa<InsertElementInst>(SVI->getOperand(1))))
+            {
+                workset.push_back(SVI->getOperand(1));
+            }
+        }
+        else if (auto PHI = dyn_cast<PHINode>(Def))
+        {
+            m_Excludes.insert(PHI);
+            for (int i = 0, n = PHI->getNumOperands(); i < n; ++i)
+                if (!m_Excludes.count(PHI->getOperand(i)) &&
+                    (isa<PHINode>(PHI->getOperand(i)) ||
+                     isa<ShuffleVectorInst>(PHI->getOperand(i)) ||
+                     isa<InsertElementInst>(PHI->getOperand(i))))
+                {
+                    workset.push_back(PHI->getOperand(i));
+                }
+        }
+        else
+        {
+            continue;
+        }
+        // check use
+        for (auto U : Def->users())
+        {
+            if (!m_Excludes.count(U) &&
+                (isa<PHINode>(U) ||
+                 isa<ShuffleVectorInst>(U) ||
+                 isa<InsertElementInst>(U)))
+            {
+                workset.push_back(U);
+            }
+        }
+    }
+}
+
 void ScalarizeFunction::dispatchInstructionToScalarize(Instruction* I)
 {
     V_PRINT(scalarizer, "\tScalarizing Instruction: " << *I << "\n");
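
buildExclusiveSet() above is a two-phase worklist algorithm. The first loop seeds the workset with values that must stay in vector form: vector arguments of calls, insertelement/extractelement with a non-constant index, and stored vector values. The second loop closes the set over def-use chains, but it only steps through PHI, shufflevector, and insertelement nodes, so exactly the vector "glue" chains (including the vector PHIs the commit message mentions) are kept intact for later vector coalescing, while everything else is still scalarized. A compact standalone model of the closure phase, using a hypothetical Node graph in place of LLVM IR:

#include <cstdio>
#include <set>
#include <vector>

enum Kind { Phi, Shuffle, Insert, Other };

struct Node
{
    Kind kind;
    std::vector<Node*> operands; // defs this node reads
    std::vector<Node*> users;    // nodes that read this one
};

// The closure may only walk through these "glue" kinds.
static bool isVectorGlue(const Node* N)
{
    return N->kind == Phi || N->kind == Shuffle || N->kind == Insert;
}

static std::set<const Node*> closeOverGlue(std::vector<Node*> workset)
{
    std::set<const Node*> excludes;
    while (!workset.empty())
    {
        Node* def = workset.back();
        workset.pop_back();
        if (excludes.count(def) || !isVectorGlue(def))
        {
            continue; // already handled, or a non-glue seed: stop here
        }
        excludes.insert(def);
        for (Node* op : def->operands)      // walk towards definitions
        {
            if (!excludes.count(op) && isVectorGlue(op))
                workset.push_back(op);
        }
        for (Node* u : def->users)          // and towards uses
        {
            if (!excludes.count(u) && isVectorGlue(u))
                workset.push_back(u);
        }
    }
    return excludes;
}

int main()
{
    // insertelement -> phi -> shufflevector chain, seeded at the insert.
    Node ins{ Insert, {}, {} }, phi{ Phi, {}, {} }, shuf{ Shuffle, {}, {} };
    phi.operands = { &ins };  ins.users = { &phi };
    shuf.operands = { &phi }; phi.users = { &shuf };
    std::printf("excluded: %zu nodes\n", closeOverGlue({ &ins }).size());
    return 0;
}

On this toy graph the program prints "excluded: 3 nodes": the whole chain stays vectorized, which is the intended effect on real IR.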
@@ -235,13 +358,6 @@ void ScalarizeFunction::dispatchInstructionToScalarize(Instruction* I)
     case Instruction::GetElementPtr:
         scalarizeInstruction(dyn_cast<GetElementPtrInst>(I));
         break;
-    case Instruction::Load:
-        scalarizeInstruction(dyn_cast<LoadInst>(I));
-        break;
-    case Instruction::Store:
-        scalarizeInstruction(dyn_cast<StoreInst>(I));
-        break;
-
     // The remaining instructions are not supported for scalarization. Keep "as is"
     default:
         recoverNonScalarizableInst(I);
@@ -892,149 +1008,6 @@ void ScalarizeFunction::scalarizeInstruction(GetElementPtrInst* GI)
     m_removedInsts.insert(GI);
 }

-void ScalarizeFunction::scalarizeInstruction(LoadInst* LI)
-{
-    V_PRINT(scalarizer, "\t\tLoad instruction\n");
-    IGC_ASSERT_MESSAGE(LI, "instruction type dynamic cast failed");
-
-    VectorType* dataType = dyn_cast<VectorType>(LI->getType());
-    if (isScalarizableLoadStoreType(dataType) && m_pDL)
-    {
-        // Prepare empty SCM entry for the instruction
-        SCMEntry* newEntry = getSCMEntry(LI);
-
-        // Get additional info from instruction
-        unsigned int vectorSize = int_cast<unsigned int>(m_pDL->getTypeAllocSize(dataType));
-        unsigned int elementSize = int_cast<unsigned int>(m_pDL->getTypeSizeInBits(dataType->getElementType()) / 8);
-        IGC_ASSERT(elementSize);
-        IGC_ASSERT_MESSAGE((vectorSize / elementSize > 0), "vector size should be a multiply of element size");
-        IGC_ASSERT_MESSAGE((vectorSize % elementSize == 0), "vector size should be a multiply of element size");
-        unsigned numDupElements = int_cast<unsigned>(dataType->getNumElements());
-
-        // Obtain scalarized arguments
-        // 1 - to allow scalarizing Load with any pointer type
-        // 0 - to limit scalarizing to special case where packetizer benifit from the scalarizing
-#if 1
-        // Apply the bit-cast on the GEP base and add base-offset then fix the index by multiply it with numElements. (assuming one index only).
-        Value * GepPtr = LI->getOperand(0);
-        PointerType* GepPtrType = cast<PointerType>(GepPtr->getType());
-        Value* operandBase = BitCastInst::CreatePointerCast(GepPtr, dataType->getScalarType()->getPointerTo(GepPtrType->getAddressSpace()), "ptrVec2ptrScl", LI);
-        Type* indexType = Type::getInt32Ty(*m_moduleContext);
-        // Generate new (scalar) instructions
-        SmallVector<Value*, MAX_INPUT_VECTOR_WIDTH>newScalarizedInsts;
-        newScalarizedInsts.resize(numDupElements);
-        for (unsigned dup = 0; dup < numDupElements; dup++)
-        {
-            Constant* laneVal = ConstantInt::get(indexType, dup);
-            Value* pGEP = GetElementPtrInst::Create(nullptr, operandBase, laneVal, "GEP_lane", LI);
-            newScalarizedInsts[dup] = new LoadInst(pGEP->getType()->getPointerElementType(), pGEP, LI->getName(), LI);
-        }
-#else
-        GetElementPtrInst* operand = dyn_cast<GetElementPtrInst>(LI->getOperand(0));
-        if (!operand || operand->getNumIndices() != 1)
-        {
-            return recoverNonScalarizableInst(LI);
-        }
-        // Apply the bit-cast on the GEP base and add base-offset then fix the index by multiply it with numElements. (assuming one index only).
-        Value* GepPtr = operand->getPointerOperand();
-        PointerType* GepPtrType = cast<PointerType>(GepPtr->getType());
-        Value* operandBase = BitCastInst::CreatePointerCast(GepPtr, dataType->getScalarType()->getPointerTo(GepPtrType->getAddressSpace()), "ptrVec2ptrScl", LI);
-        Type* indexType = operand->getOperand(1)->getType();
-        // Generate new (scalar) instructions
-        Value* newScalarizedInsts[MAX_INPUT_VECTOR_WIDTH];
-        Constant* elementNumVal = ConstantInt::get(indexType, numElements);
-        for (unsigned dup = 0; dup < numDupElements; dup++)
-        {
-            Constant* laneVal = ConstantInt::get(indexType, dup);
-            Value* pGEP = GetElementPtrInst::Create(operandBase, laneVal, "GEP_lane", LI);
-            Value* pIndex = BinaryOperator::CreateMul(operand->getOperand(1), elementNumVal, "GEPIndex_s", LI);
-            pGEP = GetElementPtrInst::Create(pGEP, pIndex, "GEP_s", LI);
-            newScalarizedInsts[dup] = new LoadInst(pGEP, LI->getName(), LI);
-        }
-#endif
-        // Add new value/s to SCM
-        updateSCMEntryWithValues(newEntry, &(newScalarizedInsts[0]), LI, true);
-
-        // Remove original instruction
-        m_removedInsts.insert(LI);
-        return;
-    }
-    return recoverNonScalarizableInst(LI);
-}
-
-void ScalarizeFunction::scalarizeInstruction(StoreInst* SI)
-{
-    V_PRINT(scalarizer, "\t\tStore instruction\n");
-    IGC_ASSERT_MESSAGE(SI, "instruction type dynamic cast failed");
-
-    int indexPtr = SI->getPointerOperandIndex();
-    int indexData = 1 - indexPtr;
-    VectorType* dataType = dyn_cast<VectorType>(SI->getOperand(indexData)->getType());
-    if (isScalarizableLoadStoreType(dataType) && m_pDL)
-    {
-        // Get additional info from instruction
-        unsigned int vectorSize = int_cast<unsigned int>(m_pDL->getTypeAllocSize(dataType));
-        unsigned int elementSize = int_cast<unsigned int>(m_pDL->getTypeSizeInBits(dataType->getElementType()) / 8);
-        IGC_ASSERT(elementSize);
-        IGC_ASSERT_MESSAGE((vectorSize / elementSize > 0), "vector size should be a multiply of element size");
-        IGC_ASSERT_MESSAGE((vectorSize % elementSize == 0), "vector size should be a multiply of element size");
-
-        unsigned numDupElements = int_cast<unsigned>(dataType->getNumElements());
-
-        // Obtain scalarized arguments
-        // 1 - to allow scalarizing Load with any pointer type
-        // 0 - to limit scalarizing to special case where packetizer benifit from the scalarizing
-#if 1
-        SmallVector<Value*, MAX_INPUT_VECTOR_WIDTH>operand0;
-
-        bool opIsConst;
-        obtainScalarizedValues(operand0, &opIsConst, SI->getOperand(indexData), SI);
-
-        // Apply the bit-cast on the GEP base and add base-offset then fix the index by multiply it with numElements. (assuming one index only).
-        Value* GepPtr = SI->getOperand(indexPtr);
-        PointerType* GepPtrType = cast<PointerType>(GepPtr->getType());
-        Value* operandBase = BitCastInst::CreatePointerCast(GepPtr, dataType->getScalarType()->getPointerTo(GepPtrType->getAddressSpace()), "ptrVec2ptrScl", SI);
-        Type* indexType = Type::getInt32Ty(*m_moduleContext);
-        // Generate new (scalar) instructions
-        for (unsigned dup = 0; dup < numDupElements; dup++)
-        {
-            Constant* laneVal = ConstantInt::get(indexType, dup);
-            Value* pGEP = GetElementPtrInst::Create(nullptr, operandBase, laneVal, "GEP_lane", SI);
-            new StoreInst(operand0[dup], pGEP, SI);
-        }
-#else
-        GetElementPtrInst* operand1 = dyn_cast<GetElementPtrInst>(SI->getOperand(indexPtr));
-        if (!operand1 || operand1->getNumIndices() != 1)
-        {
-            return recoverNonScalarizableInst(SI);
-        }
-        Value* operand0[MAX_INPUT_VECTOR_WIDTH];
-        bool opIsConst;
-        obtainScalarizedValues(operand0, &opIsConst, SI->getOperand(indexData), SI);
-
-        // Apply the bit-cast on the GEP base and add base-offset then fix the index by multiply it with numElements. (assuming one index only).
-        Value* GepPtr = operand1->getPointerOperand();
-        PointerType* GepPtrType = cast<PointerType>(GepPtr->getType());
-        Value* operandBase = BitCastInst::CreatePointerCast(GepPtr, dataType->getScalarType()->getPointerTo(GepPtrType->getAddressSpace()), "ptrVec2ptrScl", SI);
-        Type* indexType = operand1->getOperand(1)->getType();
-        // Generate new (scalar) instructions
-        Constant* elementNumVal = ConstantInt::get(indexType, numElements);
-        for (unsigned dup = 0; dup < numDupElements; dup++)
-        {
-            Constant* laneVal = ConstantInt::get(indexType, dup);
-            Value* pGEP = GetElementPtrInst::Create(operandBase, laneVal, "GEP_lane", SI);
-            Value* pIndex = BinaryOperator::CreateMul(operand1->getOperand(1), elementNumVal, "GEPIndex_s", SI);
-            pGEP = GetElementPtrInst::Create(pGEP, pIndex, "GEP_s", SI);
-            new StoreInst(operand0[dup], pGEP, SI);
-        }
-#endif
-        // Remove original instruction
-        m_removedInsts.insert(SI);
-        return;
-    }
-    return recoverNonScalarizableInst(SI);
-}
-
 void ScalarizeFunction::obtainScalarizedValues(SmallVectorImpl<Value*>& retValues, bool* retIsConstant,
     Value* origValue, Instruction* origInst, int destIdx)
 {
@@ -1411,17 +1384,9 @@ void ScalarizeFunction::resolveDeferredInstructions()
     m_DRL.clear();
 }

-bool ScalarizeFunction::isScalarizableLoadStoreType(VectorType* type)
-{
-    // Scalarize Load/Store worth doing only if:
-    // 1. Gather/Scatter are supported
-    // 2. Load/Store type is a vector
-    return (m_ScalarizingVectorLDSTType && (NULL != type));
-}
-
-extern "C" FunctionPass* createScalarizerPass(bool scalarizingVectorLDSTType)
+extern "C" FunctionPass* createScalarizerPass(bool selectiveScalarization)
 {
-    return new ScalarizeFunction(scalarizingVectorLDSTType);
+    return new ScalarizeFunction(selectiveScalarization);
 }

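For callers, the signature of createScalarizerPass() is unchanged; only the meaning of the boolean is new. A hedged usage sketch (the addScalarizer wrapper is hypothetical; the pass-manager call mirrors the UnifyIROCL.cpp hunk above):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"

extern "C" llvm::FunctionPass* createScalarizerPass(bool selectiveScalarization);

// true  = selective scalarization: excluded vector chains (vector call
//         arguments, dynamic-index insert/extract, stored vectors, and the
//         PHI/shuffle/insert glue feeding them) keep their vector form.
// false = scalarize every supported instruction, as before this commit.
void addScalarizer(llvm::legacy::PassManagerBase& mpm, bool selective)
{
    mpm.add(createScalarizerPass(selective));
}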