Commit aa32999

Gang Y Chen authored and igcbot committed

[Autobackout][FuncReg] Revert of change: 172c6d2
Make scalarization more selective; avoid cases where a phi is scalarized but introduces more moves later on.
1 parent 6f87a02 commit aa32999
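
For context on the reverted heuristic: change 172c6d2 taught the scalarizer to skip vector values (phis, shufflevectors, insertelements) that are ultimately consumed as whole vectors, since splitting such a phi into scalar lanes only forces the lanes to be re-packed with extra moves before the vector use. A minimal, hypothetical sketch of that idea against the LLVM C++ API; the helper name and the exact user check are illustrative and do not appear in the source:

    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Sketch only: a vector phi whose result is still consumed as a whole vector
    // (e.g. passed to a call or stored) gains nothing from scalarization, because
    // the scalar lanes must be reassembled (extra moves) right before that use.
    static bool phiIsConsumedAsWholeVector(const PHINode* PN)
    {
        if (!PN->getType()->isVectorTy())
            return false;
        for (const User* U : PN->users())
            if (isa<CallInst>(U) || isa<StoreInst>(U))
                return true;
        return false;
    }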

File tree

3 files changed
+180 -146 lines changed


IGC/AdaptorOCL/UnifyIROCL.cpp

Lines changed: 2 additions & 2 deletions
@@ -510,10 +510,10 @@ static void CommonOCLBasedPasses(
     mpm.add(createSROAPass());
     mpm.add(createIGCInstructionCombiningPass());
 
-    // true means selective scalarization
+    // "false" to createScalarizerPass() means that vector load/stores are NOT scalarized
     if (IGC_IS_FLAG_ENABLED(DisableScalarizerGPGPU) == false)
     {
-        mpm.add(createScalarizerPass(true));
+        mpm.add(createScalarizerPass(false));
     }
 
     // Create a dummy kernel to attach the symbol table if necessary
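
Note on the boolean argument: with this revert it no longer selects "selective scalarization" but controls whether vector load/store instructions are scalarized, as the renamed parameter in Scalarizer.cpp below suggests. A sketch of the two call forms (an assumed reading of the diff, not additional call sites in the source):

    mpm.add(createScalarizerPass(true));   // also split vector loads/stores into per-lane scalar accesses
    mpm.add(createScalarizerPass(false));  // leave vector loads/stores intact -- the choice taken above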

IGC/Compiler/Optimizer/Scalarizer.cpp

Lines changed: 163 additions & 132 deletions
@@ -69,12 +69,12 @@ IGC_INITIALIZE_PASS_END(ScalarizeFunction, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG
 
 char ScalarizeFunction::ID = 0;
 
-ScalarizeFunction::ScalarizeFunction(bool selectiveScalarization) : FunctionPass(ID)
+ScalarizeFunction::ScalarizeFunction(bool scalarizingVectorLDSTType) : FunctionPass(ID)
 {
     initializeScalarizeFunctionPass(*PassRegistry::getPassRegistry());
 
     for (int i = 0; i < Instruction::OtherOpsEnd; i++) m_transposeCtr[i] = 0;
-    m_SelectiveScalarization = selectiveScalarization;
+    m_ScalarizingVectorLDSTType = scalarizingVectorLDSTType;
 
     // Initialize SCM buffers and allocation
     m_SCMAllocationArray = new SCMEntry[ESTIMATED_INST_NUM];
@@ -121,13 +121,6 @@ bool ScalarizeFunction::runOnFunction(Function& F)
     m_SCM.clear();
     releaseAllSCMEntries();
     m_DRL.clear();
-    m_Excludes.clear();
-
-    // collecting instructions that we want to avoid scalarization
-    if (m_SelectiveScalarization)
-    {
-        buildExclusiveSet();
-    }
 
     // Scalarization. Iterate over all the instructions
     // Always hold the iterator at the instruction following the one being scalarized (so the
@@ -139,14 +132,7 @@ bool ScalarizeFunction::runOnFunction(Function& F)
         Instruction* currInst = &*sI;
         // Move iterator to next instruction BEFORE scalarizing current instruction
        ++sI;
-        if (m_Excludes.count(currInst))
-        {
-            recoverNonScalarizableInst(currInst);
-        }
-        else
-        {
-            dispatchInstructionToScalarize(currInst);
-        }
+        dispatchInstructionToScalarize(currInst);
     }
 
     resolveVectorValues();
@@ -175,119 +161,6 @@ bool ScalarizeFunction::runOnFunction(Function& F)
     return true;
 }
 
-void ScalarizeFunction::buildExclusiveSet()
-{
-    inst_iterator sI = inst_begin(m_currFunc);
-    inst_iterator sE = inst_end(m_currFunc);
-    std::vector<llvm::Value*> workset;
-    while (sI != sE)
-    {
-        Instruction* currInst = &*sI;
-        ++sI;
-        if (CallInst* CI = dyn_cast<CallInst>(currInst))
-        {
-            unsigned numOperands = CI->getNumArgOperands();
-            for (unsigned i = 0; i < numOperands; i++)
-            {
-                Value* operand = CI->getArgOperand(i);
-                if (isa<VectorType>(operand->getType()))
-                {
-                    workset.push_back(operand);
-                }
-            }
-        }
-        else if (auto IEI = dyn_cast<InsertElementInst>(currInst))
-        {
-            Value* scalarIndexVal = IEI->getOperand(2);
-            // If the index is not a constant - we cannot statically remove this inst
-            if (!isa<ConstantInt>(scalarIndexVal)) {
-                workset.push_back(IEI);
-            }
-        }
-        else if (auto EEI = dyn_cast<ExtractElementInst>(currInst))
-        {
-            Value* scalarIndexVal = EEI->getOperand(1);
-            // If the index is not a constant - we cannot statically remove this inst
-            if (!isa<ConstantInt>(scalarIndexVal)) {
-                workset.push_back(EEI->getOperand(0));
-            }
-        }
-        else if (auto STI = dyn_cast<StoreInst>(currInst))
-        {
-            auto V = STI->getValueOperand();
-            if (V->getType()->isVectorTy())
-            {
-                workset.push_back(V);
-            }
-        }
-    }
-    while (!workset.empty())
-    {
-        auto Def = workset.back();
-        workset.pop_back();
-        if (m_Excludes.count(Def))
-        {
-            continue;
-        }
-        if (auto IEI = dyn_cast<InsertElementInst>(Def))
-        {
-            m_Excludes.insert(IEI);
-            if (!m_Excludes.count(IEI->getOperand(0)) &&
-                (isa<PHINode>(IEI->getOperand(0)) ||
-                 isa<ShuffleVectorInst>(IEI->getOperand(0)) ||
-                 isa<InsertElementInst>(IEI->getOperand(0))))
-            {
-                workset.push_back(IEI->getOperand(0));
-            }
-        }
-        else if (auto SVI = dyn_cast<ShuffleVectorInst>(Def))
-        {
-            m_Excludes.insert(SVI);
-            if (!m_Excludes.count(SVI->getOperand(0)) &&
-                (isa<PHINode>(SVI->getOperand(0)) ||
-                 isa<ShuffleVectorInst>(SVI->getOperand(0)) ||
-                 isa<InsertElementInst>(SVI->getOperand(0))))
-            {
-                workset.push_back(SVI->getOperand(0));
-            }
-            if (!m_Excludes.count(SVI->getOperand(1)) &&
-                (isa<PHINode>(SVI->getOperand(1)) ||
-                 isa<ShuffleVectorInst>(SVI->getOperand(1)) ||
-                 isa<InsertElementInst>(SVI->getOperand(1))))
-            {
-                workset.push_back(SVI->getOperand(1));
-            }
-        }
-        else if (auto PHI = dyn_cast<PHINode>(Def))
-        {
-            m_Excludes.insert(PHI);
-            for (int i = 0, n = PHI->getNumOperands(); i < n; ++i)
-                if (!m_Excludes.count(PHI->getOperand(i)) &&
-                    (isa<PHINode>(PHI->getOperand(i)) ||
-                     isa<ShuffleVectorInst>(PHI->getOperand(i)) ||
-                     isa<InsertElementInst>(PHI->getOperand(i))))
-                {
-                    workset.push_back(PHI->getOperand(i));
-                }
-        }
-        else
-        {
-            continue;
-        }
-        // check use
-        for (auto U : Def->users())
-        {
-            if (!m_Excludes.count(U) &&
-                (isa<PHINode>(U) ||
-                 isa<ShuffleVectorInst>(U) ||
-                 isa<InsertElementInst>(U)))
-            {
-                workset.push_back(U);
-            }
-        }
-    }
-}
-
 void ScalarizeFunction::dispatchInstructionToScalarize(Instruction* I)
 {
     V_PRINT(scalarizer, "\tScalarizing Instruction: " << *I << "\n");
@@ -362,6 +235,13 @@ void ScalarizeFunction::dispatchInstructionToScalarize(Instruction* I)
     case Instruction::GetElementPtr:
         scalarizeInstruction(dyn_cast<GetElementPtrInst>(I));
         break;
+    case Instruction::Load:
+        scalarizeInstruction(dyn_cast<LoadInst>(I));
+        break;
+    case Instruction::Store:
+        scalarizeInstruction(dyn_cast<StoreInst>(I));
+        break;
+
     // The remaining instructions are not supported for scalarization. Keep "as is"
     default:
         recoverNonScalarizableInst(I);
@@ -1012,6 +892,149 @@ void ScalarizeFunction::scalarizeInstruction(GetElementPtrInst* GI)
     m_removedInsts.insert(GI);
 }
 
+void ScalarizeFunction::scalarizeInstruction(LoadInst* LI)
+{
+    V_PRINT(scalarizer, "\t\tLoad instruction\n");
+    IGC_ASSERT_MESSAGE(LI, "instruction type dynamic cast failed");
+
+    VectorType* dataType = dyn_cast<VectorType>(LI->getType());
+    if (isScalarizableLoadStoreType(dataType) && m_pDL)
+    {
+        // Prepare empty SCM entry for the instruction
+        SCMEntry* newEntry = getSCMEntry(LI);
+
+        // Get additional info from instruction
+        unsigned int vectorSize = int_cast<unsigned int>(m_pDL->getTypeAllocSize(dataType));
+        unsigned int elementSize = int_cast<unsigned int>(m_pDL->getTypeSizeInBits(dataType->getElementType()) / 8);
+        IGC_ASSERT(elementSize);
+        IGC_ASSERT_MESSAGE((vectorSize / elementSize > 0), "vector size should be a multiply of element size");
+        IGC_ASSERT_MESSAGE((vectorSize % elementSize == 0), "vector size should be a multiply of element size");
+        unsigned numDupElements = int_cast<unsigned>(dataType->getNumElements());
+
+        // Obtain scalarized arguments
+        // 1 - to allow scalarizing Load with any pointer type
+        // 0 - to limit scalarizing to special case where packetizer benifit from the scalarizing
+#if 1
+        // Apply the bit-cast on the GEP base and add base-offset then fix the index by multiply it with numElements. (assuming one index only).
+        Value* GepPtr = LI->getOperand(0);
+        PointerType* GepPtrType = cast<PointerType>(GepPtr->getType());
+        Value* operandBase = BitCastInst::CreatePointerCast(GepPtr, dataType->getScalarType()->getPointerTo(GepPtrType->getAddressSpace()), "ptrVec2ptrScl", LI);
+        Type* indexType = Type::getInt32Ty(*m_moduleContext);
+        // Generate new (scalar) instructions
+        SmallVector<Value*, MAX_INPUT_VECTOR_WIDTH> newScalarizedInsts;
+        newScalarizedInsts.resize(numDupElements);
+        for (unsigned dup = 0; dup < numDupElements; dup++)
+        {
+            Constant* laneVal = ConstantInt::get(indexType, dup);
+            Value* pGEP = GetElementPtrInst::Create(nullptr, operandBase, laneVal, "GEP_lane", LI);
+            newScalarizedInsts[dup] = new LoadInst(pGEP->getType()->getPointerElementType(), pGEP, LI->getName(), LI);
+        }
+#else
+        GetElementPtrInst* operand = dyn_cast<GetElementPtrInst>(LI->getOperand(0));
+        if (!operand || operand->getNumIndices() != 1)
+        {
+            return recoverNonScalarizableInst(LI);
+        }
+        // Apply the bit-cast on the GEP base and add base-offset then fix the index by multiply it with numElements. (assuming one index only).
+        Value* GepPtr = operand->getPointerOperand();
+        PointerType* GepPtrType = cast<PointerType>(GepPtr->getType());
+        Value* operandBase = BitCastInst::CreatePointerCast(GepPtr, dataType->getScalarType()->getPointerTo(GepPtrType->getAddressSpace()), "ptrVec2ptrScl", LI);
+        Type* indexType = operand->getOperand(1)->getType();
+        // Generate new (scalar) instructions
+        Value* newScalarizedInsts[MAX_INPUT_VECTOR_WIDTH];
+        Constant* elementNumVal = ConstantInt::get(indexType, numElements);
+        for (unsigned dup = 0; dup < numDupElements; dup++)
+        {
+            Constant* laneVal = ConstantInt::get(indexType, dup);
+            Value* pGEP = GetElementPtrInst::Create(operandBase, laneVal, "GEP_lane", LI);
+            Value* pIndex = BinaryOperator::CreateMul(operand->getOperand(1), elementNumVal, "GEPIndex_s", LI);
+            pGEP = GetElementPtrInst::Create(pGEP, pIndex, "GEP_s", LI);
+            newScalarizedInsts[dup] = new LoadInst(pGEP, LI->getName(), LI);
+        }
+#endif
+        // Add new value/s to SCM
+        updateSCMEntryWithValues(newEntry, &(newScalarizedInsts[0]), LI, true);
+
+        // Remove original instruction
+        m_removedInsts.insert(LI);
+        return;
+    }
+    return recoverNonScalarizableInst(LI);
+}
+
+void ScalarizeFunction::scalarizeInstruction(StoreInst* SI)
+{
+    V_PRINT(scalarizer, "\t\tStore instruction\n");
+    IGC_ASSERT_MESSAGE(SI, "instruction type dynamic cast failed");
+
+    int indexPtr = SI->getPointerOperandIndex();
+    int indexData = 1 - indexPtr;
+    VectorType* dataType = dyn_cast<VectorType>(SI->getOperand(indexData)->getType());
+    if (isScalarizableLoadStoreType(dataType) && m_pDL)
+    {
+        // Get additional info from instruction
+        unsigned int vectorSize = int_cast<unsigned int>(m_pDL->getTypeAllocSize(dataType));
+        unsigned int elementSize = int_cast<unsigned int>(m_pDL->getTypeSizeInBits(dataType->getElementType()) / 8);
+        IGC_ASSERT(elementSize);
+        IGC_ASSERT_MESSAGE((vectorSize / elementSize > 0), "vector size should be a multiply of element size");
+        IGC_ASSERT_MESSAGE((vectorSize % elementSize == 0), "vector size should be a multiply of element size");
+
+        unsigned numDupElements = int_cast<unsigned>(dataType->getNumElements());
+
+        // Obtain scalarized arguments
+        // 1 - to allow scalarizing Load with any pointer type
+        // 0 - to limit scalarizing to special case where packetizer benifit from the scalarizing
+#if 1
+        SmallVector<Value*, MAX_INPUT_VECTOR_WIDTH> operand0;
+
+        bool opIsConst;
+        obtainScalarizedValues(operand0, &opIsConst, SI->getOperand(indexData), SI);
+
+        // Apply the bit-cast on the GEP base and add base-offset then fix the index by multiply it with numElements. (assuming one index only).
+        Value* GepPtr = SI->getOperand(indexPtr);
+        PointerType* GepPtrType = cast<PointerType>(GepPtr->getType());
+        Value* operandBase = BitCastInst::CreatePointerCast(GepPtr, dataType->getScalarType()->getPointerTo(GepPtrType->getAddressSpace()), "ptrVec2ptrScl", SI);
+        Type* indexType = Type::getInt32Ty(*m_moduleContext);
+        // Generate new (scalar) instructions
+        for (unsigned dup = 0; dup < numDupElements; dup++)
+        {
+            Constant* laneVal = ConstantInt::get(indexType, dup);
+            Value* pGEP = GetElementPtrInst::Create(nullptr, operandBase, laneVal, "GEP_lane", SI);
+            new StoreInst(operand0[dup], pGEP, SI);
+        }
+#else
+        GetElementPtrInst* operand1 = dyn_cast<GetElementPtrInst>(SI->getOperand(indexPtr));
+        if (!operand1 || operand1->getNumIndices() != 1)
+        {
+            return recoverNonScalarizableInst(SI);
+        }
+        Value* operand0[MAX_INPUT_VECTOR_WIDTH];
+        bool opIsConst;
+        obtainScalarizedValues(operand0, &opIsConst, SI->getOperand(indexData), SI);
+
+        // Apply the bit-cast on the GEP base and add base-offset then fix the index by multiply it with numElements. (assuming one index only).
+        Value* GepPtr = operand1->getPointerOperand();
+        PointerType* GepPtrType = cast<PointerType>(GepPtr->getType());
+        Value* operandBase = BitCastInst::CreatePointerCast(GepPtr, dataType->getScalarType()->getPointerTo(GepPtrType->getAddressSpace()), "ptrVec2ptrScl", SI);
+        Type* indexType = operand1->getOperand(1)->getType();
+        // Generate new (scalar) instructions
+        Constant* elementNumVal = ConstantInt::get(indexType, numElements);
+        for (unsigned dup = 0; dup < numDupElements; dup++)
+        {
+            Constant* laneVal = ConstantInt::get(indexType, dup);
+            Value* pGEP = GetElementPtrInst::Create(operandBase, laneVal, "GEP_lane", SI);
+            Value* pIndex = BinaryOperator::CreateMul(operand1->getOperand(1), elementNumVal, "GEPIndex_s", SI);
+            pGEP = GetElementPtrInst::Create(pGEP, pIndex, "GEP_s", SI);
+            new StoreInst(operand0[dup], pGEP, SI);
+        }
+#endif
+        // Remove original instruction
+        m_removedInsts.insert(SI);
+        return;
+    }
+    return recoverNonScalarizableInst(SI);
+}
+
 void ScalarizeFunction::obtainScalarizedValues(SmallVectorImpl<Value*>& retValues, bool* retIsConstant,
     Value* origValue, Instruction* origInst, int destIdx)
 {
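
To make the new load path concrete: for a load of a <4 x float> value the "#if 1" branch above emits one pointer cast to the element type ("ptrVec2ptrScl") plus one "GEP_lane" address and one scalar load per lane; the store path mirrors this with per-lane stores. A rough, hypothetical C++ analogue of the addressing it produces (all names illustrative, not from the source):

    #include <cstdio>

    struct float4 { float lane[4]; };            // stand-in for <4 x float>

    int main()
    {
        float4 v = { { 1.0f, 2.0f, 3.0f, 4.0f } };
        float4* vecPtr = &v;

        // Before: a single vector load of *vecPtr.
        // After:  cast the vector pointer to the element type ("ptrVec2ptrScl"),
        //         then address lanes 0..3 ("GEP_lane") and load each scalar.
        float* base = reinterpret_cast<float*>(vecPtr);
        for (int dup = 0; dup < 4; ++dup)
            std::printf("lane %d = %f\n", dup, base[dup]);
        return 0;
    }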
@@ -1388,9 +1411,17 @@ void ScalarizeFunction::resolveDeferredInstructions()
     m_DRL.clear();
 }
 
-extern "C" FunctionPass* createScalarizerPass(bool selectiveScalarization)
+bool ScalarizeFunction::isScalarizableLoadStoreType(VectorType* type)
+{
+    // Scalarize Load/Store worth doing only if:
+    // 1. Gather/Scatter are supported
+    // 2. Load/Store type is a vector
+    return (m_ScalarizingVectorLDSTType && (NULL != type));
+}
+
+extern "C" FunctionPass* createScalarizerPass(bool scalarizingVectorLDSTType)
 {
-    return new ScalarizeFunction(selectiveScalarization);
+    return new ScalarizeFunction(scalarizingVectorLDSTType);
 }
 