Skip to content

Commit 300ed9f

Browse files
esukhovfda0
authored and committed
Add uniformity check for CloneAddressArithmetic
This commit introduces new functionality and control flags for CloneAddressArithmetic pass and adds a uniformity check for rematerialization (disabled by default). (cherry picked from commit 4707787)
1 parent f8b5e7c commit 300ed9f

File tree

3 files changed

+139
-92
lines changed

3 files changed

+139
-92
lines changed

IGC/Compiler/CISACodeGen/RematAddressArithmetic.cpp

Lines changed: 100 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class CloneAddressArithmetic : public FunctionPass {
5151

5252
public:
5353
static char ID;
54+
WIAnalysis *WI = nullptr;
5455

5556
~CloneAddressArithmetic() { Uses.clear(); }
5657
CloneAddressArithmetic() : FunctionPass(ID) {
@@ -62,22 +63,19 @@ class CloneAddressArithmetic : public FunctionPass {
6263
virtual void getAnalysisUsage(llvm::AnalysisUsage& AU) const override {
6364
AU.setPreservesCFG();
6465
AU.addRequired<IGCLivenessAnalysis>();
66+
AU.addRequired<WIAnalysis>();
6567
}
6668

6769
bool runOnFunction(Function&) override;
6870
void rematWholeChain(llvm::IntToPtrInst *I);
71+
bool isRegPressureLow(Function &F);
6972
std::unordered_map<llvm::Value*, unsigned int> Uses;
7073

7174
private:
7275
bool greedyRemat(Function &F);
7376
};
74-
75-
76-
77-
7877
} // end namespace
7978

80-
8179
// Factory for the CloneAddressArithmetic pass: allocates a new pass object
// with `new` and returns it as a FunctionPass pointer.
FunctionPass* IGC::createCloneAddressArithmeticPass() {
8280
return new CloneAddressArithmetic();
8381
}
@@ -91,17 +89,50 @@ char CloneAddressArithmetic::ID = 0;
9189
namespace IGC {
9290
IGC_INITIALIZE_PASS_BEGIN(CloneAddressArithmetic, PASS_FLAG_2, PASS_DESC_2, PASS_CFG_ONLY_2, PASS_ANALYSIS_2)
9391
IGC_INITIALIZE_PASS_DEPENDENCY(IGCLivenessAnalysis)
92+
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
9493
IGC_INITIALIZE_PASS_END(CloneAddressArithmetic, PASS_FLAG_2, PASS_DESC_2, PASS_CFG_ONLY_2, PASS_ANALYSIS_2)
9594
}
9695

96+
// Clones every addrspacecast instruction in F right before each of its
// non-PHI instruction users and redirects that particular use to the clone,
// so that every such user gets its own cast placed next to it.
//
// The original casts are left in place; once all of their uses have been
// redirected they become dead and are expected to be removed by a later
// cleanup pass.
void putAddrSpaceCastClose(Function &F) {

    // Collect the casts up front: cloning below inserts new instructions into
    // the very blocks we would otherwise still be iterating over.
    std::vector<llvm::Instruction *> V;

    for (BasicBlock &BB : F) {
        for (auto &I : BB) {
            if (llvm::isa<AddrSpaceCastInst>(I))
                V.push_back(&I);
        }
    }

    for (auto I : V) {

        // Snapshot the uses first: re-pointing a use to the clone changes
        // I's use list while we would be walking it.
        std::vector<llvm::Use *> VectorOfUses;
        for (auto &U : I->uses()) { VectorOfUses.push_back(&U); }

        for (auto U : VectorOfUses) {
            auto UserInst = llvm::dyn_cast<Instruction>(U->getUser());

            // dyn_cast returns nullptr for a user that is not an instruction,
            // and isa<> must not be invoked on a null pointer, so the null
            // check has to come before the PHI check.
            // PHI users are skipped (presumably because a clone cannot be
            // placed directly in front of a PHI node).
            if (!UserInst || llvm::isa<PHINode>(UserInst)) continue;

            auto Clone = I->clone();
            Clone->setName(I->getName() + "_clonedAddrSpaceCast");
            Clone->insertBefore(UserInst);
            *U = Clone;
        }
    }
}
97127

98128
static bool isAddressArithmetic(Instruction* I)
99129
{
100130
if (isa<GetElementPtrInst>(I) ||
101131
isa<InsertElementInst>(I) ||
102132
isa<InsertValueInst>(I) ||
133+
isa<BinaryOperator>(I) ||
103134
(isa<UnaryInstruction>(I) && !isa<LoadInst>(I)) ||
104-
isa<BinaryOperator>(I))
135+
(IGC_GET_FLAG_VALUE(RematAllowExtractElement) && isa<ExtractElementInst>(I)))
105136
return true;
106137

107138
return false;
@@ -130,6 +161,7 @@ void CloneAddressArithmetic::rematWholeChain(llvm::IntToPtrInst *I) {
130161
bool NotPHI = !llvm::isa<llvm::PHINode>(Op);
131162
bool NotConstant = !llvm::isa<llvm::Constant>(Op);
132163
bool SameBB = IGC_IS_FLAG_ENABLED(RematSameBBScope) ? Op->getParent() == I->getParent() : true;
164+
bool NotUniform = IGC_IS_FLAG_ENABLED(RematRespectUniformity) ? !WI->isUniform(Op) : true;
133165
bool AddressArithmetic = isAddressArithmetic(Op);
134166

135167
// if operand has more uses than specified, we do not rematerialize it.
@@ -145,7 +177,7 @@ void CloneAddressArithmetic::rematWholeChain(llvm::IntToPtrInst *I) {
145177
// load r2
146178
bool NotTooManyUses = Uses[Op] < NumOfUsesLimit;
147179

148-
if (SameBB && NotConstant && NotPHI && NotTooManyUses && AddressArithmetic) {
180+
if (SameBB && NotConstant && NotPHI && NotTooManyUses && AddressArithmetic && NotUniform) {
149181

150182
BFSQ.push(Op);
151183
RematVector.push_back(Op);
@@ -183,80 +215,92 @@ void CloneAddressArithmetic::rematWholeChain(llvm::IntToPtrInst *I) {
183215
RematVector.clear();
184216
}
185217

186-
bool CloneAddressArithmetic::greedyRemat(Function &F) {
218+
bool isSafelyRematerializable(Use& Use) {
187219

188-
bool Result = false;
220+
auto LI = llvm::isa<LoadInst>(Use.getUser());
221+
auto SI = llvm::isa<StoreInst>(Use.getUser());
222+
auto BI = llvm::isa<BitCastInst>(Use.getUser());
223+
auto SelI = llvm::isa<SelectInst>(Use.getUser());
224+
auto CI = IGC_IS_FLAG_ENABLED(RematAddrSpaceCastToUse) ? llvm::isa<AddrSpaceCastInst>(Use.getUser()) : false;
189225

190-
auto RPE = &getAnalysis<IGCLivenessAnalysis>();
191-
unsigned int SIMD = numLanes(RPE->bestGuessSIMDSize());
192-
unsigned int PressureLimit = IGC_GET_FLAG_VALUE(RematRPELimit);
193-
if(RPE->getMaxRegCountForFunction(F, SIMD) < PressureLimit)
194-
return Result;
226+
bool Result = LI || SI || BI || CI || SelI;
227+
return Result;
228+
}
195229

196-
for (BasicBlock &BB : F) {
197-
for (auto &I : BB) { Uses[&I] = I.getNumUses(); }
198-
}
230+
bool CloneAddressArithmetic::isRegPressureLow(Function &F) {
199231

200-
llvm::SmallVector<llvm::IntToPtrInst *, 4> ToProcess;
232+
auto RPE = &getAnalysis<IGCLivenessAnalysis>();
233+
unsigned int SIMD = numLanes(RPE->bestGuessSIMDSize());
234+
unsigned int PressureLimit = IGC_GET_FLAG_VALUE(RematRPELimit);
235+
bool Result = RPE->getMaxRegCountForFunction(F, SIMD) < PressureLimit;
236+
return Result;
237+
}
201238

202-
// go through block, collect all inttoptr instructions to do
203-
// remat on them
204-
for (BasicBlock &BB : F) {
205-
// if block has less than required amount of LLVM IR instructions, skip it
206-
const unsigned Limit = IGC_GET_FLAG_VALUE(RematBlockSize);
207-
if (BB.getInstList().size() < Limit) continue;
239+
bool CloneAddressArithmetic::greedyRemat(Function &F) {
208240

209-
for (auto &I : BB) {
241+
bool Result = false;
210242

211-
auto *CastedIntToPtrInst = llvm::dyn_cast<IntToPtrInst>(&I);
212-
if (CastedIntToPtrInst) ToProcess.push_back(CastedIntToPtrInst);
213-
}
214-
}
243+
if (isRegPressureLow(F))
244+
return Result;
215245

216-
for (auto el : ToProcess) {
246+
for (BasicBlock &BB : F) {
247+
for (auto &I : BB) { Uses[&I] = I.getNumUses(); }
248+
}
217249

218-
Value *V = el;
219-
llvm::SmallVector<llvm::Use*, 4> VectorOfUses;
220-
// collect all uses of particular intoptr inst
221-
bool usedOnlyInLoadOrStore = true;
222-
for (auto &use : V->uses()) {
250+
// At times, addrspace casts end up far away from their direct users.
251+
if(IGC_IS_FLAG_ENABLED(RematMoveAddrSpaceCast)) putAddrSpaceCastClose(F);
223252

224-
// check that this inttoptr instruction only used in load or stores
225-
auto LI = llvm::dyn_cast<LoadInst>(use.getUser());
226-
auto SI = llvm::dyn_cast<StoreInst>(use.getUser());
227-
usedOnlyInLoadOrStore &= (LI != NULL) || (SI != NULL);
253+
llvm::SmallVector<llvm::IntToPtrInst *, 4> ToProcess;
228254

229-
VectorOfUses.push_back(&use);
255+
// go through block, collect all inttoptr instructions to do
256+
// remat on them
257+
for (BasicBlock &BB : F) {
258+
for (auto &I : BB) {
259+
auto *CastedIntToPtrInst = llvm::dyn_cast<IntToPtrInst>(&I);
260+
if (CastedIntToPtrInst) ToProcess.push_back(CastedIntToPtrInst);
261+
}
230262
}
231263

232-
if(!usedOnlyInLoadOrStore) continue;
264+
for (auto el : ToProcess) {
265+
266+
Value *V = el;
267+
llvm::SmallVector<llvm::Use*, 4> VectorOfUses;
268+
// collect all uses of particular intoptr inst
269+
bool ShouldBeRemated = true;
270+
for (auto &U : V->uses()) {
271+
ShouldBeRemated &= isSafelyRematerializable(U);
272+
VectorOfUses.push_back(&U);
273+
}
233274

234-
for (auto use : VectorOfUses) {
275+
if(!ShouldBeRemated) continue;
235276

236-
// take use of inttoptr instruction, clone instruction,
237-
// insert clone right before the use, swap use to clone, remat
238-
auto User = use->getUser();
239-
auto UserInst = llvm::dyn_cast<Instruction>(User);
277+
for (auto use : VectorOfUses) {
240278

241-
if(UserInst) {
242-
auto Clone = el->clone();
243-
Clone->setName("cloned_" + el->getName());
244-
Clone->insertBefore(UserInst);
245-
*use = Clone;
246-
rematWholeChain((llvm::IntToPtrInst *)Clone);
247-
Result = true;
248-
}
279+
// Clone inttoptr instruction, insert clone right before the use,
280+
// switch use to clone, remat
281+
auto User = use->getUser();
282+
auto UserInst = llvm::dyn_cast<Instruction>(User);
283+
284+
if(UserInst) {
285+
auto Clone = el->clone();
286+
Clone->setName("cloned_" + el->getName());
287+
Clone->insertBefore(UserInst);
288+
*use = Clone;
289+
rematWholeChain((llvm::IntToPtrInst *)Clone);
290+
Result = true;
291+
}
292+
}
249293
}
250-
}
251294

252-
return Result;
295+
return Result;
253296
}
254297

255298
bool CloneAddressArithmetic::runOnFunction(Function& F)
256299
{
257300
if (skipFunction(F))
258301
return false;
259302

303+
WI = &getAnalysis<WIAnalysis>();
260304
bool Modified = false;
261305
Modified |= greedyRemat(F);
262306
return Modified;

IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp

Lines changed: 33 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -894,42 +894,40 @@ void AddLegalizationPasses(CodeGenContext& ctx, IGCPassManager& mpm, PSSignature
894894

895895
// Run address remat after GVN as it may hoist address calculations and
896896
// create PHI nodes with addresses.
897+
if (IGC_IS_FLAG_ENABLED(RematEnable) || (ctx.m_retryManager.AllowCloneAddressArithmetic() && ctx.type == ShaderType::OPENCL_SHADER)) {
898+
899+
if(IGC_GET_FLAG_VALUE(RematInstCombineBefore)) mpm.add(createIGCInstructionCombiningPass());
900+
// TODO: This is a workaround that helps to reduce amount of instructions for clone address arithmetic
901+
// it helps with chain of instructions like this
902+
// %remat12 = add i64 %baseArith, 100780848
903+
// %remat13 = add i64 %remat12, %basePtr
904+
// %remat14 = add i64 %remat13, %offsetI
905+
// %remat15 = add i64 %remat14, %offsetJ
906+
// load ...
907+
// ....
908+
// %remat21 = add i64 %baseArith, 201561696
909+
// %remat22 = add i64 %remat21, %basePtr
910+
// %remat23 = add i64 %remat22, %offsetI
911+
// %remat24 = add i64 %remat23, %offsetJ
912+
// load ...
913+
// we can compress this chain of instruction into one "add" for each "load"
914+
// this is achieved by combining reassoc + cse 3 times (each pair hoists one add)
915+
// it should be substituted for general pass when it's implemented
916+
//
917+
// Now it's accessible through flag, for testing purposes
918+
if (IGC_GET_FLAG_VALUE(RematReassocBefore)) {
919+
mpm.add(llvm::createReassociatePass());
920+
mpm.add(llvm::createEarlyCSEPass());
921+
mpm.add(llvm::createReassociatePass());
922+
mpm.add(llvm::createEarlyCSEPass());
923+
mpm.add(llvm::createReassociatePass());
924+
mpm.add(llvm::createEarlyCSEPass());
925+
}
897926

898-
if (IGC_IS_FLAG_ENABLED(EnableRemat) || (ctx.m_retryManager.AllowCloneAddressArithmetic() && ctx.type == ShaderType::OPENCL_SHADER)) {
899-
900-
901-
// TODO: This is a workaround that helps to reduce amount of instructions for clone address arithmetic
902-
// it helps with chain of instructions like this
903-
// %remat12 = add i64 %baseArith, 100780848
904-
// %remat13 = add i64 %remat12, %basePtr
905-
// %remat14 = add i64 %remat13, %offsetI
906-
// %remat15 = add i64 %remat14, %offsetJ
907-
// load ...
908-
// ....
909-
// %remat21 = add i64 %baseArith, 201561696
910-
// %remat22 = add i64 %remat21, %basePtr
911-
// %remat23 = add i64 %remat22, %offsetI
912-
// %remat24 = add i64 %remat23, %offsetJ
913-
// load ...
914-
// we can compress this chain of instruction into one "add" for each "load"
915-
// this is achieved by combining reassoc + cse 3 times (each pair hoists one add)
916-
// it should be substituted for general pass when it's implemented
917-
//
918-
// Now it's accessible through flag, for testing purposes
919-
920-
if (IGC_GET_FLAG_VALUE(RematReassocBefore)) {
921-
mpm.add(llvm::createReassociatePass());
922-
mpm.add(llvm::createEarlyCSEPass());
923-
mpm.add(llvm::createReassociatePass());
924-
mpm.add(llvm::createEarlyCSEPass());
925-
mpm.add(llvm::createReassociatePass());
926-
mpm.add(llvm::createEarlyCSEPass());
927-
}
928-
929-
mpm.add(createCloneAddressArithmeticPass());
930-
// cloneAddressArithmetic leaves old instructions unnecessary
931-
// dce pass helps to clean that up
932-
mpm.add(createDeadCodeEliminationPass());
927+
mpm.add(createCloneAddressArithmeticPass());
928+
// cloneAddressArithmetic leaves old instructions unnecessary
929+
// dce pass helps to clean that up
930+
mpm.add(createDeadCodeEliminationPass());
933931
}
934932

935933
mpm.add(createRematAddressArithmeticPass());

IGC/common/igc_flags.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -737,9 +737,14 @@ DECLARE_IGC_REGKEY(DWORD, RematBlockSize, 10, "Represents a threshold for a b
737737
DECLARE_IGC_REGKEY(DWORD, RematUsesThreshold, 5, "Amount of uses after which operand is not rematerialized", false)
738738
DECLARE_IGC_REGKEY(DWORD, RematChainLimit, 10, "If number of instructions we've collected is more than this value, we bail on it", false)
739739
DECLARE_IGC_REGKEY(DWORD, RematRPELimit, 100, "Cutoff value for register estimator, lower than that, kernel won't be rematted", false)
740+
DECLARE_IGC_REGKEY(bool, RematEnable, false, "Enable clone adress arithmetic pass not only on retry", false)
741+
DECLARE_IGC_REGKEY(bool, RematMoveAddrSpaceCast, false, "Clone addrspacecast instructions next to each of their non-PHI users before clone address arithmetic pass collects inttoptr instructions", false)
740742
DECLARE_IGC_REGKEY(bool, RematSameBBScope, true, "Confine rematerialization only to variables within the same BB, we won't pull down values from predeccors", false)
741-
DECLARE_IGC_REGKEY(bool, EnableRemat, false, "Enable clone adress arithmetic pass not only on retry", false)
743+
DECLARE_IGC_REGKEY(bool, RematRespectUniformity, false, "Cutoff computation chain on uniform values", false)
744+
DECLARE_IGC_REGKEY(bool, RematAllowExtractElement, false, "Allow Extract Element to computation chain", false)
742745
DECLARE_IGC_REGKEY(bool, RematReassocBefore, false, "Enable short sequence of passes before clone address arithmetic pass to potentially decrese amount of operations that will be rematerialized", false)
746+
DECLARE_IGC_REGKEY(bool, RematInstCombineBefore, false, "Run IGC instruction combining pass before clone address arithmetic pass to potentially decrease amount of operations that will be rematerialized", false)
747+
DECLARE_IGC_REGKEY(bool, RematAddrSpaceCastToUse, false, "Allow rematerialization of inttoptr that are used inside AddrSpaceCastInst", false)
743748
DECLARE_IGC_REGKEY(bool, DumpRegPressureEstimate, false, "Dump RegPressureEstimate to a file", false)
744749
DECLARE_IGC_REGKEY(debugString, DumpRegPressureEstimateFilter, 0, "Only dump RegPressureEstimate for functions matching the given regex", false)
745750
DECLARE_IGC_REGKEY(bool, EnableReusingXYZWStoreConstPayload, true, "Enable reusing XYZW stores const payload", false)

0 commit comments

Comments
 (0)