Skip to content

Commit 45f08b2

Browse files
ppogotovigcbot
authored and committed
Scalarization of address calculations for block memory operations.
This optimization scalarizes address calculations for block memory operations by broadcasting them.
1 parent bfd1d18 commit 45f08b2

File tree

8 files changed

+472
-0
lines changed

8 files changed

+472
-0
lines changed
Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
/*========================== begin_copyright_notice ============================
2+
3+
Copyright (C) 2023 Intel Corporation
4+
5+
SPDX-License-Identifier: MIT
6+
7+
============================= end_copyright_notice ===========================*/
8+
9+
#include "BlockMemOpAddrScalarizationPass.hpp"
10+
#include "CodeGenPublicEnums.h"
11+
#include "IGCIRBuilder.h"
12+
#include <llvm/IR/Function.h>
13+
14+
#include "Compiler/IGCPassSupport.h"
15+
#include "Compiler/CISACodeGen/helper.h"
16+
17+
#include "common/LLVMWarningsPush.hpp"
18+
#include "common/LLVMWarningsPop.hpp"
19+
20+
using namespace llvm;
21+
using namespace IGC;
22+
23+
char BlockMemOpAddrScalarizationPass::ID = 0;
24+
25+
#define PASS_FLAG "block-memop-addr-scalar"
26+
#define PASS_DESCRIPTION "Scalarization of address calculations for block memory operations."
27+
#define PASS_CFG_ONLY false
28+
#define PASS_ANALYSIS false
29+
IGC_INITIALIZE_PASS_BEGIN(BlockMemOpAddrScalarizationPass, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
30+
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
31+
IGC_INITIALIZE_PASS_END(BlockMemOpAddrScalarizationPass, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
32+
33+
// Default constructor: registers this pass instance with the LLVM pass
// registry so it can be created by name (see PASS_FLAG above).
BlockMemOpAddrScalarizationPass::BlockMemOpAddrScalarizationPass() : FunctionPass(ID) {
    initializeBlockMemOpAddrScalarizationPassPass(*PassRegistry::getPassRegistry());
}
36+
37+
// Pass entry point. Walks every instruction of F via InstVisitor (which
// dispatches call instructions to visitCallInst below) and reports whether
// any broadcast was inserted.
bool BlockMemOpAddrScalarizationPass::runOnFunction(Function &F) {
    // Reset per-function state before the traversal starts.
    Changed = false;
    WI = &getAnalysis<WIAnalysis>();

    visit(F);

    // Drop per-function bookkeeping so stale Instruction pointers can never
    // leak into the analysis of the next function.
    ExistingBroadcasts.clear();
    InstCanBeScalarized.clear();

    return Changed;
}
45+
46+
// InstVisitor hook: only GenISA simd block reads/writes are of interest;
// everything else is ignored.
void BlockMemOpAddrScalarizationPass::visitCallInst(CallInst& C) {
    GenIntrinsicInst *Intr = dyn_cast<GenIntrinsicInst>(&C);
    if (!Intr)
        return;

    GenISAIntrinsic::ID Id = Intr->getIntrinsicID();
    if (Id != GenISAIntrinsic::GenISA_simdBlockRead &&
        Id != GenISAIntrinsic::GenISA_simdBlockWrite)
        return;

    scalarizeAddrArithmForBlockRdWr(Intr);
}
53+
54+
// Checks whether InstForCheck (reached while walking the address computation
// of Root) may safely be scalarized. It must itself be a scalarizable kind of
// instruction (see checkInst), have at least one scalar instruction operand,
// and every transitive user of it must either be scalarizable as well or be a
// block read/write that consumes it directly as the address operand.
// Returns false as soon as any user would prevent scalarization.
bool BlockMemOpAddrScalarizationPass::canInstBeScalarized(Instruction *InstForCheck, Instruction *Root) {
    if (checkInst(InstForCheck) != InstType::CanBeScalar)
        return false;

    bool GotFinalInst = false;
    for (auto Op = InstForCheck->op_begin(), E = InstForCheck->op_end(); Op != E; Op++) {
        if (Instruction *IOp = dyn_cast<Instruction>(Op)) {
            GotFinalInst = true;
            // Don't process any vector instructions.
            if (IOp->getType()->isVectorTy())
                return false;
        }
    }

    // If InstForCheck does not have any instruction operands, it is a chain
    // leaf: report false so the caller broadcasts its result (used in Root)
    // instead of descending further.
    if (!GotFinalInst)
        return false;

    // A single use means InstForCheck is used only in the address calculation
    // chain (its one user is Root), so no user scan is needed.
    if (InstForCheck->getNumUses() == 1)
        return true;

    // Iterative DFS over the users of InstForCheck.
    // UseStack entries are (user, the value it uses, visited-flag);
    // Steps is the current DFS path, used for cycle detection below.
    SmallVector<std::tuple<Instruction*, Instruction*, bool>, 32> UseStack;
    SmallVector<Instruction*, 32> Steps;
    Steps.push_back(InstForCheck);
    for (auto U : InstForCheck->users()) {
        if (Instruction *I = dyn_cast<Instruction>(U)) {
            if (I != Root) {
                UseStack.push_back({I, InstForCheck, false});
            }
        }
    }

    while (UseStack.size()) {
        // Backtrack the DFS path when the top of the stack no longer hangs
        // off the instruction we descended through last.
        if (Steps.back() != std::get<1>(UseStack.back()))
            Steps.pop_back();

        Instruction *CurrUse = std::get<0>(UseStack.back());
        Instruction *CurrRoot = std::get<1>(UseStack.back());

        // If we have already analyzed this instruction.
        if (std::get<2>(UseStack.back())) {
            UseStack.pop_back();
            continue;
        }

        // Mark use as visited.
        std::get<2>(UseStack.back()) = true;

        InstType Res = checkInst(CurrUse);
        if (Res == InstType::BlcokMemOp) {
            // A block read/write consuming CurrRoot as its address operand is
            // an acceptable terminal user; anything else falls through and is
            // explored like a regular user.
            Instruction *Op0 = dyn_cast<Instruction>(CurrUse->getOperand(0));
            if (Op0 == CurrRoot) {
                UseStack.pop_back();
                continue;
            }
        } else if (Res == InstType::PreventScalar) {
            return false;
        }

        if (CurrUse->getNumUses()) {
            Steps.push_back(CurrUse);
            for (auto U : CurrUse->users()) {
                if (Instruction *I = dyn_cast<Instruction>(U)) {
                    // This check helps to avoid hanging (infinite traversal of
                    // a use cycle through a phi) in the following example:
                    // entry:
                    //   ...
                    //   br label bb1
                    // bb1:
                    //   %phires = phi i32 [ %0, %entry ], [ %sum, %bb2 ]
                    //   %cmp = icmp ult i32 %phires, 20
                    //   br i1 %cmp, label %bb2, label %bb3
                    // bb2:
                    //   %sum = add i32 %phires, 1
                    // bb3:
                    //   ...
                    if (std::find(Steps.begin(), Steps.end(), I) != Steps.end())
                        continue;

                    UseStack.push_back({I, CurrUse, false});
                }
            }
        } else {
            UseStack.pop_back();
        }
    }

    return true;
}
144+
145+
// Classifies I for the address-chain walk:
//  - BlcokMemOp      : I is a GenISA simd block read/write (a valid terminal user);
//  - CanBeScalar     : I is a safe link in an address chain (arithmetic, cast,
//                      GEP, phi, or an intrinsic that lowers to nothing);
//  - PreventScalar   : any other instruction — scalarization must stop.
InstType BlockMemOpAddrScalarizationPass::checkInst(Instruction *I) {
    // Block reads/writes are reported separately so callers can verify the
    // address operand relationship.
    if (GenIntrinsicInst *GenInst = dyn_cast<GenIntrinsicInst>(I)) {
        GenISAIntrinsic::ID Id = GenInst->getIntrinsicID();
        if (Id == GenISAIntrinsic::GenISA_simdBlockRead || Id == GenISAIntrinsic::GenISA_simdBlockWrite)
            return InstType::BlcokMemOp;
    }

    // Arithmetic, casts, GEPs and phis are all acceptable pieces of an
    // address computation.
    if (I->isBinaryOp() || I->isCast() || isa<GetElementPtrInst>(I) || isa<PHINode>(I))
        return InstType::CanBeScalar;

    // Intrinsics that don't actually represent code after lowering are
    // harmless users as well.
    if (IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I)) {
        switch (Intr->getIntrinsicID()) {
        case Intrinsic::assume:
        case Intrinsic::dbg_declare:
        case Intrinsic::dbg_value:
        case Intrinsic::dbg_label:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
            return InstType::CanBeScalar;
        default:
            break;
        }
    }

    return InstType::PreventScalar;
}
192+
193+
// Builds (or reuses) a lane-0 broadcast of InstForBroadcast using the
// GenISA_WaveShuffleIndex intrinsic and returns the broadcast value in the
// original type. Results are cached in ExistingBroadcasts so each
// instruction is broadcast at most once per function.
Value *BlockMemOpAddrScalarizationPass::insertBroadcast(Instruction *InstForBroadcast) {
    Value *ShuffleRes = nullptr;
    Instruction *PlaceForInsert = nullptr;

    // A phi's value only exists after the whole phi group, so insert at the
    // first non-phi instruction; otherwise insert right after the producer.
    if (isa<PHINode>(InstForBroadcast))
        PlaceForInsert = InstForBroadcast->getParent()->getFirstNonPHI();
    else
        PlaceForInsert = InstForBroadcast->getNextNonDebugInstruction();

    IRBuilder<> Builder(PlaceForInsert);

    if (ExistingBroadcasts.count(InstForBroadcast)) {
        // If broadcast was created before, reuse the cached value.
        ShuffleRes = ExistingBroadcasts[InstForBroadcast];
    } else {
        Type *CurType = InstForBroadcast->getType();
        Value *ValForShuffle = nullptr;

        // WaveShuffleIndex can't operate on i1 or pointers directly, so
        // widen booleans to i8 and convert pointers through integers.
        // NOTE(review): pointers go through i64 — this assumes 64-bit
        // pointers; confirm behavior for 32-bit address spaces.
        if (CurType->getScalarSizeInBits() == 1)
            ValForShuffle = Builder.CreateZExtOrTrunc(InstForBroadcast, Builder.getInt8Ty());
        else if (CurType->isPointerTy())
            ValForShuffle = Builder.CreatePtrToInt(InstForBroadcast, Builder.getInt64Ty());
        else
            ValForShuffle = cast<Value>(InstForBroadcast);

        // Lane index 0, helper-lane mode 0: take the value from lane 0.
        Value *Args[3] = {ValForShuffle, Builder.getInt32(0), Builder.getInt32(0)};
        Type *Types[3] = {ValForShuffle->getType(), Builder.getInt32Ty(), Builder.getInt32Ty()};
        Function *BroadcastFunc = GenISAIntrinsic::getDeclaration(InstForBroadcast->getModule(),
                                                                  GenISAIntrinsic::GenISA_WaveShuffleIndex,
                                                                  Types);
        Value *BroadcastCall = Builder.CreateCall(BroadcastFunc, Args);

        // Convert the shuffled value back to the original type.
        if (CurType->getScalarSizeInBits() == 1)
            ShuffleRes = Builder.CreateZExtOrTrunc(BroadcastCall, CurType);
        else if (CurType->isPointerTy())
            ShuffleRes = Builder.CreateIntToPtr(BroadcastCall, CurType);
        else
            ShuffleRes = BroadcastCall;

        ExistingBroadcasts.insert({InstForBroadcast, ShuffleRes});
    }

    return ShuffleRes;
}
237+
238+
// Walks the address computation chain of a block read/write bottom-up and
// broadcasts (via lane-0 WaveShuffleIndex) every non-uniform value that
// feeds the chain but cannot itself be scalarized, rewiring the chain to
// consume the broadcast result. Returns true if any broadcast was inserted.
//
// Fixes vs. the original: the per-item local in the broadcast loop no longer
// shadows the `InstForBrd` map; the worklist tuple is constructed only after
// the already-scalarized early-out; the map is probed once instead of twice.
bool BlockMemOpAddrScalarizationPass::scalarizeAddrArithmForBlockRdWr(GenIntrinsicInst *BlockInstr)
{
    bool Scalarized = false;
    Instruction *AddrInstr = dyn_cast<Instruction>(BlockInstr->getOperand(0));
    if (!AddrInstr)
        return Scalarized;

    // This map will contain instructions (keys) that will be broadcast, and
    // instructions (values) where the result of the broadcast will be used.
    DenseMap<Instruction*, SmallVector<Instruction*, 4>> InstForBrd;

    SmallVector<Instruction*, 2> V = {AddrInstr};
    // Worklist of (root instruction checked on a previous step, its operand
    // instructions to be checked on the current step).
    SmallVector<std::tuple<Instruction*, SmallVector<Instruction*, 2>>, 2> InstrVector = {{BlockInstr, V}};
    while (InstrVector.size()) {
        // Worklist for the next iteration.
        SmallVector<std::tuple<Instruction*, SmallVector<Instruction*, 2>>, 2> NewInstrVector;
        for (const auto &T : InstrVector) {
            Instruction *Root = std::get<0>(T);
            for (Instruction *I : std::get<1>(T)) {
                // Already proven scalarizable on an earlier step — skip.
                if (InstCanBeScalarized.count(I))
                    continue;

                // Check I instruction and its users.
                if (canInstBeScalarized(I, Root)) {
                    InstCanBeScalarized.insert(I);

                    // Queue I's non-uniform instruction operands for the
                    // next step; uniform operands need no broadcast.
                    std::tuple<Instruction*, SmallVector<Instruction*, 2>> NewTuple = {I, SmallVector<Instruction*, 2>()};
                    for (auto Op = I->op_begin(), E = I->op_end(); Op != E; Op++) {
                        if (Instruction *InOp = dyn_cast<Instruction>(*Op)) {
                            if (WI->isUniform(InOp))
                                continue;

                            std::get<1>(NewTuple).push_back(InOp);
                        }
                    }

                    NewInstrVector.push_back(NewTuple);
                } else {
                    // Terminate the algorithm if the address is NOT used in
                    // any instructions other than BlockWrite/BlockRead.
                    if (I == BlockInstr->getOperand(0))
                        return Scalarized;

                    // Record that Root consumes the broadcast of I; avoid
                    // duplicate Root entries (single map probe).
                    auto MapIt = InstForBrd.find(I);
                    if (MapIt != InstForBrd.end()) {
                        if (std::find(MapIt->second.begin(), MapIt->second.end(), Root) == MapIt->second.end())
                            MapIt->second.push_back(Root);
                    } else {
                        InstForBrd.insert({I, {Root}});
                    }
                }
            }
        }
        // Update instructions list for next check.
        InstrVector = NewInstrVector;
    }

    // Insert broadcast instructions.
    for (const auto &Item : InstForBrd) {
        // Renamed from the original's `InstForBrd`, which shadowed the map.
        Instruction *BrdInst = Item.first;

        // A value that is already a lane broadcast needs no second shuffle.
        if (GenIntrinsicInst *GenInst = dyn_cast<GenIntrinsicInst>(BrdInst)) {
            GenISAIntrinsic::ID Id = GenInst->getIntrinsicID();
            if (Id == GenISAIntrinsic::GenISA_WaveShuffleIndex)
                continue;
        }

        Value *BroadcastInstr = insertBroadcast(BrdInst);
        if (!BroadcastInstr)
            continue;
        Scalarized = true;

        // Rewire each recorded user to consume the broadcast result. Only
        // the first matching operand is replaced (as in the original).
        for (auto Root : Item.second) {
            size_t ArgNum = 0;
            for (auto Op = Root->op_begin(), E = Root->op_end(); Op != E; Op++) {
                if (dyn_cast<Instruction>(Op) == BrdInst) {
                    Root->setOperand(ArgNum, BroadcastInstr);
                    break;
                }
                ArgNum++;
            }
        }
    }

    return Scalarized;
}

0 commit comments

Comments
 (0)