@@ -83,34 +83,7 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
83
83
if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty ())
84
84
return PreservedAnalyses::all ();
85
85
86
- if (!EnableGlobalOffset) {
87
- SmallVector<CallInst *, 4 > Worklist;
88
- SmallVector<LoadInst *, 4 > LI;
89
- SmallVector<Instruction *, 4 > PtrUses;
90
-
91
- // Collect all GEPs and Loads from the intrinsic's CallInsts
92
- for (Value *V : ImplicitOffsetIntrinsic->users ()) {
93
- Worklist.push_back (cast<CallInst>(V));
94
- for (Value *V2 : V->users ())
95
- getLoads (cast<Instruction>(V2), PtrUses, LI);
96
- }
97
-
98
- // Replace each use of a collected Load with a Constant 0
99
- for (LoadInst *L : LI)
100
- L->replaceAllUsesWith (ConstantInt::get (L->getType (), 0 ));
101
-
102
- // Remove all collected Loads and GEPs from the kernel.
103
- // PtrUses is returned by `getLoads` in topological order.
104
- // Walk it backwards so we don't violate users.
105
- for (auto *I : reverse (PtrUses))
106
- I->eraseFromParent ();
107
-
108
- // Remove all collected CallInsts from the kernel.
109
- for (CallInst *CI : Worklist) {
110
- auto *I = cast<Instruction>(CI);
111
- I->eraseFromParent ();
112
- }
113
- } else {
86
+ if (EnableGlobalOffset) {
114
87
// For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is
115
88
// happy with ADDRESS_SPACE_GENERIC (0).
116
89
TargetAS = AT == ArchType::Cuda ? 0 : 5 ;
@@ -133,6 +106,32 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
133
106
// Add implicit parameters to all direct and indirect users of the offset
134
107
addImplicitParameterToCallers (M, ImplicitOffsetIntrinsic, nullptr );
135
108
}
109
+ SmallVector<CallInst *, 4 > Worklist;
110
+ SmallVector<LoadInst *, 4 > Loads;
111
+ SmallVector<Instruction *, 4 > PtrUses;
112
+
113
+ // Collect all GEPs and Loads from the intrinsic's CallInsts
114
+ for (Value *V : ImplicitOffsetIntrinsic->users ()) {
115
+ Worklist.push_back (cast<CallInst>(V));
116
+ for (Value *V2 : V->users ())
117
+ getLoads (cast<Instruction>(V2), PtrUses, Loads);
118
+ }
119
+
120
+ // Replace each use of a collected Load with a Constant 0
121
+ for (LoadInst *L : Loads)
122
+ L->replaceAllUsesWith (ConstantInt::get (L->getType (), 0 ));
123
+
124
+ // Remove all collected Loads and GEPs from the kernel.
125
+ // PtrUses is returned by `getLoads` in topological order.
126
+ // Walk it backwards so we don't violate users.
127
+ for (auto *I : reverse (PtrUses))
128
+ I->eraseFromParent ();
129
+
130
+ // Remove all collected CallInsts from the kernel.
131
+ for (CallInst *CI : Worklist) {
132
+ auto *I = cast<Instruction>(CI);
133
+ I->eraseFromParent ();
134
+ }
136
135
137
136
// Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
138
137
// it.
@@ -161,7 +160,8 @@ void GlobalOffsetPass::processKernelEntryPoint(Function *Func) {
161
160
162
161
auto *NewFunc = addOffsetArgumentToFunction (
163
162
M, Func, KernelImplicitArgumentType->getPointerTo (),
164
- /* KeepOriginal=*/ true )
163
+ /* KeepOriginal=*/ true ,
164
+ /* IsKernel=*/ true )
165
165
.first ;
166
166
Argument *NewArgument = std::prev (NewFunc->arg_end ());
167
167
// Pass byval to the kernel for NVIDIA, AMD's calling convention disallows
@@ -177,62 +177,43 @@ void GlobalOffsetPass::processKernelEntryPoint(Function *Func) {
177
177
FuncMetadata->getOperand (1 ),
178
178
FuncMetadata->getOperand (2 )};
179
179
KernelMetadata->addOperand (MDNode::get (Ctx, NewMetadata));
180
-
181
- // Create alloca of zeros for the implicit offset in the original func.
182
- BasicBlock *EntryBlock = &Func->getEntryBlock ();
183
- IRBuilder<> Builder (EntryBlock, EntryBlock->getFirstInsertionPt ());
184
- Type *ImplicitOffsetType =
185
- ArrayType::get (Type::getInt32Ty (M.getContext ()), 3 );
186
- AllocaInst *ImplicitOffset =
187
- Builder.CreateAlloca (ImplicitOffsetType, TargetAS);
188
- uint64_t AllocByteSize =
189
- ImplicitOffset->getAllocationSizeInBits (M.getDataLayout ()).value () / 8 ;
190
- CallInst *MemsetCall =
191
- Builder.CreateMemSet (ImplicitOffset, Builder.getInt8 (0 ), AllocByteSize,
192
- ImplicitOffset->getAlign ());
193
- MemsetCall->addParamAttr (0 , Attribute::NonNull);
194
- MemsetCall->addDereferenceableParamAttr (0 , AllocByteSize);
195
- ProcessedFunctions[Func] = Builder.CreateConstInBoundsGEP2_32 (
196
- ImplicitOffsetType, ImplicitOffset, 0 , 0 );
197
180
}
198
181
199
182
void GlobalOffsetPass::addImplicitParameterToCallers (
200
183
Module &M, Value *Callee, Function *CalleeWithImplicitParam) {
201
-
202
- // Make sure that all entry point callers are processed.
203
184
SmallVector<User *, 8 > Users{Callee->users ()};
204
- for (User *U : Users) {
205
- auto *Call = dyn_cast<CallInst>(U);
206
- if (!Call)
207
- continue ;
208
185
209
- Function *Caller = Call->getFunction ();
210
- if (EntryPointMetadata.count (Caller) != 0 ) {
211
- processKernelEntryPoint (Caller);
212
- }
213
- }
214
-
215
- // User collection may have changed, so we reinitialize it.
216
- Users = SmallVector<User *, 8 >{Callee->users ()};
217
186
for (User *U : Users) {
218
187
auto *CallToOld = dyn_cast<CallInst>(U);
219
188
if (!CallToOld)
220
189
return ;
221
190
222
191
auto *Caller = CallToOld->getFunction ();
223
192
224
- // Determine if `Caller` needs processed or if this is another callsite
225
- // from an already-processed function.
226
- Function *NewFunc;
193
+ // Only original function uses are considered.
194
+ // Clones are processed through a global VMap.
195
+ if (Clones.contains (Caller))
196
+ continue ;
197
+
198
+ // Kernel entry points need additional processing and change metadata.
199
+ if (EntryPointMetadata.count (Caller) != 0 )
200
+ processKernelEntryPoint (Caller);
201
+
202
+ // Determine if `Caller` needs to be processed or if this is another
203
+ // callsite from a non-offset function or an already-processed function.
227
204
Value *ImplicitOffset = ProcessedFunctions[Caller];
228
205
bool AlreadyProcessed = ImplicitOffset != nullptr ;
206
+
207
+ Function *NewFunc;
229
208
if (AlreadyProcessed) {
230
209
NewFunc = Caller;
231
210
} else {
232
211
std::tie (NewFunc, ImplicitOffset) =
233
- addOffsetArgumentToFunction (M, Caller);
212
+ addOffsetArgumentToFunction (M, Caller,
213
+ /* KernelImplicitArgumentType=*/ nullptr ,
214
+ /* KeepOriginal=*/ true );
234
215
}
235
-
216
+ CallToOld = cast<CallInst>(GlobalVMap[CallToOld]);
236
217
if (!CalleeWithImplicitParam) {
237
218
// Replace intrinsic call with parameter.
238
219
CallToOld->replaceAllUsesWith (ImplicitOffset);
@@ -269,15 +250,12 @@ void GlobalOffsetPass::addImplicitParameterToCallers(
269
250
270
251
// Process callers of the old function.
271
252
addImplicitParameterToCallers (M, Caller, NewFunc);
272
-
273
- // Now that the old function is dead, delete it.
274
- Caller->dropAllReferences ();
275
- Caller->eraseFromParent ();
276
253
}
277
254
}
278
255
279
256
std::pair<Function *, Value *> GlobalOffsetPass::addOffsetArgumentToFunction (
280
- Module &M, Function *Func, Type *ImplicitArgumentType, bool KeepOriginal) {
257
+ Module &M, Function *Func, Type *ImplicitArgumentType, bool KeepOriginal,
258
+ bool IsKernel) {
281
259
FunctionType *FuncTy = Func->getFunctionType ();
282
260
const AttributeList &FuncAttrs = Func->getAttributes ();
283
261
ImplicitArgumentType =
@@ -316,23 +294,22 @@ std::pair<Function *, Value *> GlobalOffsetPass::addOffsetArgumentToFunction(
316
294
// TODO: Are there better naming alternatives that allow for unmangling?
317
295
NewFunc->setName (Func->getName () + " _with_offset" );
318
296
319
- ValueToValueMapTy VMap;
320
297
for (Function::arg_iterator FuncArg = Func->arg_begin (),
321
298
FuncEnd = Func->arg_end (),
322
299
NewFuncArg = NewFunc->arg_begin ();
323
300
FuncArg != FuncEnd; ++FuncArg, ++NewFuncArg) {
324
- VMap [FuncArg] = NewFuncArg;
301
+ GlobalVMap [FuncArg] = NewFuncArg;
325
302
}
326
303
327
304
SmallVector<ReturnInst *, 8 > Returns;
328
- CloneFunctionInto (NewFunc, Func, VMap ,
305
+ CloneFunctionInto (NewFunc, Func, GlobalVMap ,
329
306
CloneFunctionChangeType::GlobalChanges, Returns);
330
307
// In order to keep the signatures of functions called by the kernel
331
308
// unified, the pass has to copy global offset to an array allocated in
332
309
// addrspace(3). This is done as kernels can't allocate and fill the
333
- // array in constant address space, which would be required for the case
334
- // with no global offset .
335
- if (AT == ArchType::AMDHSA) {
310
+ // array in constant address space.
311
+ // No longer required, but kept because this path is deprecated rather than removed.
312
+ if (IsKernel && AT == ArchType::AMDHSA) {
336
313
BasicBlock *EntryBlock = &NewFunc->getEntryBlock ();
337
314
IRBuilder<> Builder (EntryBlock, EntryBlock->getFirstInsertionPt ());
338
315
Type *ImplicitOffsetType =
@@ -399,8 +376,8 @@ std::pair<Function *, Value *> GlobalOffsetPass::addOffsetArgumentToFunction(
399
376
Type::getInt32Ty (M.getContext ())->getPointerTo (TargetAS));
400
377
}
401
378
402
- ProcessedFunctions[NewFunc ] = ImplicitOffset;
403
-
379
+ ProcessedFunctions[Func ] = ImplicitOffset;
380
+ Clones. insert (NewFunc);
404
381
// Return the new function and the offset argument.
405
382
return {NewFunc, ImplicitOffset};
406
383
}
0 commit comments