
Commit 03be036

Author: MartinWehking
[SYCL] Extend global offset intrinsic removal (#11909)
Extend #11674 by modifying the globaloffset optimization pass to always replace uses of loads from the `llvm.nvvm.implicit.offset` and `llvm.amdgcn.implicit.offset` intrinsics with constant zeros in the original, non-offset kernel, i.e. perform this optimization even when `-enable-global-offset=true` (the default). Recursively duplicate functions containing calls to the implicit offset intrinsic, and let the original non-offset kernel entry point call only the original functions (i.e. not the clones with the added offset argument). Remove the zero allocations from the original kernel entry points.
1 parent 6cd3ef2 commit 03be036

12 files changed: +391 −195 lines
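The sketch below is not part of the commit; it is a minimal illustration of the zero-replacement step the message describes, written against the upstream LLVM C++ API. The helper name `replaceOffsetUsesWithZero` and the flat GEP/load walk are assumptions for illustration only; the actual pass traverses pointer users via `getLoads` inside `GlobalOffsetPass::run`, as shown in the GlobalOffset.cpp diff further down.

// Illustrative sketch only (not the pass implementation): replace every value
// loaded from the implicit offset intrinsic with the constant 0 and erase the
// now-dead instructions, mirroring the behaviour described in the commit message.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static void replaceOffsetUsesWithZero(Function *ImplicitOffsetIntrinsic) {
  SmallVector<CallInst *, 4> Calls;
  SmallVector<Instruction *, 4> PtrUses; // GEPs feeding the loads
  SmallVector<LoadInst *, 4> Loads;

  // Collect calls to the intrinsic and the loads reached through them.
  for (User *U : ImplicitOffsetIntrinsic->users()) {
    auto *CI = cast<CallInst>(U);
    Calls.push_back(CI);
    for (User *CU : CI->users()) {
      if (auto *GEP = dyn_cast<GetElementPtrInst>(CU)) {
        PtrUses.push_back(GEP);
        for (User *GU : GEP->users())
          if (auto *LI = dyn_cast<LoadInst>(GU))
            Loads.push_back(LI);
      } else if (auto *LI = dyn_cast<LoadInst>(CU)) {
        Loads.push_back(LI);
      }
    }
  }

  // Every component of the global offset becomes a constant zero.
  for (LoadInst *LI : Loads) {
    LI->replaceAllUsesWith(ConstantInt::get(LI->getType(), 0));
    LI->eraseFromParent();
  }
  // Erase users before their defining instructions.
  for (Instruction *I : reverse(PtrUses))
    I->eraseFromParent();
  for (CallInst *CI : Calls)
    CI->eraseFromParent();
}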

llvm/include/llvm/SYCLLowerIR/GlobalOffset.h

Lines changed: 28 additions & 24 deletions
@@ -12,6 +12,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/SYCLLowerIR/TargetHelpers.h"
+#include "llvm/Transforms/Utils/Cloning.h"

 namespace llvm {

@@ -38,41 +39,38 @@ class GlobalOffsetPass : public PassInfoMixin<GlobalOffsetPass> {
   /// `Func` belongs, contains both the original function and its clone with the
   /// signature extended with the implicit offset parameter and `_with_offset`
   /// appended to the name.
-  /// An alloca of 3 zeros (corresponding to offsets in x, y and z) is added to
-  /// the original kernel, in order to keep the interface of kernel's call
-  /// graph unified, regardless of the fact if the global offset has been used.
   ///
   /// \param Func Kernel to be processed.
   void processKernelEntryPoint(Function *Func);

-  /// This function adds an implicit parameter to the function containing a
-  /// call instruction to the implicit offset intrinsic or another function
-  /// (which eventually calls the instrinsic). If the call instruction is to
-  /// the implicit offset intrinsic, then the intrinisic is replaced with the
-  /// parameter that was added.
+  /// For a function containing a call instruction to the implicit offset
+  /// intrinsic, or another function which eventually calls the intrinsic,
+  /// this function clones the function and adds an implicit parameter to the
+  /// clone.
+  /// If the call instruction is to the implicit offset intrinsic then the
+  /// intrinsic inside the cloned function is replaced with the parameter that
+  /// was added.
   ///
-  /// Once the function, say `F`, containing a call to `Callee` has the
-  /// implicit parameter added, callers of `F` are processed by recursively
-  /// calling this function, passing `F` to `CalleeWithImplicitParam`.
-  ///
-  /// Since the cloning of entry points may alter the users of a function, the
-  /// cloning must be done as early as possible, as to ensure that no users are
-  /// added to previous callees in the call-tree.
+  /// Once the clone of a function, say `F`, containing a call to `Callee`
+  /// has the implicit parameter added, callers of `F` are processed by
+  /// getting cloned and their clones are processed by recursively calling the
+  /// clone of 'F', passing `F` to `CalleeWithImplicitParam`.
   ///
   /// \param Callee is the function (to which this transformation has already
   /// been applied), or to the implicit offset intrinsic.
   ///
   /// \param CalleeWithImplicitParam indicates whether Callee is to the
   /// implicit intrinsic (when `nullptr`) or to another function (not
-  /// `nullptr`) - this is used to know whether calls to it needs to have the
-  /// implicit parameter added to it or replaced with the implicit parameter.
+  /// `nullptr`) - this is used to know whether calls to it inside clones need
+  /// to have the implicit parameter added to it or be replaced with the
+  /// implicit parameter.
   void addImplicitParameterToCallers(Module &M, Value *Callee,
                                      Function *CalleeWithImplicitParam);

-  /// For a given function `Func` extend signature to contain an implicit
-  /// offset argument.
+  /// For a given function `Func` create a clone and extend its signature to
+  /// contain an implicit offset argument.
   ///
-  /// \param Func A function to add offset to.
+  /// \param Func A function to be cloned and add offset to.
   ///
   /// \param ImplicitArgumentType Architecture dependant type of the implicit
   /// argument holding the global offset.
@@ -81,13 +79,15 @@ class GlobalOffsetPass : public PassInfoMixin<GlobalOffsetPass> {
   /// keep it intact and create a clone of it with `_wit_offset` appended to
   /// the name.
   ///
-  /// \returns A pair of new function with the offset argument added and a
+  /// \param IsKernel Indicates whether Func is a kernel entry point.
+  ///
+  /// \returns A pair of the new function with the offset argument added, a
   /// pointer to the implicit argument (either a func argument or a bitcast
   /// turning it to the correct type).
   std::pair<Function *, Value *>
   addOffsetArgumentToFunction(Module &M, Function *Func,
                               Type *ImplicitArgumentType = nullptr,
-                              bool KeepOriginal = false);
+                              bool KeepOriginal = false, bool IsKernel = false);

   /// Create a mapping of kernel entry points to their metadata nodes. While
   /// iterating over kernels make sure that a given kernel entry point has no
@@ -102,8 +102,12 @@ class GlobalOffsetPass : public PassInfoMixin<GlobalOffsetPass> {
                           SmallVectorImpl<KernelPayload> &KernelPayloads);

 private:
-  /// Keep track of which functions have been processed to avoid processing
-  /// twice.
+  /// Keep track of all cloned offset functions to avoid processing them.
+  llvm::SmallPtrSet<Function *, 8> Clones;
+  /// Save clone mappings to obtain pointers to CallInsts during processing.
+  llvm::ValueToValueMapTy GlobalVMap;
+  /// Keep track of which non-offset functions have been processed to avoid
+  /// processing twice.
   llvm::DenseMap<Function *, Value *> ProcessedFunctions;
   /// Keep a map of all entry point functions with metadata.
   llvm::DenseMap<Function *, MDNode *> EntryPointMetadata;
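As a rough sketch of what the `addOffsetArgumentToFunction` contract documented above implies (not the actual implementation), the snippet below builds a `_with_offset` declaration with one extra trailing parameter. The helper name `createWithOffsetDeclaration` is hypothetical, and attribute/metadata copying as well as body cloning are deliberately omitted.

// Illustrative sketch only: create a declaration whose type is the original
// signature extended by an implicit offset argument, named "<name>_with_offset".
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static Function *createWithOffsetDeclaration(Module &M, Function *Func,
                                             Type *ImplicitArgumentType) {
  FunctionType *FuncTy = Func->getFunctionType();

  // Copy the existing parameter types and append the implicit offset argument.
  SmallVector<Type *, 8> Params(FuncTy->params().begin(),
                                FuncTy->params().end());
  Params.push_back(ImplicitArgumentType);

  FunctionType *NewFuncTy =
      FunctionType::get(FuncTy->getReturnType(), Params, FuncTy->isVarArg());

  // The clone keeps the original linkage and gets the "_with_offset" suffix;
  // the original function is left untouched (the KeepOriginal behaviour).
  return Function::Create(NewFuncTy, Func->getLinkage(),
                          Func->getName() + "_with_offset", &M);
}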

llvm/lib/SYCLLowerIR/GlobalOffset.cpp

Lines changed: 55 additions & 78 deletions
@@ -83,34 +83,7 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
   if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty())
     return PreservedAnalyses::all();

-  if (!EnableGlobalOffset) {
-    SmallVector<CallInst *, 4> Worklist;
-    SmallVector<LoadInst *, 4> LI;
-    SmallVector<Instruction *, 4> PtrUses;
-
-    // Collect all GEPs and Loads from the intrinsic's CallInsts
-    for (Value *V : ImplicitOffsetIntrinsic->users()) {
-      Worklist.push_back(cast<CallInst>(V));
-      for (Value *V2 : V->users())
-        getLoads(cast<Instruction>(V2), PtrUses, LI);
-    }
-
-    // Replace each use of a collected Load with a Constant 0
-    for (LoadInst *L : LI)
-      L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));
-
-    // Remove all collected Loads and GEPs from the kernel.
-    // PtrUses is returned by `getLoads` in topological order.
-    // Walk it backwards so we don't violate users.
-    for (auto *I : reverse(PtrUses))
-      I->eraseFromParent();
-
-    // Remove all collected CallInsts from the kernel.
-    for (CallInst *CI : Worklist) {
-      auto *I = cast<Instruction>(CI);
-      I->eraseFromParent();
-    }
-  } else {
+  if (EnableGlobalOffset) {
     // For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is
     // happy with ADDRESS_SPACE_GENERIC (0).
     TargetAS = AT == ArchType::Cuda ? 0 : 5;
@@ -133,6 +106,32 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
     // Add implicit parameters to all direct and indirect users of the offset
     addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
   }
+  SmallVector<CallInst *, 4> Worklist;
+  SmallVector<LoadInst *, 4> Loads;
+  SmallVector<Instruction *, 4> PtrUses;
+
+  // Collect all GEPs and Loads from the intrinsic's CallInsts
+  for (Value *V : ImplicitOffsetIntrinsic->users()) {
+    Worklist.push_back(cast<CallInst>(V));
+    for (Value *V2 : V->users())
+      getLoads(cast<Instruction>(V2), PtrUses, Loads);
+  }
+
+  // Replace each use of a collected Load with a Constant 0
+  for (LoadInst *L : Loads)
+    L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));
+
+  // Remove all collected Loads and GEPs from the kernel.
+  // PtrUses is returned by `getLoads` in topological order.
+  // Walk it backwards so we don't violate users.
+  for (auto *I : reverse(PtrUses))
+    I->eraseFromParent();
+
+  // Remove all collected CallInsts from the kernel.
+  for (CallInst *CI : Worklist) {
+    auto *I = cast<Instruction>(CI);
+    I->eraseFromParent();
+  }

   // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
   // it.
@@ -161,7 +160,8 @@ void GlobalOffsetPass::processKernelEntryPoint(Function *Func) {

   auto *NewFunc = addOffsetArgumentToFunction(
                       M, Func, KernelImplicitArgumentType->getPointerTo(),
-                      /*KeepOriginal=*/true)
+                      /*KeepOriginal=*/true,
+                      /*IsKernel=*/true)
                       .first;
   Argument *NewArgument = std::prev(NewFunc->arg_end());
   // Pass byval to the kernel for NVIDIA, AMD's calling convention disallows
@@ -177,62 +177,43 @@ void GlobalOffsetPass::processKernelEntryPoint(Function *Func) {
                           FuncMetadata->getOperand(1),
                           FuncMetadata->getOperand(2)};
   KernelMetadata->addOperand(MDNode::get(Ctx, NewMetadata));
-
-  // Create alloca of zeros for the implicit offset in the original func.
-  BasicBlock *EntryBlock = &Func->getEntryBlock();
-  IRBuilder<> Builder(EntryBlock, EntryBlock->getFirstInsertionPt());
-  Type *ImplicitOffsetType =
-      ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
-  AllocaInst *ImplicitOffset =
-      Builder.CreateAlloca(ImplicitOffsetType, TargetAS);
-  uint64_t AllocByteSize =
-      ImplicitOffset->getAllocationSizeInBits(M.getDataLayout()).value() / 8;
-  CallInst *MemsetCall =
-      Builder.CreateMemSet(ImplicitOffset, Builder.getInt8(0), AllocByteSize,
-                           ImplicitOffset->getAlign());
-  MemsetCall->addParamAttr(0, Attribute::NonNull);
-  MemsetCall->addDereferenceableParamAttr(0, AllocByteSize);
-  ProcessedFunctions[Func] = Builder.CreateConstInBoundsGEP2_32(
-      ImplicitOffsetType, ImplicitOffset, 0, 0);
 }

 void GlobalOffsetPass::addImplicitParameterToCallers(
     Module &M, Value *Callee, Function *CalleeWithImplicitParam) {
-
-  // Make sure that all entry point callers are processed.
   SmallVector<User *, 8> Users{Callee->users()};
-  for (User *U : Users) {
-    auto *Call = dyn_cast<CallInst>(U);
-    if (!Call)
-      continue;

-    Function *Caller = Call->getFunction();
-    if (EntryPointMetadata.count(Caller) != 0) {
-      processKernelEntryPoint(Caller);
-    }
-  }
-
-  // User collection may have changed, so we reinitialize it.
-  Users = SmallVector<User *, 8>{Callee->users()};
   for (User *U : Users) {
     auto *CallToOld = dyn_cast<CallInst>(U);
     if (!CallToOld)
       return;

     auto *Caller = CallToOld->getFunction();

-    // Determine if `Caller` needs processed or if this is another callsite
-    // from an already-processed function.
-    Function *NewFunc;
+    // Only original function uses are considered.
+    // Clones are processed through a global VMap.
+    if (Clones.contains(Caller))
+      continue;
+
+    // Kernel entry points need additional processing and change Metdadata.
+    if (EntryPointMetadata.count(Caller) != 0)
+      processKernelEntryPoint(Caller);
+
+    // Determine if `Caller` needs to be processed or if this is another
+    // callsite from a non-offset function or an already-processed function.
    Value *ImplicitOffset = ProcessedFunctions[Caller];
    bool AlreadyProcessed = ImplicitOffset != nullptr;
+
+    Function *NewFunc;
     if (AlreadyProcessed) {
       NewFunc = Caller;
     } else {
       std::tie(NewFunc, ImplicitOffset) =
-          addOffsetArgumentToFunction(M, Caller);
+          addOffsetArgumentToFunction(M, Caller,
+                                      /*KernelImplicitArgumentType*/ nullptr,
+                                      /*KeepOriginal=*/true);
     }
-
+    CallToOld = cast<CallInst>(GlobalVMap[CallToOld]);
     if (!CalleeWithImplicitParam) {
       // Replace intrinsic call with parameter.
       CallToOld->replaceAllUsesWith(ImplicitOffset);
@@ -269,15 +250,12 @@ void GlobalOffsetPass::addImplicitParameterToCallers(

     // Process callers of the old function.
     addImplicitParameterToCallers(M, Caller, NewFunc);
-
-    // Now that the old function is dead, delete it.
-    Caller->dropAllReferences();
-    Caller->eraseFromParent();
   }
 }

 std::pair<Function *, Value *> GlobalOffsetPass::addOffsetArgumentToFunction(
-    Module &M, Function *Func, Type *ImplicitArgumentType, bool KeepOriginal) {
+    Module &M, Function *Func, Type *ImplicitArgumentType, bool KeepOriginal,
+    bool IsKernel) {
   FunctionType *FuncTy = Func->getFunctionType();
   const AttributeList &FuncAttrs = Func->getAttributes();
   ImplicitArgumentType =
@@ -316,23 +294,22 @@ std::pair<Function *, Value *> GlobalOffsetPass::addOffsetArgumentToFunction(
   // TODO: Are there better naming alternatives that allow for unmangling?
   NewFunc->setName(Func->getName() + "_with_offset");

-  ValueToValueMapTy VMap;
   for (Function::arg_iterator FuncArg = Func->arg_begin(),
                               FuncEnd = Func->arg_end(),
                               NewFuncArg = NewFunc->arg_begin();
        FuncArg != FuncEnd; ++FuncArg, ++NewFuncArg) {
-    VMap[FuncArg] = NewFuncArg;
+    GlobalVMap[FuncArg] = NewFuncArg;
   }

   SmallVector<ReturnInst *, 8> Returns;
-  CloneFunctionInto(NewFunc, Func, VMap,
+  CloneFunctionInto(NewFunc, Func, GlobalVMap,
                     CloneFunctionChangeType::GlobalChanges, Returns);
   // In order to keep the signatures of functions called by the kernel
   // unified, the pass has to copy global offset to an array allocated in
   // addrspace(3). This is done as kernels can't allocate and fill the
-  // array in constant address space, which would be required for the case
-  // with no global offset.
-  if (AT == ArchType::AMDHSA) {
+  // array in constant address space.
+  // Not required any longer, but left due to deprecatedness.
+  if (IsKernel && AT == ArchType::AMDHSA) {
     BasicBlock *EntryBlock = &NewFunc->getEntryBlock();
     IRBuilder<> Builder(EntryBlock, EntryBlock->getFirstInsertionPt());
     Type *ImplicitOffsetType =
@@ -399,8 +376,8 @@ std::pair<Function *, Value *> GlobalOffsetPass::addOffsetArgumentToFunction(
         Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS));
   }

-  ProcessedFunctions[NewFunc] = ImplicitOffset;
-
+  ProcessedFunctions[Func] = ImplicitOffset;
+  Clones.insert(NewFunc);
   // Return the new function and the offset argument.
   return {NewFunc, ImplicitOffset};
 }
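The following minimal sketch shows the clone-and-lookup pattern the diff above relies on; the helper name is hypothetical and it assumes `NewFunc` was already created with a compatible signature. Because the `ValueToValueMapTy` persists, a `CallInst` found in the original function can later be translated into its counterpart inside the clone, which is what `GlobalVMap[CallToOld]` does in the pass.

// Illustrative sketch only: clone Func into NewFunc through a persistent VMap
// and use that map to find the clone of a specific call instruction.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Cloning.h"

using namespace llvm;

static CallInst *cloneAndMapCall(Function *NewFunc, Function *Func,
                                 ValueToValueMapTy &VMap,
                                 CallInst *CallInOriginal) {
  // Map the original arguments to the freshly created ones before cloning.
  auto NewArgIt = NewFunc->arg_begin();
  for (Argument &Arg : Func->args())
    VMap[&Arg] = &*NewArgIt++;

  SmallVector<ReturnInst *, 8> Returns;
  CloneFunctionInto(NewFunc, Func, VMap,
                    CloneFunctionChangeType::GlobalChanges, Returns);

  // The persistent map now also contains every cloned instruction, so the
  // call in the original body can be translated into the call in the clone.
  return cast<CallInst>(VMap[CallInOriginal]);
}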

llvm/test/CodeGen/AMDGPU/global-offset-dbg.ll

Lines changed: 12 additions & 10 deletions
@@ -11,28 +11,27 @@ declare ptr addrspace(5) @llvm.amdgcn.implicit.offset()
 ; CHECK-NOT: llvm.amdgcn.implicit.offset

 define weak_odr dso_local i64 @_ZTS14other_function() !dbg !11 {
-; CHECK: define weak_odr dso_local i64 @_ZTS14other_function(ptr addrspace(5) %0) !dbg !11 {
+; CHECK: define weak_odr dso_local i64 @_ZTS14other_function() !dbg !11 {
   %1 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
   %2 = getelementptr inbounds i32, ptr addrspace(5) %1, i64 2
   %3 = load i32, ptr addrspace(5) %2, align 4
   %4 = zext i32 %3 to i64
   ret i64 %4
 }

+; CHECK: weak_odr dso_local i64 @_ZTS14other_function_with_offset(ptr addrspace(5) %0) !dbg !14 {
+
 ; Function Attrs: noinline
 define weak_odr dso_local void @_ZTS14example_kernel() !dbg !14 {
-; CHECK: define weak_odr dso_local void @_ZTS14example_kernel() !dbg !14 {
+; CHECK: define weak_odr dso_local void @_ZTS14example_kernel() !dbg !15 {
 entry:
   %0 = call i64 @_ZTS14other_function(), !dbg !15
-; CHECK: %2 = call i64 @_ZTS14other_function(ptr addrspace(5) %1), !dbg !15
+; CHECK: %0 = call i64 @_ZTS14other_function(), !dbg !16
   ret void
 }

-; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset(ptr byref([3 x i32]) %0) !dbg !16 {
-; CHECK: %1 = alloca [3 x i32], align 4, addrspace(5), !dbg !17
-; CHECK: %2 = addrspacecast ptr %0 to ptr addrspace(4), !dbg !17
-; CHECK: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 %1, ptr addrspace(4) align 1 %2, i64 12, i1 false), !dbg !17
-; CHECK: %3 = call i64 @_ZTS14other_function(ptr addrspace(5) %1), !dbg !17
+; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset(ptr byref([3 x i32]) %0) !dbg !17 {
+; CHECK: call i64 @_ZTS14other_function_with_offset(ptr addrspace(5) %1), !dbg !18

 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
@@ -53,5 +52,8 @@ entry:
 !13 = !{null}
 !14 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
 !15 = !DILocation(line: 1, column: 2, scope: !14)
-; CHECK: !16 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
-; CHECK: !17 = !DILocation(line: 1, column: 2, scope: !16)
+; CHECK: !14 = distinct !DISubprogram(name: "other_function", scope: !1, file: !1, line: 3, type: !12, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+; CHECK: !15 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+; CHECK: !16 = !DILocation(line: 1, column: 2, scope: !15)
+; CHECK: !17 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+; CHECK: !18 = !DILocation(line: 1, column: 2, scope: !17)
