Skip to content

[SYCL] Correctly handle debug information in global offset pass #16963

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions llvm/include/llvm/SYCLLowerIR/GlobalOffset.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,24 @@ class GlobalOffsetPass : public PassInfoMixin<GlobalOffsetPass> {
static StringRef getPassName() { return "Add implicit SYCL global offset"; }

private:
/// In order to correctly update the debug info (if present) we have to
/// populate the global value-to-value map with all original-to-cloned
/// functions. To achieve that, start from the users of the implicit offset
/// intrinsic, walk up through the callers of every function found this way,
/// and create all the clones (without providing the body).
///
/// \param KCache Kernel bookkeeping helper.
/// \param ImplicitOffsetIntrinsic Implicit offset intrinsic, provides a
/// starting point in the search for all the functions that need to be cloned.
void createClonesAndPopulateVMap(const TargetHelpers::KernelCache &KCache,
Function *ImplicitOffsetIntrinsic);

/// After the execution of this function, the module to which the kernel
/// `Func` belongs, contains both the original function and its clone with the
/// signature extended with the implicit offset parameter and `_with_offset`
/// appended to the name.
///
/// \param Func Kernel to be processed.
/// \param KCache Kernel bookkeeping helper.
void processKernelEntryPoint(Function *Func,
TargetHelpers::KernelCache &KCache);

Expand All @@ -61,6 +73,7 @@ class GlobalOffsetPass : public PassInfoMixin<GlobalOffsetPass> {
/// `nullptr`) - this is used to know whether calls to it inside clones need
/// to have the implicit parameter added to it or be replaced with the
/// implicit parameter.
/// \param KCache Kernel bookkeeping helper.
void addImplicitParameterToCallers(Module &M, Value *Callee,
Function *CalleeWithImplicitParam,
TargetHelpers::KernelCache &KCache);
Expand Down Expand Up @@ -99,6 +112,9 @@ class GlobalOffsetPass : public PassInfoMixin<GlobalOffsetPass> {
llvm::Type *KernelImplicitArgumentType = nullptr;
/// A type used for the alloca holding the values of global offsets.
llvm::Type *ImplicitOffsetPtrType = nullptr;
/// Track newly created DISubprograms (that are attached to cloned
/// functions); for ease of mapping, the old function's name is used as the
/// key.
/// NOTE(review): the keys are StringRefs, so the map presumably relies on the
/// original functions' names staying alive and unchanged while it is in use -
/// confirm against the code that fills and queries it.
llvm::DenseMap<StringRef, DISubprogram *> DISubprogramMap;

unsigned TargetAS = 0;
};
Expand Down
103 changes: 79 additions & 24 deletions llvm/lib/SYCLLowerIR/GlobalOffset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//

#include "llvm/SYCLLowerIR/GlobalOffset.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
Expand All @@ -18,6 +18,7 @@
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <deque>

using namespace llvm;

Expand Down Expand Up @@ -91,6 +92,72 @@ static void validateKernels(Module &M, TargetHelpers::KernelCache &KCache) {
}
}

/// Pre-create an empty `<name>_with_offset` clone for every function that
/// reaches the implicit offset intrinsic through call instructions, and record
/// the original-to-clone mapping (functions and their arguments) in
/// `GlobalVMap`. Only the declarations are created here; no body is cloned.
///
/// \param KCache Kernel bookkeeping helper, used to tell kernels apart from
/// regular functions when picking the type of the extra parameter.
/// \param ImplicitOffsetIntrinsic Implicit offset intrinsic whose users seed
/// the traversal.
void GlobalOffsetPass::createClonesAndPopulateVMap(
    const TargetHelpers::KernelCache &KCache,
    Function *ImplicitOffsetIntrinsic) {
  // FIFO queue of users still to be inspected, seeded with the direct users of
  // the intrinsic.
  std::deque<User *> Pending(ImplicitOffsetIntrinsic->user_begin(),
                             ImplicitOffsetIntrinsic->user_end());

  while (!Pending.empty()) {
    User *Current = Pending.front();
    Pending.pop_front();

    // Only call instructions lead to a function that has to be cloned.
    auto *Call = dyn_cast<CallInst>(Current);
    if (!Call)
      continue;

    // The function containing the call is the one that needs a clone; skip it
    // if a clone has been registered already.
    Function *Func = Call->getFunction();
    if (GlobalVMap.count(Func) != 0)
      continue;

    // Kernels receive a pointer to the kernel implicit argument type, every
    // other function receives the implicit offset pointer type.
    Type *OffsetParamTy = ImplicitOffsetPtrType;
    if (KCache.isKernel(*Func))
      OffsetParamTy = KernelImplicitArgumentType->getPointerTo();

    // The clone takes all of the original parameters followed by the offset
    // parameter. The latter must be the same type as returned by
    // `llvm.{amdgcn|nvvm}.implicit.offset`.
    SmallVector<Type *, 8> ParamTypes;
    for (const Argument &Arg : Func->args())
      ParamTypes.push_back(Arg.getType());
    ParamTypes.push_back(OffsetParamTy);

    FunctionType *OldTy = Func->getFunctionType();
    if (OldTy->isVarArg())
      llvm_unreachable("Variadic arguments prohibited in SYCL");
    FunctionType *CloneTy = FunctionType::get(OldTy->getReturnType(),
                                              ParamTypes, OldTy->isVarArg());

    // Create the (body-less) clone.
    Function *Clone = Function::Create(CloneTy, Func->getLinkage(),
                                       Func->getAddressSpace());
    Clone->setName(Func->getName() + "_with_offset");
    // Make sure the clone carries no subprogram, as one taken from the
    // original would be pointing to incorrect data.
    if (Func->getSubprogram())
      Clone->setSubprogram(nullptr);

    // Preserve the function ordering: the clone is placed directly after the
    // original.
    Module *ParentModule = Func->getParent();
    ParentModule->getFunctionList().insertAfter(Func->getIterator(), Clone);

    // Register the original arguments (the trailing offset parameter of the
    // clone has no counterpart) and the function itself in the global map.
    for (unsigned Idx = 0, NumArgs = Func->arg_size(); Idx != NumArgs; ++Idx)
      GlobalVMap[Func->getArg(Idx)] = Clone->getArg(Idx);
    GlobalVMap[Func] = Clone;

    // Callers of this function need clones as well: queue all of its users.
    Pending.insert(Pending.end(), Func->user_begin(), Func->user_end());
  }
}

// New PM implementation.
PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
// Only run this pass on SYCL device code
Expand Down Expand Up @@ -128,6 +195,8 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
// Validate kernels
validateKernels(M, KCache);

createClonesAndPopulateVMap(KCache, ImplicitOffsetIntrinsic);

// Add implicit parameters to all direct and indirect users of the offset
addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr, KCache);
}
Expand Down Expand Up @@ -163,6 +232,7 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
assert(ImplicitOffsetIntrinsic->use_empty() &&
"Not all uses of intrinsic removed");
ImplicitOffsetIntrinsic->eraseFromParent();

return PreservedAnalyses::none();
}

Expand Down Expand Up @@ -226,10 +296,10 @@ void GlobalOffsetPass::addImplicitParameterToCallers(
if (AlreadyProcessed) {
NewFunc = Caller;
} else {
std::tie(NewFunc, ImplicitOffset) =
addOffsetArgumentToFunction(M, Caller,
/*KernelImplicitArgumentType*/ nullptr,
/*KeepOriginal=*/true);
std::tie(NewFunc, ImplicitOffset) = addOffsetArgumentToFunction(
M, Caller,
/*KernelImplicitArgumentType*/ nullptr,
/*KeepOriginal=*/true, /*IsKernel=*/false);
}
CallToOld = cast<CallInst>(GlobalVMap[CallToOld]);
if (!CalleeWithImplicitParam) {
Expand Down Expand Up @@ -296,32 +366,17 @@ std::pair<Function *, Value *> GlobalOffsetPass::addOffsetArgumentToFunction(
AttributeList NAttrs =
AttributeList::get(Func->getContext(), FuncAttrs.getFnAttrs(),
FuncAttrs.getRetAttrs(), ArgumentAttributes);
assert(!FuncTy->isVarArg() && "Variadic arguments prohibited in SYCL");
FunctionType *NewFuncTy =
FunctionType::get(FuncTy->getReturnType(), Arguments, FuncTy->isVarArg());

Function *NewFunc =
Function::Create(NewFuncTy, Func->getLinkage(), Func->getAddressSpace());

// Keep original function ordering.
M.getFunctionList().insertAfter(Func->getIterator(), NewFunc);
assert(GlobalVMap.count(Func) != 0 &&
"All relevant functions must be prepared ahead of time.");
Function *NewFunc = dyn_cast<Function>(GlobalVMap[Func]);

Value *ImplicitOffset = nullptr;
bool ImplicitOffsetAllocaInserted = false;
if (KeepOriginal) {
// TODO: Are there better naming alternatives that allow for unmangling?
NewFunc->setName(Func->getName() + "_with_offset");

for (Function::arg_iterator FuncArg = Func->arg_begin(),
FuncEnd = Func->arg_end(),
NewFuncArg = NewFunc->arg_begin();
FuncArg != FuncEnd; ++FuncArg, ++NewFuncArg) {
GlobalVMap[FuncArg] = NewFuncArg;
}

SmallVector<ReturnInst *, 8> Returns;
CloneFunctionInto(NewFunc, Func, GlobalVMap,
CloneFunctionChangeType::GlobalChanges, Returns);

// In order to keep the signatures of functions called by the kernel
// unified, the pass has to copy global offset to an array allocated in
// addrspace(3). This is done as kernels can't allocate and fill the
Expand Down
35 changes: 18 additions & 17 deletions llvm/test/CodeGen/AMDGPU/global-offset-dbg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -37,33 +37,33 @@ entry:
!14 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
!15 = !DILocation(line: 1, column: 2, scope: !14)
; CHECK-LABEL: define i64 @_ZTS14other_function(
; CHECK-SAME: ) !dbg [[DBG5:![0-9]+]] {
; CHECK-SAME: ) !dbg [[DBG6:![0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 0 to i64
; CHECK-NEXT: ret i64 [[TMP1]]
;
;
; CHECK-LABEL: define i64 @_ZTS14other_function_with_offset(
; CHECK-SAME: ptr addrspace(5) [[TMP0:%.*]]) !dbg [[DBG8:![0-9]+]] {
; CHECK-SAME: ptr addrspace(5) [[TMP0:%.*]]) !dbg [[DBG9:![0-9]+]] {
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[TMP0]], i64 2
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
; CHECK-NEXT: ret i64 [[TMP4]]
;
;
; CHECK-LABEL: define amdgpu_kernel void @_ZTS14example_kernel(
; CHECK-SAME: ) !dbg [[DBG9:![0-9]+]] {
; CHECK-SAME: ) !dbg [[DBG10:![0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @_ZTS14other_function(), !dbg [[DBG10:![0-9]+]]
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @_ZTS14other_function(), !dbg [[DBG11:![0-9]+]]
; CHECK-NEXT: ret void
;
;
; CHECK-LABEL: define amdgpu_kernel void @_ZTS14example_kernel_with_offset(
; CHECK-SAME: ptr byref([3 x i32]) [[TMP0:%.*]]) !dbg [[DBG11:![0-9]+]] {
; CHECK-SAME: ptr byref([3 x i32]) [[TMP0:%.*]]) !dbg [[DBG12:![0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP1:%.*]] = alloca [3 x i32], align 4, addrspace(5), !dbg [[DBG12:![0-9]+]]
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(4), !dbg [[DBG12]]
; CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 [[TMP1]], ptr addrspace(4) align 1 [[TMP2]], i64 12, i1 false), !dbg [[DBG12]]
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @_ZTS14other_function_with_offset(ptr addrspace(5) [[TMP1]]), !dbg [[DBG12]]
; CHECK-NEXT: [[TMP1:%.*]] = alloca [3 x i32], align 4, addrspace(5), !dbg [[DBG13:![0-9]+]]
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(4), !dbg [[DBG13]]
; CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 [[TMP1]], ptr addrspace(4) align 1 [[TMP2]], i64 12, i1 false), !dbg [[DBG13]]
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @_ZTS14other_function_with_offset(ptr addrspace(5) [[TMP1]]), !dbg [[DBG13]]
; CHECK-NEXT: ret void
;
;.
Expand All @@ -74,12 +74,13 @@ entry:
; CHECK: [[META2]] = !{}
; CHECK: [[META3:![0-9]+]] = !{i32 2, !"Dwarf Version", i32 4}
; CHECK: [[META4:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
; CHECK: [[DBG5]] = distinct !DISubprogram(name: "other_function", scope: [[META1]], file: [[META1]], line: 3, type: [[META6:![0-9]+]], scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]])
; CHECK: [[META6]] = !DISubroutineType(types: [[META7:![0-9]+]])
; CHECK: [[META7]] = !{null}
; CHECK: [[DBG8]] = distinct !DISubprogram(name: "other_function", scope: [[META1]], file: [[META1]], line: 3, type: [[META6]], scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]])
; CHECK: [[DBG9]] = distinct !DISubprogram(name: "example_kernel", scope: [[META1]], file: [[META1]], line: 10, type: [[META6]], scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]])
; CHECK: [[DBG10]] = !DILocation(line: 1, column: 2, scope: [[DBG9]])
; CHECK: [[DBG11]] = distinct !DISubprogram(name: "example_kernel", scope: [[META1]], file: [[META1]], line: 10, type: [[META6]], scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]])
; CHECK: [[DBG12]] = !DILocation(line: 1, column: 2, scope: [[DBG11]])
; CHECK: [[META5:![0-9]+]] = !{i32 1, !"sycl-device", i32 1}
; CHECK: [[DBG6]] = distinct !DISubprogram(name: "other_function", scope: [[META1]], file: [[META1]], line: 3, type: [[META7:![0-9]+]], scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]])
; CHECK: [[META7]] = !DISubroutineType(types: [[META8:![0-9]+]])
; CHECK: [[META8]] = !{null}
; CHECK: [[DBG9]] = distinct !DISubprogram(name: "other_function", scope: [[META1]], file: [[META1]], line: 3, type: [[META7]], scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]])
; CHECK: [[DBG10]] = distinct !DISubprogram(name: "example_kernel", scope: [[META1]], file: [[META1]], line: 10, type: [[META7]], scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]])
; CHECK: [[DBG11]] = !DILocation(line: 1, column: 2, scope: [[DBG10]])
; CHECK: [[DBG12]] = distinct !DISubprogram(name: "example_kernel", scope: [[META1]], file: [[META1]], line: 10, type: [[META7]], scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]])
; CHECK: [[DBG13]] = !DILocation(line: 1, column: 2, scope: [[DBG12]])
;.
69 changes: 69 additions & 0 deletions llvm/test/CodeGen/NVPTX/global-offset-dbg-in-func-body.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5
; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s

; Make sure that the debug nodes inside a function are correctly updated in a
; cloned function, such that they point to the clone, not to the original
; function. Notice how DBG11 references DBG10 (and not DBG5, which is the
; original function).

target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; Test input: a function that calls the implicit offset intrinsic. Its
; subprogram is !5, and the ctaid.y call below carries a debug location (!9)
; scoped to that subprogram.
define i64 @_ZN7__spirv21getGlobalInvocationIdILi1EEEmv() !dbg !5 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !9
; The intrinsic call itself has no debug location attached.
%1 = call ptr @llvm.nvvm.implicit.offset()
ret i64 0
}

; Function Attrs: nounwind speculatable memory(none)
declare ptr @llvm.nvvm.implicit.offset() #0

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1

attributes #0 = { nounwind speculatable memory(none) }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}

!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 0.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "test.cpp", directory: "/")
!2 = !{}
!3 = !{i32 2, !"Debug Info Version", i32 3}
!4 = !{i32 1, !"sycl-device", i32 1}
!5 = distinct !DISubprogram(name: "getGlobalInvocationId<1>", linkageName: "_ZN7__spirv21getGlobalInvocationIdILi1EEEmv", scope: !7, file: !6, line: 201, type: !8, scopeLine: 201, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, templateParams: !2)
!6 = !DIFile(filename: "header.hpp", directory: "/")
!7 = !DINamespace(name: "__spirv", scope: null)
!8 = distinct !DISubroutineType(types: !2)
!9 = !DILocation(line: 201, column: 1, scope: !5)
; CHECK-LABEL: define i64 @_ZN7__spirv21getGlobalInvocationIdILi1EEEmv(
; CHECK-SAME: ) !dbg [[DBG5:![0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg [[DBG9:![0-9]+]]
; CHECK-NEXT: ret i64 0
;
;
; CHECK-LABEL: define i64 @_ZN7__spirv21getGlobalInvocationIdILi1EEEmv_with_offset(
; CHECK-SAME: ptr [[TMP0:%.*]]) !dbg [[DBG10:![0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg [[DBG11:![0-9]+]]
; CHECK-NEXT: ret i64 0
;
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], retainedTypes: [[META2]], globals: [[META2]], imports: [[META2]], splitDebugInlining: false, nameTableKind: None)
; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: {{.*}})
; CHECK: [[META2]] = !{}
; CHECK: [[META3:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
; CHECK: [[META4:![0-9]+]] = !{i32 1, !"sycl-device", i32 1}
; CHECK: [[DBG5]] = distinct !DISubprogram(name: "getGlobalInvocationId<1>", linkageName: "_ZN7__spirv21getGlobalInvocationIdILi1EEEmv", scope: [[META7:![0-9]+]], file: [[META6:![0-9]+]], line: 201, type: [[META8:![0-9]+]], scopeLine: 201, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], templateParams: [[META2]])
; CHECK: [[META6]] = !DIFile(filename: "header.hpp", directory: {{.*}})
; CHECK: [[META7]] = !DINamespace(name: "__spirv", scope: null)
; CHECK: [[META8]] = distinct !DISubroutineType(types: [[META2]])
; CHECK: [[DBG9]] = !DILocation(line: 201, column: 1, scope: [[DBG5]])
; CHECK: [[DBG10]] = distinct !DISubprogram(name: "getGlobalInvocationId<1>", linkageName: "_ZN7__spirv21getGlobalInvocationIdILi1EEEmv", scope: [[META7]], file: [[META6]], line: 201, type: [[META8]], scopeLine: 201, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], templateParams: [[META2]])
; CHECK: [[DBG11]] = !DILocation(line: 201, column: 1, scope: [[DBG10]])
;.
Loading