Skip to content

Commit c7bb288

Browse files
[SYCL][PTX][CUDA] Implicit global offset implementation (#1773)
This commit implements implicit global offset behavior for the kernels generated for the PI CUDA backend. This includes the following changes: * A new builtin `__builtin_ptx_implicit_offset` and intrinsic `int.nvvm.implicit.offset` for getting the global offset. For the `ptx-nvidiacl` target, this is used to implement the `__spirv_GlobalOffset` builtins. * A new pass that iterates over the uses of the `int.nvvm.implicit.offset` intrinsic, replacing each with a new function parameter. It then moves up the call tree, adjusting calls to functions that have this new parameter: callers that lack the parameter are given a similar one, and the parameter is passed along in the calls. Entry points are an exception: they are instead cloned, with the clone being given the new parameter and the original using an offset of `{0,0,0}` in all uses of the intrinsic or of functions with the new parameter. Any entry points that are not cloned are invariant to the offset parameter. Additionally, the PI CUDA backend now includes an offset parameter in the set of arguments for kernels. PI CUDA attempts to load the corresponding kernel both with and without the global offset parameter. If present, the kernel with the offset parameter is used only when a non-zero global offset is used.
1 parent 94b36ac commit c7bb288

File tree

17 files changed

+806
-25
lines changed

17 files changed

+806
-25
lines changed

clang/include/clang/Basic/BuiltinsNVPTX.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@ BUILTIN(__nvvm_read_ptx_sreg_pm1, "i", "n")
8989
BUILTIN(__nvvm_read_ptx_sreg_pm2, "i", "n")
9090
BUILTIN(__nvvm_read_ptx_sreg_pm3, "i", "n")
9191

92+
// SYCL
93+
BUILTIN(__builtin_ptx_implicit_offset, "Ui*", "nc")
94+
9295
// MISC
9396

9497
BUILTIN(__nvvm_prmt, "UiUiUiUi", "")

libclc/ptx-nvidiacl/libspirv/workitem/get_global_id.cl

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,16 @@
99
#include <spirv/spirv.h>
1010

1111
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_x() {
12-
return __spirv_WorkgroupId_x() * __spirv_WorkgroupSize_x() + __spirv_LocalInvocationId_x();
12+
return __spirv_WorkgroupId_x() * __spirv_WorkgroupSize_x() +
13+
__spirv_LocalInvocationId_x() + __spirv_GlobalOffset_x();
1314
}
1415

1516
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_y() {
16-
return __spirv_WorkgroupId_y() * __spirv_WorkgroupSize_y() + __spirv_LocalInvocationId_y();
17+
return __spirv_WorkgroupId_y() * __spirv_WorkgroupSize_y() +
18+
__spirv_LocalInvocationId_y() + __spirv_GlobalOffset_y();
1719
}
1820

1921
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_z() {
20-
return __spirv_WorkgroupId_z() * __spirv_WorkgroupSize_z() + __spirv_LocalInvocationId_z();
22+
return __spirv_WorkgroupId_z() * __spirv_WorkgroupSize_z() +
23+
__spirv_LocalInvocationId_z() + __spirv_GlobalOffset_z();
2124
}

libclc/ptx-nvidiacl/libspirv/workitem/get_global_offset.cl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@
1111
// Compiler support is required to provide global offset on NVPTX.
1212

1313
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_x() {
14-
return 0;
14+
return __builtin_ptx_implicit_offset()[0];
1515
}
1616

1717
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_y() {
18-
return 0;
18+
return __builtin_ptx_implicit_offset()[1];
1919
}
2020

2121
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalOffset_z() {
22-
return 0;
22+
return __builtin_ptx_implicit_offset()[2];
2323
}

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4161,4 +4161,10 @@ foreach layout_a = ["row", "col"] in {
41614161
} // layout_b
41624162
} // layout_a
41634163

4164+
// SYCL
4165+
def int_nvvm_implicit_offset :
4166+
GCCBuiltin<"__builtin_ptx_implicit_offset">,
4167+
Intrinsic<[LLVMPointerType<llvm_i32_ty>], [],
4168+
[IntrNoMem, IntrSpeculatable]>;
4169+
41644170
} // let TargetPrefix = "nvvm"

llvm/lib/Target/NVPTX/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ set(NVPTXCodeGen_sources
3333
NVVMIntrRange.cpp
3434
NVVMReflect.cpp
3535
NVPTXProxyRegErasure.cpp
36+
SYCL/GlobalOffset.cpp
3637
SYCL/LocalAccessorToSharedMemory.cpp
3738
)
3839

llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@
1616
#include "NVPTXLowerAggrCopies.h"
1717
#include "NVPTXTargetObjectFile.h"
1818
#include "NVPTXTargetTransformInfo.h"
19-
#include "TargetInfo/NVPTXTargetInfo.h"
19+
#include "SYCL/GlobalOffset.h"
2020
#include "SYCL/LocalAccessorToSharedMemory.h"
21+
#include "TargetInfo/NVPTXTargetInfo.h"
2122
#include "llvm/ADT/STLExtras.h"
2223
#include "llvm/ADT/Triple.h"
2324
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -71,6 +72,7 @@ void initializeNVPTXLowerArgsPass(PassRegistry &);
7172
void initializeNVPTXLowerAllocaPass(PassRegistry &);
7273
void initializeNVPTXProxyRegErasurePass(PassRegistry &);
7374

75+
void initializeGlobalOffsetPass(PassRegistry &);
7476
void initializeLocalAccessorToSharedMemoryPass(PassRegistry &);
7577

7678
} // end namespace llvm
@@ -94,6 +96,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
9496
initializeNVPTXProxyRegErasurePass(PR);
9597

9698
// SYCL-specific passes, needed here to be available to `opt`.
99+
initializeGlobalOffsetPass(PR);
97100
initializeLocalAccessorToSharedMemoryPass(PR);
98101
}
99102

@@ -274,6 +277,7 @@ void NVPTXPassConfig::addIRPasses() {
274277

275278
if (getTM<NVPTXTargetMachine>().getTargetTriple().getOS() == Triple::CUDA &&
276279
getTM<NVPTXTargetMachine>().getTargetTriple().getEnvironment() == Triple::SYCLDevice) {
280+
addPass(createGlobalOffsetPass());
277281
addPass(createLocalAccessorToSharedMemoryPass());
278282
}
279283

0 commit comments

Comments
 (0)