Skip to content

Commit 97f3be2

Browse files
authored
[CUDA][HIP] Improve variable registration with the new driver (#73177)
Summary: This patch adds support for registering texture / surface variables from CUDA / HIP. Additionally, we now properly track the `extern` and `const` flags that are also used in these runtime functions. This does not yet implement `managed` variables, as those seem to require some extra handling I'm not familiar with: the current offload entry isn't large enough to carry size and alignment information along with an extra global.
1 parent fb35bb4 commit 97f3be2

File tree

8 files changed

+221
-101
lines changed

8 files changed

+221
-101
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1132,26 +1132,39 @@ void CGNVCUDARuntime::createOffloadingEntries() {
11321132
for (KernelInfo &I : EmittedKernels)
11331133
llvm::offloading::emitOffloadingEntry(
11341134
M, KernelHandles[I.Kernel->getName()],
1135-
getDeviceSideName(cast<NamedDecl>(I.D)), 0,
1136-
DeviceVarFlags::OffloadGlobalEntry, Section);
1135+
getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
1136+
llvm::offloading::OffloadGlobalEntry, Section);
11371137

11381138
for (VarInfo &I : DeviceVars) {
11391139
uint64_t VarSize =
11401140
CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
1141+
int32_t Flags =
1142+
(I.Flags.isExtern()
1143+
? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
1144+
: 0) |
1145+
(I.Flags.isConstant()
1146+
? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
1147+
: 0) |
1148+
(I.Flags.isNormalized()
1149+
? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
1150+
: 0);
11411151
if (I.Flags.getKind() == DeviceVarFlags::Variable) {
11421152
llvm::offloading::emitOffloadingEntry(
11431153
M, I.Var, getDeviceSideName(I.D), VarSize,
1144-
I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry
1145-
: DeviceVarFlags::OffloadGlobalEntry,
1146-
Section);
1154+
(I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
1155+
: llvm::offloading::OffloadGlobalEntry) |
1156+
Flags,
1157+
/*Data=*/0, Section);
11471158
} else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
11481159
llvm::offloading::emitOffloadingEntry(
11491160
M, I.Var, getDeviceSideName(I.D), VarSize,
1150-
DeviceVarFlags::OffloadGlobalSurfaceEntry, Section);
1161+
llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
1162+
I.Flags.getSurfTexType(), Section);
11511163
} else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
11521164
llvm::offloading::emitOffloadingEntry(
11531165
M, I.Var, getDeviceSideName(I.D), VarSize,
1154-
DeviceVarFlags::OffloadGlobalTextureEntry, Section);
1166+
llvm::offloading::OffloadGlobalTextureEntry | Flags,
1167+
I.Flags.getSurfTexType(), Section);
11551168
}
11561169
}
11571170
}

clang/lib/CodeGen/CGCUDARuntime.h

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#include "clang/AST/GlobalDecl.h"
1919
#include "llvm/ADT/StringRef.h"
20+
#include "llvm/Frontend/Offloading/Utility.h"
2021
#include "llvm/IR/GlobalValue.h"
2122

2223
namespace llvm {
@@ -52,19 +53,6 @@ class CGCUDARuntime {
5253
Texture, // Builtin texture
5354
};
5455

55-
/// The kind flag for an offloading entry.
56-
enum OffloadEntryKindFlag : uint32_t {
57-
/// Mark the entry as a global entry. This indicates the presence of a
58-
/// kernel if the size field is zero and a variable otherwise.
59-
OffloadGlobalEntry = 0x0,
60-
/// Mark the entry as a managed global variable.
61-
OffloadGlobalManagedEntry = 0x1,
62-
/// Mark the entry as a surface variable.
63-
OffloadGlobalSurfaceEntry = 0x2,
64-
/// Mark the entry as a texture variable.
65-
OffloadGlobalTextureEntry = 0x3,
66-
};
67-
6856
private:
6957
unsigned Kind : 2;
7058
unsigned Extern : 1;

clang/test/CodeGenCUDA/offloading-entries.cu

Lines changed: 60 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,31 +17,47 @@
1717
//.
1818
// CUDA: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
1919
// CUDA: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
20-
// CUDA: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
21-
// CUDA: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
22-
// CUDA: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
23-
// CUDA: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
20+
// CUDA: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00"
21+
// CUDA: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z21__device_stub__kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
22+
// CUDA: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00"
23+
// CUDA: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
24+
// CUDA: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00"
25+
// CUDA: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "cuda_offloading_entries", align 1
26+
// CUDA: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00"
27+
// CUDA: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "cuda_offloading_entries", align 1
2428
//.
2529
// HIP: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
2630
// HIP: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z3foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1
27-
// HIP: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
28-
// HIP: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z3barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1
29-
// HIP: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
30-
// HIP: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries", align 1
31+
// HIP: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00"
32+
// HIP: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z6kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1
33+
// HIP: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00"
34+
// HIP: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries", align 1
35+
// HIP: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00"
36+
// HIP: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "hip_offloading_entries", align 1
37+
// HIP: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00"
38+
// HIP: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "hip_offloading_entries", align 1
3139
//.
3240
// CUDA-COFF: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
3341
// CUDA-COFF: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
34-
// CUDA-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
35-
// CUDA-COFF: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
36-
// CUDA-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
37-
// CUDA-COFF: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
42+
// CUDA-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00"
43+
// CUDA-COFF: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z21__device_stub__kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
44+
// CUDA-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00"
45+
// CUDA-COFF: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
46+
// CUDA-COFF: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00"
47+
// CUDA-COFF: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "cuda_offloading_entries$OE", align 1
48+
// CUDA-COFF: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00"
49+
// CUDA-COFF: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "cuda_offloading_entries$OE", align 1
3850
//.
3951
// HIP-COFF: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
4052
// HIP-COFF: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z3foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
41-
// HIP-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
42-
// HIP-COFF: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z3barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
43-
// HIP-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
44-
// HIP-COFF: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
53+
// HIP-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00"
54+
// HIP-COFF: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z6kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
55+
// HIP-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00"
56+
// HIP-COFF: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
57+
// HIP-COFF: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00"
58+
// HIP-COFF: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "hip_offloading_entries$OE", align 1
59+
// HIP-COFF: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00"
60+
// HIP-COFF: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "hip_offloading_entries$OE", align 1
4561
//.
4662
// CUDA-LABEL: @_Z18__device_stub__foov(
4763
// CUDA-NEXT: entry:
@@ -72,34 +88,52 @@
7288
// HIP-COFF-NEXT: ret void
7389
//
7490
__global__ void foo() {}
91+
__device__ int var = 1;
92+
const __device__ int constant = 1;
93+
extern __device__ int external;
7594

76-
// CUDA-LABEL: @_Z18__device_stub__barv(
95+
// CUDA-LABEL: @_Z21__device_stub__kernelv(
7796
// CUDA-NEXT: entry:
78-
// CUDA-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv)
97+
// CUDA-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z21__device_stub__kernelv)
7998
// CUDA-NEXT: br label [[SETUP_END:%.*]]
8099
// CUDA: setup.end:
81100
// CUDA-NEXT: ret void
82101
//
83-
// HIP-LABEL: @_Z18__device_stub__barv(
102+
// HIP-LABEL: @_Z21__device_stub__kernelv(
84103
// HIP-NEXT: entry:
85-
// HIP-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z3barv)
104+
// HIP-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z6kernelv)
86105
// HIP-NEXT: br label [[SETUP_END:%.*]]
87106
// HIP: setup.end:
88107
// HIP-NEXT: ret void
89108
//
90-
// CUDA-COFF-LABEL: @_Z18__device_stub__barv(
109+
// CUDA-COFF-LABEL: @_Z21__device_stub__kernelv(
91110
// CUDA-COFF-NEXT: entry:
92-
// CUDA-COFF-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv)
111+
// CUDA-COFF-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z21__device_stub__kernelv)
93112
// CUDA-COFF-NEXT: br label [[SETUP_END:%.*]]
94113
// CUDA-COFF: setup.end:
95114
// CUDA-COFF-NEXT: ret void
96115
//
97-
// HIP-COFF-LABEL: @_Z18__device_stub__barv(
116+
// HIP-COFF-LABEL: @_Z21__device_stub__kernelv(
98117
// HIP-COFF-NEXT: entry:
99-
// HIP-COFF-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z3barv)
118+
// HIP-COFF-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z6kernelv)
100119
// HIP-COFF-NEXT: br label [[SETUP_END:%.*]]
101120
// HIP-COFF: setup.end:
102121
// HIP-COFF-NEXT: ret void
103122
//
104-
__global__ void bar() {}
105-
__device__ int x = 1;
123+
__global__ void kernel() { external = 1; }
124+
125+
struct surfaceReference { int desc; };
126+
127+
template <typename T, int dim = 1>
128+
struct __attribute__((device_builtin_surface_type)) surface : public surfaceReference {};
129+
130+
surface<void> surf;
131+
132+
struct textureReference {
133+
int desc;
134+
};
135+
136+
template <typename T, int dim = 1, int mode = 0>
137+
struct __attribute__((device_builtin_texture_type)) texture : public textureReference {};
138+
139+
texture<void> tex;

0 commit comments

Comments
 (0)