Commit c62214d
[CUDA] add support for the new kernel launch API in CUDA-9.2+.
Instead of calling the CUDA runtime to arrange function arguments, the new API constructs the arguments in a local array and launches kernels with __cudaLaunchKernel(). The old API has been deprecated and is expected to go away in the next CUDA release.

Differential Revision: https://reviews.llvm.org/D57488
llvm-svn: 352799

Parent: 8fa28a0
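
For orientation, here is a rough CUDA/C++ sketch of the host-side code the two launch schemes correspond to, for a launch like kernel<<<grid, block, shmem, stream>>>(a, b). This is illustrative only, not the compiler's actual output; the stub names, parameter list, argument offsets, and the __cudaPopCallConfiguration prototype (inferred from the codegen below) are assumptions:

  #include <cuda_runtime.h>

  __global__ void kernel(int a, float b);

  // Legacy sequence (SDK < 9.2). The call site first runs
  //   cudaConfigureCall(grid, block, shmem, stream);
  // and then calls the stub, which registers each argument with the pending
  // configuration and launches the kernel by stub address:
  void stub_legacy(int a, float b) {
    if (cudaSetupArgument(&a, sizeof(a), /*offset=*/0) == cudaSuccess &&
        cudaSetupArgument(&b, sizeof(b), /*offset=*/4) == cudaSuccess)
      (void)cudaLaunch((const void *)&stub_legacy);
  }

  // New sequence (SDK >= 9.2). The call site first runs
  //   __cudaPushCallConfiguration(grid, block, shmem, stream);
  // and then the stub pops that configuration and launches with an array of
  // pointers to its own parameters:
  extern "C" unsigned __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim,
                                                 size_t *sharedMem, void *stream);
  void stub_new(int a, float b) {
    void *args[] = {&a, &b};
    dim3 grid, block;
    size_t shmem;
    cudaStream_t stream;
    __cudaPopCallConfiguration(&grid, &block, &shmem, &stream);
    (void)cudaLaunchKernel((const void *)&stub_new, grid, block, args, shmem,
                           stream);
  }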

14 files changed: +250, -46 lines

clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 1 addition & 1 deletion
@@ -7143,7 +7143,7 @@ def err_kern_type_not_void_return : Error<
 def err_kern_is_nonstatic_method : Error<
   "kernel function %0 must be a free function or static member function">;
 def err_config_scalar_return : Error<
-  "CUDA special function 'cudaConfigureCall' must have scalar return type">;
+  "CUDA special function '%0' must have scalar return type">;
 def err_kern_call_not_global_function : Error<
   "kernel call to non-global function %0">;
 def err_global_call_not_config : Error<

clang/include/clang/Sema/Sema.h

Lines changed: 5 additions & 0 deletions
@@ -10348,6 +10348,11 @@ class Sema {
   /// Copies target attributes from the template TD to the function FD.
   void inheritCUDATargetAttrs(FunctionDecl *FD, const FunctionTemplateDecl &TD);

+  /// Returns the name of the launch configuration function. This is the name
+  /// of the function that will be called to configure kernel call, with the
+  /// parameters specified via <<<>>>.
+  std::string getCudaConfigureFuncName() const;
+
   /// \name Code completion
   //@{
   /// Describes the context in which code completion occurs.

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 106 additions & 4 deletions
@@ -15,6 +15,8 @@
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "clang/AST/Decl.h"
+#include "clang/Basic/Cuda.h"
+#include "clang/CodeGen/CodeGenABITypes.h"
 #include "clang/CodeGen/ConstantInitBuilder.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -102,7 +104,8 @@ class CGNVCUDARuntime : public CGCUDARuntime {
     return DummyFunc;
   }

-  void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
+  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
+  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);

 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
@@ -187,11 +190,110 @@ llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                      FunctionArgList &Args) {
   EmittedKernels.push_back(CGF.CurFn);
-  emitDeviceStubBody(CGF, Args);
+  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
+                         CudaFeature::CUDA_USES_NEW_LAUNCH))
+    emitDeviceStubBodyNew(CGF, Args);
+  else
+    emitDeviceStubBodyLegacy(CGF, Args);
 }

-void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
-                                         FunctionArgList &Args) {
+// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
+// array and kernels are launched using cudaLaunchKernel().
+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
+                                            FunctionArgList &Args) {
+  // Build the shadow stack entry at the very start of the function.
+
+  // Calculate amount of space we will need for all arguments. If we have no
+  // args, allocate a single pointer so we still have a valid pointer to the
+  // argument array that we can pass to runtime, even if it will be unused.
+  Address KernelArgs = CGF.CreateTempAlloca(
+      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
+      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
+  // Store pointers to the arguments in a locally allocated launch_args.
+  for (unsigned i = 0; i < Args.size(); ++i) {
+    llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
+    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
+    CGF.Builder.CreateDefaultAlignedStore(
+        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
+  }
+
+  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
+
+  // Lookup cudaLaunchKernel function.
+  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+  //                              void **args, size_t sharedMem,
+  //                              cudaStream_t stream);
+  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
+  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
+  IdentifierInfo &cudaLaunchKernelII =
+      CGM.getContext().Idents.get("cudaLaunchKernel");
+  FunctionDecl *cudaLaunchKernelFD = nullptr;
+  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
+    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
+      cudaLaunchKernelFD = FD;
+  }
+
+  if (cudaLaunchKernelFD == nullptr) {
+    CGM.Error(CGF.CurFuncDecl->getLocation(),
+              "Can't find declaration for cudaLaunchKernel()");
+    return;
+  }
+  // Create temporary dim3 grid_dim, block_dim.
+  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
+  QualType Dim3Ty = GridDimParam->getType();
+  Address GridDim =
+      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
+  Address BlockDim =
+      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
+  Address ShmemSize =
+      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
+  Address Stream =
+      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
+  llvm::Constant *cudaPopConfigFn = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy,
+                              {/*gridDim=*/GridDim.getType(),
+                               /*blockDim=*/BlockDim.getType(),
+                               /*ShmemSize=*/ShmemSize.getType(),
+                               /*Stream=*/Stream.getType()},
+                              /*isVarArg=*/false),
+      "__cudaPopCallConfiguration");
+
+  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
+                              {GridDim.getPointer(), BlockDim.getPointer(),
+                               ShmemSize.getPointer(), Stream.getPointer()});
+
+  // Emit the call to cudaLaunch
+  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
+  CallArgList LaunchKernelArgs;
+  LaunchKernelArgs.add(RValue::get(Kernel),
+                       cudaLaunchKernelFD->getParamDecl(0)->getType());
+  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
+  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
+  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
+                       cudaLaunchKernelFD->getParamDecl(3)->getType());
+  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
+                       cudaLaunchKernelFD->getParamDecl(4)->getType());
+  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
+                       cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+  QualType QT = cudaLaunchKernelFD->getType();
+  QualType CQT = QT.getCanonicalType();
+  llvm::Type *Ty = CGM.getTypes().ConvertFunctionType(CQT, cudaLaunchKernelFD);
+  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
+
+  const CGFunctionInfo &FI =
+      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
+  llvm::Constant *cudaLaunchKernelFn =
+      CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
+  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
+               LaunchKernelArgs);
+  CGF.EmitBranch(EndBlock);
+
+  CGF.EmitBlock(EndBlock);
+}
+
+void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
+                                               FunctionArgList &Args) {
   // Emit a call to cudaSetupArgument for each arg in Args.
   llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
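
Note: emitDeviceStubBodyNew finds cudaLaunchKernel by name lookup in the translation unit and reuses that declaration's parameter types for the emitted call, so a declaration must be visible or codegen reports the "Can't find declaration for cudaLaunchKernel()" error above. In a normal CUDA compilation the declaration comes from the SDK headers pulled in by __clang_cuda_runtime_wrapper.h; the test header below adds its own. A minimal declaration satisfying the lookup looks like:

  extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
                                          dim3 blockDim, void **args,
                                          size_t sharedMem, cudaStream_t stream);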

clang/lib/Headers/__clang_cuda_runtime_wrapper.h

Lines changed: 10 additions & 0 deletions
@@ -426,5 +426,15 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
 #pragma pop_macro("__USE_FAST_MATH__")
 #pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")

+// CUDA runtime uses this undocumented function to access kernel launch
+// configuration. The declaration is in crt/device_functions.h but that file
+// includes a lot of other stuff we don't want. Instead, we'll provide our own
+// declaration for it here.
+#if CUDA_VERSION >= 9020
+extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+                                                size_t sharedMem = 0,
+                                                void *stream = 0);
+#endif
+
 #endif // __CUDA__
 #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
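
With this declaration in scope, Sema picks __cudaPushCallConfiguration as the configuration function for CUDA 9.2+ (see getCudaConfigureFuncName in SemaCUDA.cpp below), so a triple-chevron call lowers to a guarded call of the device stub. A hedged sketch of the call-site shape the kernel-call.cu test checks (icmp/br/call); the variable names are invented and the exact branch condition is simplified:

  // g1<<<grid, block, shmem, stream>>>(x) becomes, approximately:
  if (__cudaPushCallConfiguration(grid, block, shmem, stream) == 0)
    g1_stub(x); // stub pops the config, then calls cudaLaunchKernel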

clang/lib/Sema/SemaCUDA.cpp

Lines changed: 16 additions & 3 deletions
@@ -13,6 +13,7 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/ExprCXX.h"
+#include "clang/Basic/Cuda.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Sema.h"
@@ -41,9 +42,8 @@ ExprResult Sema::ActOnCUDAExecConfigExpr(Scope *S, SourceLocation LLLLoc,
                                          SourceLocation GGGLoc) {
   FunctionDecl *ConfigDecl = Context.getcudaConfigureCallDecl();
   if (!ConfigDecl)
-    return ExprError(
-        Diag(LLLLoc, diag::err_undeclared_var_use)
-        << (getLangOpts().HIP ? "hipConfigureCall" : "cudaConfigureCall"));
+    return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
+                     << getCudaConfigureFuncName());
   QualType ConfigQTy = ConfigDecl->getType();

   DeclRefExpr *ConfigDR = new (Context)
@@ -957,3 +957,16 @@ void Sema::inheritCUDATargetAttrs(FunctionDecl *FD,
   copyAttrIfPresent<CUDAHostAttr>(*this, FD, TemplateFD);
   copyAttrIfPresent<CUDADeviceAttr>(*this, FD, TemplateFD);
 }
+
+std::string Sema::getCudaConfigureFuncName() const {
+  if (getLangOpts().HIP)
+    return "hipConfigureCall";
+
+  // New CUDA kernel launch sequence.
+  if (CudaFeatureEnabled(Context.getTargetInfo().getSDKVersion(),
+                         CudaFeature::CUDA_USES_NEW_LAUNCH))
+    return "__cudaPushCallConfiguration";
+
+  // Legacy CUDA kernel configuration call
+  return "cudaConfigureCall";
+}

clang/lib/Sema/SemaDecl.cpp

Lines changed: 3 additions & 4 deletions
@@ -9146,13 +9146,12 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,

   if (getLangOpts().CUDA) {
     IdentifierInfo *II = NewFD->getIdentifier();
-    if (II &&
-        II->isStr(getLangOpts().HIP ? "hipConfigureCall"
-                                    : "cudaConfigureCall") &&
+    if (II && II->isStr(getCudaConfigureFuncName()) &&
         !NewFD->isInvalidDecl() &&
         NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
       if (!R->getAs<FunctionType>()->getReturnType()->isScalarType())
-        Diag(NewFD->getLocation(), diag::err_config_scalar_return);
+        Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+            << getCudaConfigureFuncName();
       Context.setcudaConfigureCallDecl(NewFD);
     }

clang/test/CodeGenCUDA/Inputs/cuda.h

Lines changed: 10 additions & 3 deletions
@@ -15,13 +15,20 @@ struct dim3 {
 };

 typedef struct cudaStream *cudaStream_t;
-
+typedef enum cudaError {} cudaError_t;
 #ifdef __HIP__
 int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
                      cudaStream_t stream = 0);
 #else
-int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
-                      cudaStream_t stream = 0);
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+                                 size_t sharedSize = 0,
+                                 cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                           size_t sharedSize = 0,
+                                           cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
 #endif

 extern "C" __device__ int printf(const char*, ...);

clang/test/CodeGenCUDA/device-stub.cu

Lines changed: 53 additions & 12 deletions
@@ -1,14 +1,36 @@
 // RUN: echo "GPU binary would be here" > %t
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN:     -fcuda-include-gpubinary %t -o - \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,CUDA,CUDANORDC
+// RUN:     -target-sdk-version=8.0 -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s \
+// RUN:     --check-prefixes=ALL,NORDC,CUDA,CUDANORDC,CUDA-OLD
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN:     -fcuda-include-gpubinary %t -o - -DNOGLOBALS \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS
+// RUN:     -target-sdk-version=8.0 -fcuda-include-gpubinary %t \
+// RUN:     -o - -DNOGLOBALS \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s \
+// RUN:     -check-prefixes=NOGLOBALS,CUDANOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
-// RUN:     -fgpu-rdc -fcuda-include-gpubinary %t -o - \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,CUDA,CUDARDC
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
+// RUN:     -target-sdk-version=8.0 -fgpu-rdc -fcuda-include-gpubinary %t \
+// RUN:     -o - \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s \
+// RUN:     --check-prefixes=ALL,RDC,CUDA,CUDARDC,CUDA-OLD
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -target-sdk-version=8.0 -o - \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
+// RUN:     --check-prefixes=ALL,NORDC,CUDA,CUDANORDC,CUDA-NEW
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - -DNOGLOBALS \
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s \
+// RUN:     --check-prefixes=NOGLOBALS,CUDANOGLOBALS
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
+// RUN:     --check-prefixes=ALL,RDC,CUDA,CUDARDC,CUDA-NEW
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -target-sdk-version=9.2 -o - \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN

 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
@@ -103,15 +125,34 @@ void use_pointers() {
 // by a call to cudaLaunch.

 // ALL: define{{.*}}kernelfunc
-// ALL: call{{.*}}[[PREFIX]]SetupArgument
-// ALL: call{{.*}}[[PREFIX]]SetupArgument
-// ALL: call{{.*}}[[PREFIX]]SetupArgument
-// ALL: call{{.*}}[[PREFIX]]Launch
+
+// New launch sequence stores arguments into local buffer and passes array of
+// pointers to them directly to cudaLaunchKernel
+// CUDA-NEW: alloca
+// CUDA-NEW: store
+// CUDA-NEW: store
+// CUDA-NEW: store
+// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
+// CUDA-NEW: call{{.*}}cudaLaunchKernel
+
+// Legacy style launch sequence sets up arguments by passing them to
+// [cuda|hip]SetupArgument.
+// CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument
+// CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument
+// CUDA-OLD: call{{.*}}[[PREFIX]]SetupArgument
+// CUDA-OLD: call{{.*}}[[PREFIX]]Launch
+
+// HIP: call{{.*}}[[PREFIX]]SetupArgument
+// HIP: call{{.*}}[[PREFIX]]SetupArgument
+// HIP: call{{.*}}[[PREFIX]]SetupArgument
+// HIP: call{{.*}}[[PREFIX]]Launch
 __global__ void kernelfunc(int i, int j, int k) {}

 // Test that we've built correct kernel launch sequence.
 // ALL: define{{.*}}hostfunc
-// ALL: call{{.*}}[[PREFIX]]ConfigureCall
+// CUDA-OLD: call{{.*}}[[PREFIX]]ConfigureCall
+// CUDA-NEW: call{{.*}}__cudaPushCallConfiguration
+// HIP: call{{.*}}[[PREFIX]]ConfigureCall
 // ALL: call{{.*}}kernelfunc
 void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 #endif

clang/test/CodeGenCUDA/kernel-args-alignment.cu

Lines changed: 10 additions & 6 deletions
@@ -1,8 +1,12 @@
-// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | \
-// RUN:   FileCheck -check-prefix HOST -check-prefix CHECK %s
+// New CUDA kernel launch sequence does not require explicit specification of
+// size/offset for each argument, so only the old way is tested.
+//
+// RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm \
+// RUN:    -target-sdk-version=8.0 -o - %s \
+// RUN:  | FileCheck -check-prefixes=HOST-OLD,CHECK %s

 // RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda \
-// RUN:   -emit-llvm -o - %s | FileCheck -check-prefix DEVICE -check-prefix CHECK %s
+// RUN:   -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,CHECK %s

 #include "Inputs/cuda.h"

@@ -27,9 +31,9 @@ static_assert(alignof(S) == 8, "Unexpected alignment.");
 // 1. offset 0, width 1
 // 2. offset 8 (because alignof(S) == 8), width 16
 // 3. offset 24, width 8
-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
-// HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
+// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 1, i64 0)
+// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 16, i64 8)
+// HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)

 // DEVICE-LABEL: @_Z6kernelc1SPi
 // DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32*
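
The offsets checked above follow the usual aligned struct-layout rule (each argument's offset is the previous end rounded up to its alignment). A small self-check of that arithmetic for kernel(char, S, int*), with an invented roundUp helper:

  #include <cstddef>

  // Round offset up to the next multiple of align (a power of two here).
  constexpr size_t roundUp(size_t offset, size_t align) {
    return (offset + align - 1) / align * align;
  }
  // sizeof(S) == 16 and alignof(S) == 8, per the static_assert in the test:
  static_assert(roundUp(0, 1) == 0, "char: offset 0, width 1");
  static_assert(roundUp(0 + 1, 8) == 8, "S: offset 8, width 16");
  static_assert(roundUp(8 + 16, 8) == 24, "int*: offset 24, width 8");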

clang/test/CodeGenCUDA/kernel-call.cu

Lines changed: 12 additions & 5 deletions
@@ -1,20 +1,27 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CUDA,CHECK
-// RUN: %clang_cc1 -x hip -emit-llvm %s -o - | FileCheck %s --check-prefixes=HIP,CHECK
+// RUN: %clang_cc1 -target-sdk-version=8.0 -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefixes=CUDA-OLD,CHECK
+// RUN: %clang_cc1 -target-sdk-version=9.2 -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefixes=CUDA-NEW,CHECK
+// RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
+// RUN:   | FileCheck %s --check-prefixes=HIP,CHECK


 #include "Inputs/cuda.h"

 // CHECK-LABEL: define{{.*}}g1
 // HIP: call{{.*}}hipSetupArgument
 // HIP: call{{.*}}hipLaunchByPtr
-// CUDA: call{{.*}}cudaSetupArgument
-// CUDA: call{{.*}}cudaLaunch
+// CUDA-OLD: call{{.*}}cudaSetupArgument
+// CUDA-OLD: call{{.*}}cudaLaunch
+// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
+// CUDA-NEW: call{{.*}}cudaLaunchKernel
 __global__ void g1(int x) {}

 // CHECK-LABEL: define{{.*}}main
 int main(void) {
   // HIP: call{{.*}}hipConfigureCall
-  // CUDA: call{{.*}}cudaConfigureCall
+  // CUDA-OLD: call{{.*}}cudaConfigureCall
+  // CUDA-NEW: call{{.*}}__cudaPushCallConfiguration
   // CHECK: icmp
   // CHECK: br
   // CHECK: call{{.*}}g1
