Skip to content

Commit f05b58a

Browse files
boxu-zhang authored and Artem-B committed
[clang] Support '-fgpu-default-stream=per-thread' for NVIDIA CUDA
I'm using clang to compile CUDA code and found that clang doesn't support the per-thread default stream option (`-fgpu-default-stream=per-thread`) for NVIDIA CUDA; the option previously took effect only when compiling HIP. I don't know if there is another solution.

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D154822
1 parent 61962aa commit f05b58a

File tree

4 files changed

+18
-3
lines changed

4 files changed

+18
-3
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -358,9 +358,13 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
358358
TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
359359
DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
360360
std::string KernelLaunchAPI = "LaunchKernel";
361-
if (CGF.getLangOpts().HIP && CGF.getLangOpts().GPUDefaultStream ==
362-
LangOptions::GPUDefaultStreamKind::PerThread)
363-
KernelLaunchAPI = KernelLaunchAPI + "_spt";
361+
if (CGF.getLangOpts().GPUDefaultStream ==
362+
LangOptions::GPUDefaultStreamKind::PerThread) {
363+
if (CGF.getLangOpts().HIP)
364+
KernelLaunchAPI = KernelLaunchAPI + "_spt";
365+
else if (CGF.getLangOpts().CUDA)
366+
KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
367+
}
364368
auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
365369
IdentifierInfo &cudaLaunchKernelII =
366370
CGM.getContext().Idents.get(LaunchKernelName);

clang/lib/Frontend/InitPreprocessor.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,9 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
574574
Builder.defineMacro("__CLANG_RDC__");
575575
if (!LangOpts.HIP)
576576
Builder.defineMacro("__CUDA__");
577+
if (LangOpts.GPUDefaultStream ==
578+
LangOptions::GPUDefaultStreamKind::PerThread)
579+
Builder.defineMacro("CUDA_API_PER_THREAD_DEFAULT_STREAM");
577580
}
578581
if (LangOpts.HIP) {
579582
Builder.defineMacro("__HIP__");

clang/test/CodeGenCUDA/Inputs/cuda.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
5858
extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
5959
dim3 blockDim, void **args,
6060
size_t sharedMem, cudaStream_t stream);
61+
extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim,
62+
dim3 blockDim, void **args,
63+
size_t sharedMem, cudaStream_t stream);
64+
6165
#endif
6266

6367
extern "C" __device__ int printf(const char*, ...);

clang/test/CodeGenCUDA/kernel-call.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
// RUN: | FileCheck %s --check-prefixes=CUDA-OLD,CHECK
33
// RUN: %clang_cc1 -target-sdk-version=9.2 -emit-llvm %s -o - \
44
// RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK
5+
// RUN: %clang_cc1 -target-sdk-version=9.2 -emit-llvm %s -o - \
6+
// RUN: -fgpu-default-stream=per-thread -DCUDA_API_PER_THREAD_DEFAULT_STREAM \
7+
// RUN: | FileCheck %s --check-prefixes=CUDA-PTH,CHECK
58
// RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
69
// RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK
710
// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
@@ -25,6 +28,7 @@
2528
// CUDA-OLD: call{{.*}}cudaLaunch
2629
// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
2730
// CUDA-NEW: call{{.*}}cudaLaunchKernel
31+
// CUDA-PTH: call{{.*}}cudaLaunchKernel_ptsz
2832
__global__ void g1(int x) {}
2933

3034
// CHECK-LABEL: define{{.*}}main

0 commit comments

Comments
 (0)