Skip to content

Commit 4583f6d

Browse files
authored
[NVPTX] Switch front-ends and tests to ptx_kernel cc (#120806)
the `ptx_kernel` calling convention is a more idiomatic and standard way of specifying a NVPTX kernel than using the metadata which is not supposed to change the meaning of the program. Further, checking the calling convention is significantly faster than traversing the metadata, improving compile time. This change updates the clang and mlir frontends as well as the NVPTXCtorDtorLowering pass to emit kernels using the calling convention. In addition, this updates all NVPTX unit tests to use the calling convention as well.
1 parent db408ac commit 4583f6d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+305
-477
lines changed

clang/lib/CodeGen/Targets/NVPTX.cpp

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "ABIInfoImpl.h"
1010
#include "TargetInfo.h"
1111
#include "llvm/ADT/STLExtras.h"
12+
#include "llvm/IR/CallingConv.h"
1213
#include "llvm/IR/IntrinsicsNVPTX.h"
1314

1415
using namespace clang;
@@ -79,13 +80,11 @@ class NVPTXTargetCodeGenInfo : public TargetCodeGenInfo {
7980
// Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the
8081
// resulting MDNode to the nvvm.annotations MDNode.
8182
static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
82-
int Operand,
83-
const SmallVectorImpl<int> &GridConstantArgs);
83+
int Operand);
8484

85-
static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
86-
int Operand) {
87-
addNVVMMetadata(GV, Name, Operand, SmallVector<int, 1>(0));
88-
}
85+
static void
86+
addGridConstantNVVMMetadata(llvm::GlobalValue *GV,
87+
const SmallVectorImpl<int> &GridConstantArgs);
8988

9089
private:
9190
static void emitBuiltinSurfTexDeviceCopy(CodeGenFunction &CGF, LValue Dst,
@@ -259,7 +258,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
259258
if (FD->hasAttr<OpenCLKernelAttr>()) {
260259
// OpenCL __kernel functions get kernel metadata
261260
// Create !{<func-ref>, metadata !"kernel", i32 1} node
262-
addNVVMMetadata(F, "kernel", 1);
261+
F->setCallingConv(llvm::CallingConv::PTX_Kernel);
263262
// And kernel functions are not subject to inlining
264263
F->addFnAttr(llvm::Attribute::NoInline);
265264
}
@@ -277,21 +276,21 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
277276
// For some reason arg indices are 1-based in NVVM
278277
GCI.push_back(IV.index() + 1);
279278
// Create !{<func-ref>, metadata !"kernel", i32 1} node
280-
addNVVMMetadata(F, "kernel", 1, GCI);
279+
F->setCallingConv(llvm::CallingConv::PTX_Kernel);
280+
addGridConstantNVVMMetadata(F, GCI);
281281
}
282282
if (CUDALaunchBoundsAttr *Attr = FD->getAttr<CUDALaunchBoundsAttr>())
283283
M.handleCUDALaunchBoundsAttr(F, Attr);
284284
}
285285

286286
// Attach kernel metadata directly if compiling for NVPTX.
287287
if (FD->hasAttr<NVPTXKernelAttr>()) {
288-
addNVVMMetadata(F, "kernel", 1);
288+
F->setCallingConv(llvm::CallingConv::PTX_Kernel);
289289
}
290290
}
291291

292-
void NVPTXTargetCodeGenInfo::addNVVMMetadata(
293-
llvm::GlobalValue *GV, StringRef Name, int Operand,
294-
const SmallVectorImpl<int> &GridConstantArgs) {
292+
void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV,
293+
StringRef Name, int Operand) {
295294
llvm::Module *M = GV->getParent();
296295
llvm::LLVMContext &Ctx = M->getContext();
297296

@@ -302,6 +301,21 @@ void NVPTXTargetCodeGenInfo::addNVVMMetadata(
302301
llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
303302
llvm::ConstantAsMetadata::get(
304303
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
304+
305+
// Append metadata to nvvm.annotations
306+
MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
307+
}
308+
309+
void NVPTXTargetCodeGenInfo::addGridConstantNVVMMetadata(
310+
llvm::GlobalValue *GV, const SmallVectorImpl<int> &GridConstantArgs) {
311+
312+
llvm::Module *M = GV->getParent();
313+
llvm::LLVMContext &Ctx = M->getContext();
314+
315+
// Get "nvvm.annotations" metadata node
316+
llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
317+
318+
SmallVector<llvm::Metadata *, 5> MDVals = {llvm::ConstantAsMetadata::get(GV)};
305319
if (!GridConstantArgs.empty()) {
306320
SmallVector<llvm::Metadata *, 10> GCM;
307321
for (int I : GridConstantArgs)
@@ -310,6 +324,7 @@ void NVPTXTargetCodeGenInfo::addNVVMMetadata(
310324
MDVals.append({llvm::MDString::get(Ctx, "grid_constant"),
311325
llvm::MDNode::get(Ctx, GCM)});
312326
}
327+
313328
// Append metadata to nvvm.annotations
314329
MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
315330
}

clang/test/CodeGen/nvptx_attributes.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,14 @@
1010
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RET_ADDR]], align 8
1111
// CHECK-NEXT: store i32 1, ptr [[TMP0]], align 4
1212
// CHECK-NEXT: ret void
13+
//
1314
__attribute__((nvptx_kernel)) void foo(int *ret) {
1415
*ret = 1;
1516
}
1617

17-
// CHECK: !0 = !{ptr @foo, !"kernel", i32 1}
18+
//.
19+
// CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx32,+sm_61" }
20+
//.
21+
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
22+
// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
23+
//.

clang/test/CodeGenCUDA/device-fun-linkage.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ template __device__ void func<int>();
1717
// RDC: define weak_odr void @_Z4funcIiEvv()
1818

1919
template __global__ void kernel<int>();
20-
// NORDC: define void @_Z6kernelIiEvv()
21-
// RDC: define weak_odr void @_Z6kernelIiEvv()
20+
// NORDC: define ptx_kernel void @_Z6kernelIiEvv()
21+
// RDC: define weak_odr ptx_kernel void @_Z6kernelIiEvv()
2222

2323
// Ensure that unused static device function is eliminated
2424
static __device__ void static_func() {}
@@ -28,5 +28,5 @@ static __device__ void static_func() {}
2828
// Ensure that kernel function has external or weak_odr
2929
// linkage regardless static specifier
3030
static __global__ void static_kernel() {}
31-
// NORDC: define void @_ZL13static_kernelv()
32-
// RDC: define weak_odr void @_ZL13static_kernelv[[FILEID:.*]]()
31+
// NORDC: define ptx_kernel void @_ZL13static_kernelv()
32+
// RDC: define weak_odr ptx_kernel void @_ZL13static_kernelv[[FILEID:.*]]()

clang/test/CodeGenCUDA/grid-constant.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ void foo() {
2121
}
2222
//.
2323
//.
24-
// CHECK: [[META0:![0-9]+]] = !{ptr @_Z6kernel1Sii, !"kernel", i32 1, !"grid_constant", [[META1:![0-9]+]]}
24+
// CHECK: [[META0:![0-9]+]] = !{ptr @_Z6kernel1Sii, !"grid_constant", [[META1:![0-9]+]]}
2525
// CHECK: [[META1]] = !{i32 1, i32 3}
26-
// CHECK: [[META2:![0-9]+]] = !{ptr @_Z13tkernel_constIK1SEvT_, !"kernel", i32 1, !"grid_constant", [[META3:![0-9]+]]}
26+
// CHECK: [[META2:![0-9]+]] = !{ptr @_Z13tkernel_constIK1SEvT_, !"grid_constant", [[META3:![0-9]+]]}
2727
// CHECK: [[META3]] = !{i32 1}
28-
// CHECK: [[META4:![0-9]+]] = !{ptr @_Z13tkernel_constI1SEvT_, !"kernel", i32 1, !"grid_constant", [[META3]]}
29-
// CHECK: [[META5:![0-9]+]] = !{ptr @_Z7tkernelIK1SEviT_, !"kernel", i32 1, !"grid_constant", [[META6:![0-9]+]]}
28+
// CHECK: [[META4:![0-9]+]] = !{ptr @_Z13tkernel_constI1SEvT_, !"grid_constant", [[META3]]}
29+
// CHECK: [[META5:![0-9]+]] = !{ptr @_Z7tkernelIK1SEviT_, !"grid_constant", [[META6:![0-9]+]]}
3030
// CHECK: [[META6]] = !{i32 2}
3131
//.

clang/test/CodeGenCUDA/offload_via_llvm.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#define __OFFLOAD_VIA_LLVM__ 1
88
#include "Inputs/cuda.h"
99

10-
// HST-LABEL: define dso_local void @_Z18__device_stub__fooisPvS_(
10+
// HST-LABEL: define dso_local ptx_kernel void @_Z18__device_stub__fooisPvS_(
1111
// HST-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
1212
// HST-NEXT: [[ENTRY:.*:]]
1313
// HST-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
@@ -50,7 +50,7 @@
5050
// HST: [[SETUP_END]]:
5151
// HST-NEXT: ret void
5252
//
53-
// DEV-LABEL: define dso_local void @_Z3fooisPvS_(
53+
// DEV-LABEL: define dso_local ptx_kernel void @_Z3fooisPvS_(
5454
// DEV-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
5555
// DEV-NEXT: [[ENTRY:.*:]]
5656
// DEV-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4

clang/test/CodeGenCUDA/ptx-kernels.cu

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
extern "C"
1111
__device__ void device_function() {}
1212

13-
// CHECK-LABEL: define{{.*}} void @global_function
13+
// CHECK-LABEL: define{{.*}} ptx_kernel void @global_function
1414
extern "C"
1515
__global__ void global_function() {
1616
// CHECK: call void @device_function
@@ -19,7 +19,7 @@ __global__ void global_function() {
1919

2020
// Make sure host-instantiated kernels are preserved on device side.
2121
template <typename T> __global__ void templated_kernel(T param) {}
22-
// CHECK-DAG: define{{.*}} void @_Z16templated_kernelIiEvT_(
22+
// CHECK-DAG: define{{.*}} ptx_kernel void @_Z16templated_kernelIiEvT_(
2323

2424
namespace {
2525
__global__ void anonymous_ns_kernel() {}
@@ -30,6 +30,3 @@ void host_function() {
3030
templated_kernel<<<0, 0>>>(0);
3131
anonymous_ns_kernel<<<0,0>>>();
3232
}
33-
34-
// CHECK: !{{[0-9]+}} = !{ptr @global_function, !"kernel", i32 1}
35-
// CHECK: !{{[0-9]+}} = !{ptr @_Z16templated_kernelIiEvT_, !"kernel", i32 1}

clang/test/CodeGenCUDA/usual-deallocators.cu

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ __host__ __device__ void tests_hd(void *t) {
109109
}
110110

111111
// Make sure that we've generated the kernel used by A::~A.
112-
// DEVICE-LABEL: define void @_Z1fIiEvT_
112+
// DEVICE-LABEL: define ptx_kernel void @_Z1fIiEvT_
113113

114114
// Make sure we've picked deallocator for the correct side of compilation.
115115

@@ -147,5 +147,3 @@ __host__ __device__ void tests_hd(void *t) {
147147
// COMMON-LABEL: define linkonce_odr void @_ZN8H1H2D1D2dlEPv(ptr noundef %0)
148148
// DEVICE: call void @dev_fn()
149149
// HOST: call void @host_fn()
150-
151-
// DEVICE: !0 = !{ptr @_Z1fIiEvT_, !"kernel", i32 1}

clang/test/CodeGenOpenCL/ptx-calls.cl

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,5 @@ void device_function() {
77
__kernel void kernel_function() {
88
device_function();
99
}
10-
// CHECK-LABEL: define{{.*}} spir_kernel void @kernel_function()
10+
// CHECK-LABEL: define{{.*}} ptx_kernel void @kernel_function()
1111
// CHECK: call void @device_function()
12-
// CHECK: !{{[0-9]+}} = !{ptr @kernel_function, !"kernel", i32 1}
13-

clang/test/CodeGenOpenCL/ptx-kernels.cl

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,4 @@ void device_function() {
66

77
__kernel void kernel_function() {
88
}
9-
// CHECK-LABEL: define{{.*}} spir_kernel void @kernel_function()
10-
11-
// CHECK: !{{[0-9]+}} = !{ptr @kernel_function, !"kernel", i32 1}
9+
// CHECK-LABEL: define{{.*}} ptx_kernel void @kernel_function()

clang/test/CodeGenOpenCL/reflect.cl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ bool device_function() {
1212
return __nvvm_reflect("__CUDA_ARCH") >= 700;
1313
}
1414

15-
// CHECK-LABEL: define dso_local spir_kernel void @kernel_function(
16-
// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
15+
// CHECK-LABEL: define dso_local ptx_kernel void @kernel_function(
16+
// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] {
1717
// CHECK-NEXT: entry:
1818
// CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4
1919
// CHECK-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR]], align 4
@@ -26,3 +26,9 @@ bool device_function() {
2626
__kernel void kernel_function(__global int *i) {
2727
*i = device_function();
2828
}
29+
//.
30+
// CHECK: [[META3]] = !{i32 1}
31+
// CHECK: [[META4]] = !{!"none"}
32+
// CHECK: [[META5]] = !{!"int*"}
33+
// CHECK: [[META6]] = !{!""}
34+
//.

clang/test/Headers/gpuintrin.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
// AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR8:[0-9]+]]
4545
// AMDGPU-NEXT: unreachable
4646
//
47-
// NVPTX-LABEL: define protected void @foo(
47+
// NVPTX-LABEL: define protected ptx_kernel void @foo(
4848
// NVPTX-SAME: ) #[[ATTR0:[0-9]+]] {
4949
// NVPTX-NEXT: [[ENTRY:.*:]]
5050
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR6:[0-9]+]]

llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "MCTargetDesc/NVPTXBaseInfo.h"
1515
#include "NVPTX.h"
1616
#include "llvm/ADT/StringExtras.h"
17+
#include "llvm/IR/CallingConv.h"
1718
#include "llvm/IR/Constants.h"
1819
#include "llvm/IR/Function.h"
1920
#include "llvm/IR/GlobalVariable.h"
@@ -49,39 +50,34 @@ static std::string getHash(StringRef Str) {
4950
return llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
5051
}
5152

52-
static void addKernelMetadata(Module &M, GlobalValue *GV) {
53+
static void addKernelMetadata(Module &M, Function *F) {
5354
llvm::LLVMContext &Ctx = M.getContext();
5455

5556
// Get "nvvm.annotations" metadata node.
5657
llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
5758

58-
llvm::Metadata *KernelMDVals[] = {
59-
llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "kernel"),
60-
llvm::ConstantAsMetadata::get(
61-
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
62-
6359
// This kernel is only to be called single-threaded.
6460
llvm::Metadata *ThreadXMDVals[] = {
65-
llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "maxntidx"),
61+
llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidx"),
6662
llvm::ConstantAsMetadata::get(
6763
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
6864
llvm::Metadata *ThreadYMDVals[] = {
69-
llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "maxntidy"),
65+
llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidy"),
7066
llvm::ConstantAsMetadata::get(
7167
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
7268
llvm::Metadata *ThreadZMDVals[] = {
73-
llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "maxntidz"),
69+
llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidz"),
7470
llvm::ConstantAsMetadata::get(
7571
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
7672

7773
llvm::Metadata *BlockMDVals[] = {
78-
llvm::ConstantAsMetadata::get(GV),
74+
llvm::ConstantAsMetadata::get(F),
7975
llvm::MDString::get(Ctx, "maxclusterrank"),
8076
llvm::ConstantAsMetadata::get(
8177
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
8278

8379
// Append metadata to nvvm.annotations.
84-
MD->addOperand(llvm::MDNode::get(Ctx, KernelMDVals));
80+
F->setCallingConv(CallingConv::PTX_Kernel);
8581
MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals));
8682
MD->addOperand(llvm::MDNode::get(Ctx, ThreadYMDVals));
8783
MD->addOperand(llvm::MDNode::get(Ctx, ThreadZMDVals));

llvm/lib/Target/NVPTX/NVPTXUtilities.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,11 +311,13 @@ std::optional<unsigned> getMaxNReg(const Function &F) {
311311
}
312312

313313
bool isKernelFunction(const Function &F) {
314+
if (F.getCallingConv() == CallingConv::PTX_Kernel)
315+
return true;
316+
314317
if (const auto X = findOneNVVMAnnotation(&F, "kernel"))
315318
return (*X == 1);
316319

317-
// There is no NVVM metadata, check the calling convention
318-
return F.getCallingConv() == CallingConv::PTX_Kernel;
320+
return false;
319321
}
320322

321323
MaybeAlign getAlign(const Function &F, unsigned Index) {

llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
44
target triple = "nvptx64-nvidia-cuda"
55

6-
define i32 @daorder(i32 %n) {
6+
define ptx_kernel i32 @daorder(i32 %n) {
77
; CHECK-LABEL: for function 'daorder'
88
entry:
99
%tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -43,6 +43,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
4343
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
4444
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
4545
declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
46-
47-
!nvvm.annotations = !{!0}
48-
!0 = !{ptr @daorder, !"kernel", i32 1}

llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
44
target triple = "nvptx64-nvidia-cuda"
55

66
; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
7-
define i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
7+
define ptx_kernel i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
88
; CHECK-LABEL: for function 'no_diverge'
99
entry:
1010
%tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -27,7 +27,7 @@ merge:
2727
; if (threadIdx.x < 5) // divergent: data dependent
2828
; c = b;
2929
; return c; // c is divergent: sync dependent
30-
define i32 @sync(i32 %a, i32 %b) {
30+
define ptx_kernel i32 @sync(i32 %a, i32 %b) {
3131
; CHECK-LABEL: for function 'sync'
3232
bb1:
3333
%tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
@@ -49,7 +49,7 @@ bb3:
4949
; }
5050
; // c here is divergent because it is sync dependent on threadIdx.x >= 5
5151
; return c;
52-
define i32 @mixed(i32 %n, i32 %a, i32 %b) {
52+
define ptx_kernel i32 @mixed(i32 %n, i32 %a, i32 %b) {
5353
; CHECK-LABEL: for function 'mixed'
5454
bb1:
5555
%tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
@@ -101,7 +101,7 @@ merge:
101101
; return i == 10 ? 0 : 1; // i here is divergent
102102
;
103103
; The i defined in the loop is used outside.
104-
define i32 @loop() {
104+
define ptx_kernel i32 @loop() {
105105
; CHECK-LABEL: for function 'loop'
106106
entry:
107107
%laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
@@ -149,7 +149,7 @@ else:
149149
}
150150

151151
; Verifies sync-dependence is computed correctly in the absense of loops.
152-
define i32 @sync_no_loop(i32 %arg) {
152+
define ptx_kernel i32 @sync_no_loop(i32 %arg) {
153153
; CHECK-LABEL: for function 'sync_no_loop'
154154
entry:
155155
%0 = add i32 %arg, 1
@@ -174,9 +174,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
174174
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
175175
declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
176176

177-
!nvvm.annotations = !{!0, !1, !2, !3, !4}
178-
!0 = !{ptr @no_diverge, !"kernel", i32 1}
179-
!1 = !{ptr @sync, !"kernel", i32 1}
180-
!2 = !{ptr @mixed, !"kernel", i32 1}
181-
!3 = !{ptr @loop, !"kernel", i32 1}
182-
!4 = !{ptr @sync_no_loop, !"kernel", i32 1}

0 commit comments

Comments
 (0)