Skip to content

Commit 07ed818

Browse files
authored
[OpenMP] Replace nvvm.annotation usage with kernel calling conventions (#122320)
Specifying a kernel with the `ptx_kernel` or `amdgpu_kernel` calling convention is more idiomatic and compile-time performant than using the `nvvm.annotations !"kernel"` metadata. Transition OMPIRBuilder to use calling conventions for PTX kernels and no longer emit `nvvm.annotations`. Update OpenMPOpt to work with kernels specified via calling convention as well as metadata. Update OpenMP tests to use the calling conventions.
1 parent d92bac8 commit 07ed818

34 files changed

+606
-2105
lines changed

clang/test/OpenMP/assumes_include_nvptx.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111

1212
// TODO: Think about teaching the OMPIRBuilder about default attributes as well so the __kmpc* declarations are annotated.
1313

14-
// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}({{.*}}) [[attr0:#[0-9]]]
14+
// CHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}({{.*}}) [[attr0:#[0-9]]]
1515
// CHECK: call i32 @__kmpc_target_init(
1616
// CHECK: declare noundef float @_Z3sinf(float noundef) [[attr1:#[0-9]*]]
1717
// CHECK: declare void @__kmpc_target_deinit(
18-
// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}__Z17complex_reductionIdEvv_{{.*}}({{.*}}) [[attr0]]
18+
// CHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.*}}__Z17complex_reductionIdEvv_{{.*}}({{.*}}) [[attr0]]
1919
// CHECK: %call = call noundef double @_Z3sind(double noundef 0.000000e+00) [[attr2:#[0-9]]]
2020
// CHECK: declare noundef double @_Z3sind(double noundef) [[attr1]]
2121

clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ int foo(int n, double *ptr) {
9090
ptr[0]++;
9191
}
9292

93-
// TCHECK: define weak_odr protected void @__omp_offloading_{{.+}}(ptr {{[^,]+}}, ptr noundef [[PTR_IN:%.+]])
93+
// TCHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.+}}(ptr {{[^,]+}}, ptr noundef [[PTR_IN:%.+]])
9494
// TCHECK: [[DYN_PTR_ADDR:%.+]] = alloca ptr,
9595
// TCHECK: [[PTR_ADDR:%.+]] = alloca ptr,
9696
// TCHECK-NOT: alloca ptr,

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6468,6 +6468,8 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
64686468
OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
64696469
if (T.isAMDGCN())
64706470
OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
6471+
else if (T.isNVPTX())
6472+
OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
64716473
}
64726474
}
64736475

@@ -9223,20 +9225,8 @@ void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
92239225
if (!Fn)
92249226
return;
92259227

9226-
Module &M = *(Fn->getParent());
9227-
LLVMContext &Ctx = M.getContext();
9228-
9229-
// Get "nvvm.annotations" metadata node.
9230-
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
9231-
9232-
Metadata *MDVals[] = {
9233-
ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
9234-
ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
9235-
// Append metadata to nvvm.annotations.
9236-
MD->addOperand(MDNode::get(Ctx, MDVals));
9237-
92389228
// Add a function attribute for the kernel.
9239-
Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
9229+
Fn->addFnAttr("kernel");
92409230
if (T.isAMDGCN())
92419231
Fn->addFnAttr("uniform-work-group-size", "true");
92429232
Fn->addFnAttr(Attribute::MustProgress);

llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#include "llvm/Transforms/IPO/OpenMPOpt.h"
2121

22+
#include "llvm/ADT/DenseSet.h"
2223
#include "llvm/ADT/EnumeratedArray.h"
2324
#include "llvm/ADT/PostOrderIterator.h"
2425
#include "llvm/ADT/SetVector.h"
@@ -36,6 +37,7 @@
3637
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
3738
#include "llvm/IR/Assumptions.h"
3839
#include "llvm/IR/BasicBlock.h"
40+
#include "llvm/IR/CallingConv.h"
3941
#include "llvm/IR/Constants.h"
4042
#include "llvm/IR/DiagnosticInfo.h"
4143
#include "llvm/IR/Dominators.h"
@@ -5903,34 +5905,51 @@ bool llvm::omp::isOpenMPKernel(Function &Fn) {
59035905
return Fn.hasFnAttribute("kernel");
59045906
}
59055907

5908+
static bool isKernelCC(Function &F) {
5909+
switch (F.getCallingConv()) {
5910+
default:
5911+
return false;
5912+
case CallingConv::PTX_Kernel:
5913+
case CallingConv::AMDGPU_KERNEL:
5914+
case CallingConv::SPIR_KERNEL:
5915+
return true;
5916+
}
5917+
}
5918+
59065919
KernelSet llvm::omp::getDeviceKernels(Module &M) {
59075920
// TODO: Create a more cross-platform way of determining device kernels.
5908-
NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
59095921
KernelSet Kernels;
59105922

5911-
if (!MD)
5912-
return Kernels;
5923+
DenseSet<const Function *> SeenKernels;
5924+
auto ProcessKernel = [&](Function &KF) {
5925+
if (SeenKernels.insert(&KF).second) {
5926+
// We are only interested in OpenMP target regions. Others, such as
5927+
// kernels generated by CUDA but linked together, are not interesting to
5928+
// this pass.
5929+
if (isOpenMPKernel(KF)) {
5930+
++NumOpenMPTargetRegionKernels;
5931+
Kernels.insert(&KF);
5932+
} else
5933+
++NumNonOpenMPTargetRegionKernels;
5934+
}
5935+
};
59135936

5914-
for (auto *Op : MD->operands()) {
5915-
if (Op->getNumOperands() < 2)
5916-
continue;
5917-
MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
5918-
if (!KindID || KindID->getString() != "kernel")
5919-
continue;
5937+
if (NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"))
5938+
for (auto *Op : MD->operands()) {
5939+
if (Op->getNumOperands() < 2)
5940+
continue;
5941+
MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
5942+
if (!KindID || KindID->getString() != "kernel")
5943+
continue;
59205944

5921-
Function *KernelFn =
5922-
mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
5923-
if (!KernelFn)
5924-
continue;
5945+
if (auto *KernelFn =
5946+
mdconst::dyn_extract_or_null<Function>(Op->getOperand(0)))
5947+
ProcessKernel(*KernelFn);
5948+
}
59255949

5926-
// We are only interested in OpenMP target regions. Others, such as kernels
5927-
// generated by CUDA but linked together, are not interesting to this pass.
5928-
if (isOpenMPKernel(*KernelFn)) {
5929-
++NumOpenMPTargetRegionKernels;
5930-
Kernels.insert(KernelFn);
5931-
} else
5932-
++NumNonOpenMPTargetRegionKernels;
5933-
}
5950+
for (Function &F : M)
5951+
if (isKernelCC(F))
5952+
ProcessKernel(F);
59345953

59355954
return Kernels;
59365955
}

llvm/test/Transforms/OpenMP/always_inline_device.ll

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
; CHECK: @G = external global i8
1818
; CHECK: @kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
1919
;.
20-
define weak void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 {
20+
define weak ptx_kernel void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 {
2121
; CHECK: Function Attrs: norecurse nounwind
2222
; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4(
2323
; CHECK-NEXT: entry:
@@ -79,12 +79,10 @@ attributes #1 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-wi
7979
attributes #2 = { convergent }
8080

8181
!omp_offload.info = !{!0}
82-
!nvvm.annotations = !{!1}
8382
!llvm.module.flags = !{!2, !3, !4, !5, !6}
8483
!llvm.ident = !{!7}
8584

8685
!0 = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0}
87-
!1 = !{ptr @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1}
8886
!2 = !{i32 1, !"wchar_size", i32 4}
8987
!3 = !{i32 7, !"openmp", i32 50}
9088
!4 = !{i32 7, !"openmp-device", i32 50}
@@ -97,11 +95,10 @@ attributes #2 = { convergent }
9795
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind }
9896
;.
9997
; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0}
100-
; CHECK: [[META1:![0-9]+]] = !{ptr @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1}
101-
; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
102-
; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50}
103-
; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
104-
; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
105-
; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
106-
; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
98+
; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
99+
; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50}
100+
; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
101+
; CHECK: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
102+
; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
103+
; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
107104
;.

llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,6 @@ define linkonce_odr hidden i8 @_ZStplIdESt7complexIT_ERKS2_S4_() local_unnamed_a
1313
ret i8 undef
1414
}
1515

16-
declare void @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148(i64, i64, i64, ptr, ptr, i64, ptr, ptr, ptr, i64) local_unnamed_addr
16+
declare ptx_kernel void @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148(i64, i64, i64, ptr, ptr, i64, ptr, ptr, ptr, i64) local_unnamed_addr
1717

1818
declare dso_local fastcc void @__kmpc_for_static_init_8u() unnamed_addr
19-
20-
!nvvm.annotations = !{!0}
21-
22-
!0 = !{ptr @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148, !"kernel", i32 1}

0 commit comments

Comments
 (0)