-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[CUDA] Add support for __grid_constant__ attribute #114589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-clang Author: Artem Belevich (Artem-B)

Changes: LLVM support for the attribute has been implemented already, so this just plumbs it through to the CUDA front-end. One notable difference from NVCC is that the attribute can be used regardless of the targeted GPU. On the older GPUs it will just be ignored. The attribute is a performance hint, and does not warrant a hard error if the compiler can't benefit from it on a particular GPU variant. Full diff: https://github.com/llvm/llvm-project/pull/114589.diff — 12 Files Affected:
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 402203f89e23a0..9466df98747e27 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -699,6 +699,7 @@ CUDA Support
^^^^^^^^^^^^
- Clang now supports CUDA SDK up to 12.6
- Added support for sm_100
+- Added support for `__grid_constant__` attribute.
AIX Support
^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 47c93b48175fc8..9925b46ab2505e 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1450,6 +1450,13 @@ def CUDAHost : InheritableAttr {
}
def : MutualExclusions<[CUDAGlobal, CUDAHost]>;
+def CUDAGridConstant : InheritableAttr {
+ let Spellings = [GNU<"grid_constant">, Declspec<"__grid_constant__">];
+ let Subjects = SubjectList<[ParmVar]>;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
def NVPTXKernel : InheritableAttr, TargetSpecificAttr<TargetNVPTX> {
let Spellings = [Clang<"nvptx_kernel">];
let Subjects = SubjectList<[Function]>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 34ff49d7238a7f..61ff4c4fb5d646 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9100,6 +9100,8 @@ def err_cuda_host_shared : Error<
"%select{__device__|__global__|__host__|__host__ __device__}0 functions">;
def err_cuda_nonstatic_constdev: Error<"__constant__, __device__, and "
"__managed__ are not allowed on non-static local variables">;
+def err_cuda_grid_constant_not_allowed : Error<
+ "__grid_constant__ is only allowed on const-qualified kernel parameters">;
def err_cuda_ovl_target : Error<
"%select{__device__|__global__|__host__|__host__ __device__}0 function %1 "
"cannot overload %select{__device__|__global__|__host__|__host__ __device__}2 function %3">;
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index ec7f1c439b1881..0431d2cc4ddc39 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -8,6 +8,7 @@
#include "ABIInfoImpl.h"
#include "TargetInfo.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
using namespace clang;
@@ -78,7 +79,13 @@ class NVPTXTargetCodeGenInfo : public TargetCodeGenInfo {
// Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the
// resulting MDNode to the nvvm.annotations MDNode.
static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
- int Operand);
+ int Operand,
+ const SmallVectorImpl<int> &GridConstantArgs);
+
+ static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
+ int Operand) {
+ addNVVMMetadata(GV, Name, Operand, SmallVector<int, 1>(0));
+ }
private:
static void emitBuiltinSurfTexDeviceCopy(CodeGenFunction &CGF, LValue Dst,
@@ -240,7 +247,8 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
}
const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
- if (!FD) return;
+ if (!FD)
+ return;
llvm::Function *F = cast<llvm::Function>(GV);
@@ -263,8 +271,13 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
// __global__ functions cannot be called from the device, we do not
// need to set the noinline attribute.
if (FD->hasAttr<CUDAGlobalAttr>()) {
+ SmallVector<int, 10> GCI;
+ for (auto IV : llvm::enumerate(FD->parameters()))
+ if (IV.value()->hasAttr<CUDAGridConstantAttr>())
+ // For some reason arg indices are 1-based in NVVM
+ GCI.push_back(IV.index() + 1);
// Create !{<func-ref>, metadata !"kernel", i32 1} node
- addNVVMMetadata(F, "kernel", 1);
+ addNVVMMetadata(F, "kernel", 1, GCI);
}
if (CUDALaunchBoundsAttr *Attr = FD->getAttr<CUDALaunchBoundsAttr>())
M.handleCUDALaunchBoundsAttr(F, Attr);
@@ -276,18 +289,27 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
}
}
-void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV,
- StringRef Name, int Operand) {
+void NVPTXTargetCodeGenInfo::addNVVMMetadata(
+ llvm::GlobalValue *GV, StringRef Name, int Operand,
+ const SmallVectorImpl<int> &GridConstantArgs) {
llvm::Module *M = GV->getParent();
llvm::LLVMContext &Ctx = M->getContext();
// Get "nvvm.annotations" metadata node
llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
- llvm::Metadata *MDVals[] = {
+ SmallVector<llvm::Metadata *, 5> MDVals = {
llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
llvm::ConstantAsMetadata::get(
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
+ if (!GridConstantArgs.empty()) {
+ SmallVector<llvm::Metadata *, 10> GCM;
+ for (int I : GridConstantArgs)
+ GCM.push_back(llvm::ConstantAsMetadata::get(
+ llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), I)));
+ MDVals.append({llvm::MDString::get(Ctx, "grid_constant"),
+ llvm::MDNode::get(Ctx, GCM)});
+ }
// Append metadata to nvvm.annotations
MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}
@@ -309,7 +331,7 @@ NVPTXTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM,
return llvm::ConstantExpr::getAddrSpaceCast(
llvm::ConstantPointerNull::get(NPT), PT);
}
-}
+} // namespace
void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
const CUDALaunchBoundsAttr *Attr,
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index f8e5f3c6d309d6..9de8cbc303016c 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -12222,8 +12222,17 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
<< NewFD;
}
- if (!Redeclaration && LangOpts.CUDA)
+ if (!Redeclaration && LangOpts.CUDA) {
+ bool IsKernel = NewFD->hasAttr<CUDAGlobalAttr>();
+ for (auto *Parm : NewFD->parameters()) {
+ if (!Parm->getType()->isDependentType() &&
+ Parm->hasAttr<CUDAGridConstantAttr>() &&
+ !(IsKernel && Parm->getType().isConstQualified()))
+ Diag(Parm->getAttr<CUDAGridConstantAttr>()->getLocation(),
+ diag::err_cuda_grid_constant_not_allowed);
+ }
CUDA().checkTargetOverload(NewFD, Previous);
+ }
}
// Check if the function definition uses any AArch64 SME features without
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 601c6f2eef1d9c..d8550bab3eddd2 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -4748,6 +4748,15 @@ static void handleManagedAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
D->addAttr(CUDADeviceAttr::CreateImplicit(S.Context));
}
+static void handleGridConstantAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ if (D->isInvalidDecl())
+ return;
+ // Whether __grid_constant__ is allowed to be used will be checked in
+ // Sema::CheckFunctionDeclaration as we need complete function decl to make
+ // the call.
+ D->addAttr(::new (S.Context) CUDAGridConstantAttr(S.Context, AL));
+}
+
static void handleGNUInlineAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
const auto *Fn = cast<FunctionDecl>(D);
if (!Fn->isInlineSpecified()) {
@@ -6642,6 +6651,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_CUDADevice:
handleDeviceAttr(S, D, AL);
break;
+ case ParsedAttr::AT_CUDAGridConstant:
+ handleGridConstantAttr(S, D, AL);
+ break;
case ParsedAttr::AT_HIPManaged:
handleManagedAttr(S, D, AL);
break;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 3e948232057afe..ec3c3ce6057264 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -876,6 +876,12 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
continue;
}
+ if (auto *A = dyn_cast<CUDAGridConstantAttr>(TmplAttr)) {
+ if (!New->hasAttr<CUDAGridConstantAttr>())
+ New->addAttr(A->clone(Context));
+ continue;
+ }
+
assert(!TmplAttr->isPackExpansion());
if (TmplAttr->isLateParsed() && LateAttrs) {
// Late parsed attributes must be instantiated and attached after the
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index dc760500e65d41..a8d85afb7cd21c 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -12,6 +12,7 @@
#define __managed__ __attribute__((managed))
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __grid_constant__ __attribute__((grid_constant))
#else
#define __constant__
#define __device__
@@ -20,6 +21,7 @@
#define __shared__
#define __managed__
#define __launch_bounds__(...)
+#define __grid_constant__
#endif
struct dim3 {
diff --git a/clang/test/CodeGenCUDA/grid-constant.cu b/clang/test/CodeGenCUDA/grid-constant.cu
new file mode 100644
index 00000000000000..8d4be9c9dc7e1e
--- /dev/null
+++ b/clang/test/CodeGenCUDA/grid-constant.cu
@@ -0,0 +1,31 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
+// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+struct S {};
+
+__global__ void kernel(__grid_constant__ const S gc_arg1, int arg2, __grid_constant__ const int gc_arg3) {}
+
+// dependent arguments get diagnosed after instantiation.
+template <typename T>
+__global__ void tkernel_const(__grid_constant__ const T arg) {}
+
+template <typename T>
+__global__ void tkernel(int dummy, __grid_constant__ T arg) {}
+
+void foo() {
+ tkernel_const<const S><<<1,1>>>({});
+ tkernel_const<S><<<1,1>>>({});
+ tkernel<const S><<<1,1>>>(1, {});
+}
+//.
+//.
+// CHECK: [[META0:![0-9]+]] = !{ptr @_Z6kernel1Sii, !"kernel", i32 1, !"grid_constant", [[META1:![0-9]+]]}
+// CHECK: [[META1]] = !{i32 1, i32 3}
+// CHECK: [[META2:![0-9]+]] = !{ptr @_Z13tkernel_constIK1SEvT_, !"kernel", i32 1, !"grid_constant", [[META3:![0-9]+]]}
+// CHECK: [[META3]] = !{i32 1}
+// CHECK: [[META4:![0-9]+]] = !{ptr @_Z13tkernel_constI1SEvT_, !"kernel", i32 1, !"grid_constant", [[META3]]}
+// CHECK: [[META5:![0-9]+]] = !{ptr @_Z7tkernelIK1SEviT_, !"kernel", i32 1, !"grid_constant", [[META6:![0-9]+]]}
+// CHECK: [[META6]] = !{i32 2}
+//.
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index e28b0775410c0a..b159a45c25a7f4 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -39,6 +39,7 @@
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
// CHECK-NEXT: CUDADeviceBuiltinTextureType (SubjectMatchRule_record)
// CHECK-NEXT: CUDAGlobal (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
index 405ef8bb807d90..10db947d8246ca 100644
--- a/clang/test/SemaCUDA/Inputs/cuda.h
+++ b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -11,6 +11,7 @@
#define __host__ __attribute__((host))
#define __shared__ __attribute__((shared))
#define __managed__ __attribute__((managed))
+#define __grid_constant__ __attribute__((grid_constant))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
struct dim3 {
diff --git a/clang/test/SemaCUDA/grid-constant.cu b/clang/test/SemaCUDA/grid-constant.cu
new file mode 100644
index 00000000000000..876e389355fd4b
--- /dev/null
+++ b/clang/test/SemaCUDA/grid-constant.cu
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s
+#include "Inputs/cuda.h"
+
+struct S {};
+
+__global__ void kernel_struct(__grid_constant__ const S arg) {}
+__global__ void kernel_scalar(__grid_constant__ const int arg) {}
+
+__global__ void gc_kernel_non_const(__grid_constant__ S arg) {} // expected-error {{__grid_constant__ is only allowed on const-qualified kernel parameters}}
+
+void non_kernel(__grid_constant__ S arg) {} // expected-error {{__grid_constant__ is only allowed on const-qualified kernel parameters}}
+
+// templates w/ non-dependent argument types get diagnosed right
+// away, without instantiation.
+template <typename T>
+__global__ void tkernel_nd_const(__grid_constant__ const S arg, T dummy) {}
+template <typename T>
+__global__ void tkernel_nd_non_const(__grid_constant__ S arg, T dummy) {} // expected-error {{__grid_constant__ is only allowed on const-qualified kernel parameters}}
+
+// dependent arguments get diagnosed after instantiation.
+template <typename T>
+__global__ void tkernel_const(__grid_constant__ const T arg) {}
+
+template <typename T>
+__global__ void tkernel(__grid_constant__ T arg) {} // expected-error {{__grid_constant__ is only allowed on const-qualified kernel parameters}}
+
+void foo() {
+ tkernel_const<const S><<<1,1>>>({});
+ tkernel_const<S><<<1,1>>>({});
+ tkernel<const S><<<1,1>>>({});
+ tkernel<S><<<1,1>>>({}); // expected-note {{in instantiation of function template specialization 'tkernel<S>' requested here}}
+}
The LLVM-facing parts of this LGTM.
Force-pushed from 25d7b38 to 0f13ebf (compare).
LLVM support for the attribute has been implemented already, so it just plumbs it through to the CUDA front-end.
One notable difference from NVCC is that the attribute can be used regardless of the targeted GPU. On the older GPUs it will just be ignored. The attribute is a performance hint, and does not warrant a hard error if compiler can't benefit from it on a particular GPU variant.