[CUDA] Add support for __grid_constant__ attribute (llvm#114589)

Artem-B · web-flow · commit 7c3fdcc27603 · 2024-11-05T10:48:54.000-08:00
LLVM support for the attribute has been implemented already, so it just
plumbs it through to the CUDA front-end.

One notable difference from NVCC is that the attribute can be used
regardless of the targeted GPU. On the older GPUs it will just be
ignored. The attribute is a performance hint, and does not warrant a
hard error if compiler can't benefit from it on a particular GPU
variant.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
@@ -716,6 +716,7 @@ CUDA Support
 ^^^^^^^^^^^^
 - Clang now supports CUDA SDK up to 12.6
 - Added support for sm_100
+- Added support for `__grid_constant__` attribute.
 
 AIX Support
 ^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
@@ -1451,6 +1451,13 @@ def CUDAHost : InheritableAttr {
 }
 def : MutualExclusions<[CUDAGlobal, CUDAHost]>;
 
+def CUDAGridConstant : InheritableAttr {
+  let Spellings = [GNU<"grid_constant">, Declspec<"__grid_constant__">];
+  let Subjects = SubjectList<[ParmVar]>;
+  let LangOpts = [CUDA];
+  let Documentation = [CUDAGridConstantAttrDocs];
+}
+
 def NVPTXKernel : InheritableAttr, TargetSpecificAttr<TargetNVPTX> {
   let Spellings = [Clang<"nvptx_kernel">];
   let Subjects = SubjectList<[Function]>;
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
@@ -6794,6 +6794,16 @@ unbind runtime APIs.
   }];
 }
 
+def CUDAGridConstantAttrDocs : Documentation {
+  let Category = DocCatDecl;
+  let Content = [{
+The ``__grid_constant__`` attribute can be applied to a ``const``-qualified kernel
+function argument and allows compiler to take the address of that argument without
+making a copy. The argument applies to sm_70 or newer GPUs, during compilation
+with CUDA-11.7(PTX 7.7) or newer, and is ignored otherwise.
+  }];
+}
+
 def HIPManagedAttrDocs : Documentation {
   let Category = DocCatDecl;
   let Content = [{
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9100,6 +9100,8 @@ def err_cuda_host_shared : Error<
     "%select{__device__|__global__|__host__|__host__ __device__}0 functions">;
 def err_cuda_nonstatic_constdev: Error<"__constant__, __device__, and "
     "__managed__ are not allowed on non-static local variables">;
+def err_cuda_grid_constant_not_allowed : Error<
+  "__grid_constant__ is only allowed on const-qualified kernel parameters">;
 def err_cuda_ovl_target : Error<
   "%select{__device__|__global__|__host__|__host__ __device__}0 function %1 "
   "cannot overload %select{__device__|__global__|__host__|__host__ __device__}2 function %3">;
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -8,6 +8,7 @@
 
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 
 using namespace clang;
@@ -78,7 +79,13 @@ class NVPTXTargetCodeGenInfo : public TargetCodeGenInfo {
   // Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the
   // resulting MDNode to the nvvm.annotations MDNode.
   static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
-                              int Operand);
+                              int Operand,
+                              const SmallVectorImpl<int> &GridConstantArgs);
+
+  static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
+                              int Operand) {
+    addNVVMMetadata(GV, Name, Operand, SmallVector<int, 1>(0));
+  }
 
 private:
   static void emitBuiltinSurfTexDeviceCopy(CodeGenFunction &CGF, LValue Dst,
@@ -240,7 +247,8 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
   }
 
   const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
-  if (!FD) return;
+  if (!FD)
+    return;
 
   llvm::Function *F = cast<llvm::Function>(GV);
 
@@ -263,8 +271,13 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
     // __global__ functions cannot be called from the device, we do not
     // need to set the noinline attribute.
     if (FD->hasAttr<CUDAGlobalAttr>()) {
+      SmallVector<int, 10> GCI;
+      for (auto IV : llvm::enumerate(FD->parameters()))
+        if (IV.value()->hasAttr<CUDAGridConstantAttr>())
+          // For some reason arg indices are 1-based in NVVM
+          GCI.push_back(IV.index() + 1);
       // Create !{<func-ref>, metadata !"kernel", i32 1} node
-      addNVVMMetadata(F, "kernel", 1);
+      addNVVMMetadata(F, "kernel", 1, GCI);
     }
     if (CUDALaunchBoundsAttr *Attr = FD->getAttr<CUDALaunchBoundsAttr>())
       M.handleCUDALaunchBoundsAttr(F, Attr);
@@ -276,18 +289,27 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
   }
 }
 
-void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV,
-                                             StringRef Name, int Operand) {
+void NVPTXTargetCodeGenInfo::addNVVMMetadata(
+    llvm::GlobalValue *GV, StringRef Name, int Operand,
+    const SmallVectorImpl<int> &GridConstantArgs) {
   llvm::Module *M = GV->getParent();
   llvm::LLVMContext &Ctx = M->getContext();
 
   // Get "nvvm.annotations" metadata node
   llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
 
-  llvm::Metadata *MDVals[] = {
+  SmallVector<llvm::Metadata *, 5> MDVals = {
       llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
+  if (!GridConstantArgs.empty()) {
+    SmallVector<llvm::Metadata *, 10> GCM;
+    for (int I : GridConstantArgs)
+      GCM.push_back(llvm::ConstantAsMetadata::get(
+          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), I)));
+    MDVals.append({llvm::MDString::get(Ctx, "grid_constant"),
+                   llvm::MDNode::get(Ctx, GCM)});
+  }
   // Append metadata to nvvm.annotations
   MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
 }
@@ -309,7 +331,7 @@ NVPTXTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM,
   return llvm::ConstantExpr::getAddrSpaceCast(
       llvm::ConstantPointerNull::get(NPT), PT);
 }
-}
+} // namespace
 
 void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
                                                const CUDALaunchBoundsAttr *Attr,
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
@@ -12225,8 +12225,17 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
             << NewFD;
     }
 
-    if (!Redeclaration && LangOpts.CUDA)
+    if (!Redeclaration && LangOpts.CUDA) {
+      bool IsKernel = NewFD->hasAttr<CUDAGlobalAttr>();
+      for (auto *Parm : NewFD->parameters()) {
+        if (!Parm->getType()->isDependentType() &&
+            Parm->hasAttr<CUDAGridConstantAttr>() &&
+            !(IsKernel && Parm->getType().isConstQualified()))
+          Diag(Parm->getAttr<CUDAGridConstantAttr>()->getLocation(),
+               diag::err_cuda_grid_constant_not_allowed);
+      }
       CUDA().checkTargetOverload(NewFD, Previous);
+    }
   }
 
   // Check if the function definition uses any AArch64 SME features without
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -4748,6 +4748,15 @@ static void handleManagedAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
     D->addAttr(CUDADeviceAttr::CreateImplicit(S.Context));
 }
 
+static void handleGridConstantAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+  if (D->isInvalidDecl())
+    return;
+  // Whether __grid_constant__ is allowed to be used will be checked in
+  // Sema::CheckFunctionDeclaration as we need complete function decl to make
+  // the call.
+  D->addAttr(::new (S.Context) CUDAGridConstantAttr(S.Context, AL));
+}
+
 static void handleGNUInlineAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   const auto *Fn = cast<FunctionDecl>(D);
   if (!Fn->isInlineSpecified()) {
@@ -6645,6 +6654,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_CUDADevice:
     handleDeviceAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_CUDAGridConstant:
+    handleGridConstantAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_HIPManaged:
     handleManagedAttr(S, D, AL);
     break;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -876,6 +876,12 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
       continue;
     }
 
+    if (auto *A = dyn_cast<CUDAGridConstantAttr>(TmplAttr)) {
+      if (!New->hasAttr<CUDAGridConstantAttr>())
+        New->addAttr(A->clone(Context));
+      continue;
+    }
+
     assert(!TmplAttr->isPackExpansion());
     if (TmplAttr->isLateParsed() && LateAttrs) {
       // Late parsed attributes must be instantiated and attached after the
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -12,6 +12,7 @@
 #define __managed__ __attribute__((managed))
 #endif
 #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __grid_constant__ __attribute__((grid_constant))
 #else
 #define __constant__
 #define __device__
@@ -20,6 +21,7 @@
 #define __shared__
 #define __managed__
 #define __launch_bounds__(...)
+#define __grid_constant__
 #endif
 
 struct dim3 {
diff --git a/clang/test/CodeGenCUDA/grid-constant.cu b/clang/test/CodeGenCUDA/grid-constant.cu
@@ -0,0 +1,31 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
+// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+struct S {};
+
+__global__ void kernel(__grid_constant__ const S gc_arg1, int arg2, __grid_constant__ const int gc_arg3) {}
+
+// dependent arguments get diagnosed after instantiation.
+template <typename T>
+__global__ void tkernel_const(__grid_constant__ const T arg) {}
+
+template <typename T>
+__global__ void tkernel(int dummy, __grid_constant__ T arg) {}
+
+void foo() {
+  tkernel_const<const S><<<1,1>>>({});
+  tkernel_const<S><<<1,1>>>({});
+  tkernel<const S><<<1,1>>>(1, {});
+}
+//.
+//.
+// CHECK: [[META0:![0-9]+]] = !{ptr @_Z6kernel1Sii, !"kernel", i32 1, !"grid_constant", [[META1:![0-9]+]]}
+// CHECK: [[META1]] = !{i32 1, i32 3}
+// CHECK: [[META2:![0-9]+]] = !{ptr @_Z13tkernel_constIK1SEvT_, !"kernel", i32 1, !"grid_constant", [[META3:![0-9]+]]}
+// CHECK: [[META3]] = !{i32 1}
+// CHECK: [[META4:![0-9]+]] = !{ptr @_Z13tkernel_constI1SEvT_, !"kernel", i32 1, !"grid_constant", [[META3]]}
+// CHECK: [[META5:![0-9]+]] = !{ptr @_Z7tkernelIK1SEviT_, !"kernel", i32 1, !"grid_constant", [[META6:![0-9]+]]}
+// CHECK: [[META6]] = !{i32 2}
+//.
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -39,6 +39,7 @@
 // CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
 // CHECK-NEXT: CUDADeviceBuiltinTextureType (SubjectMatchRule_record)
 // CHECK-NEXT: CUDAGlobal (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
 // CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
 // CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
 // CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -11,6 +11,7 @@
 #define __host__ __attribute__((host))
 #define __shared__ __attribute__((shared))
 #define __managed__ __attribute__((managed))
+#define __grid_constant__ __attribute__((grid_constant))
 #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
 
 struct dim3 {
diff --git a/clang/test/SemaCUDA/grid-constant.cu b/clang/test/SemaCUDA/grid-constant.cu
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s
+#include "Inputs/cuda.h"
+
+struct S {};
+
+__global__ void kernel_struct(__grid_constant__ const S arg) {}
+__global__ void kernel_scalar(__grid_constant__ const int arg) {}
+
+__global__ void gc_kernel_non_const(__grid_constant__ S arg) {} // expected-error {{__grid_constant__ is only allowed on const-qualified kernel parameters}}
+
+void non_kernel(__grid_constant__ S arg) {} // expected-error {{__grid_constant__ is only allowed on const-qualified kernel parameters}}
+
+// templates w/ non-dependent argument types get diagnosed right
+// away, without instantiation.
+template <typename T>
+__global__ void tkernel_nd_const(__grid_constant__ const S arg, T dummy) {}
+template <typename T>
+__global__ void tkernel_nd_non_const(__grid_constant__ S arg, T dummy) {} // expected-error {{__grid_constant__ is only allowed on const-qualified kernel parameters}}
+
+// dependent arguments get diagnosed after instantiation.
+template <typename T>
+__global__ void tkernel_const(__grid_constant__ const T arg) {}
+
+template <typename T>
+__global__ void tkernel(__grid_constant__ T arg) {} // expected-error {{__grid_constant__ is only allowed on const-qualified kernel parameters}}
+
+void foo() {
+  tkernel_const<const S><<<1,1>>>({});
+  tkernel_const<S><<<1,1>>>({});
+  tkernel<const S><<<1,1>>>({});
+  tkernel<S><<<1,1>>>({}); // expected-note {{in instantiation of function template specialization 'tkernel<S>' requested here}}
+}