[SYCL][NVPTX] Split max_work_group_size into 3 NVVM annotations (#14420)

frasercrmck · web-flow · commit ef62cadd678e · 2024-07-04T16:30:25.000+01:00
NVVM IR supports separate maxntidx, maxntidy, and maxntidz annotations,
which the backend prints individually as three dimensions. Emitting
them separately better preserves programmer intent than prematurely
flattening them into a single value.

Note that the semantics are in fact identical to the flattened form:
the CUDA implementation internally multiplies all dimensions together
and only guarantees that the total is never exceeded, not that each
individual dimension stays within its bound. Thus 64,1,1 is equivalent
to 4,4,4.

We try to preserve a logical mapping of dimensions by flipping indices
between SYCL (z,y,x) and NVVM (x,y,z), in CUDA terminology, even
though, as mentioned above, the order is largely irrelevant.

This patch also simplifies the attribute's getter functions, since all
dimensions are mandatory; the getters appear to have been copied from
the reqd_work_group_size attribute, where some dimensions are optional.

We could probably improve the code further by making the operands
"unsigned" rather than "Expr", and by renaming them from X,Y,Z to
Dim{0,1,2} as per the SYCL spec. This has been left for future work,
however, as a non-trivial amount of code expects to treat the
max_work_group_size and reqd_work_group_size attributes identically
through templates and identical helper methods.
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
@@ -1705,20 +1705,14 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr {
   let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
   let Subjects = SubjectList<[Function], ErrorDiag>;
   let AdditionalMembers = [{
-    std::optional<llvm::APSInt> getXDimVal() const {
-      if (const auto *CE = dyn_cast<ConstantExpr>(getXDim()))
-        return CE->getResultAsAPSInt();
-      return std::nullopt;
+    unsigned getXDimVal() const {
+      return cast<ConstantExpr>(getXDim())->getResultAsAPSInt().getExtValue();
     }
-    std::optional<llvm::APSInt> getYDimVal() const {
-      if (const auto *CE = dyn_cast<ConstantExpr>(getYDim()))
-        return CE->getResultAsAPSInt();
-      return std::nullopt;
+    unsigned getYDimVal() const {
+      return cast<ConstantExpr>(getYDim())->getResultAsAPSInt().getExtValue();
     }
-    std::optional<llvm::APSInt> getZDimVal() const {
-      if (const auto *CE = dyn_cast<ConstantExpr>(getZDim()))
-        return CE->getResultAsAPSInt();
-      return std::nullopt;
+    unsigned getZDimVal() const {
+      return cast<ConstantExpr>(getZDim())->getResultAsAPSInt().getExtValue();
     }
   }];
   let Documentation = [SYCLIntelMaxWorkGroupSizeAttrDocs];
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -826,9 +826,9 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
     // Attributes arguments (first and third) are reversed on SYCLDevice.
     if (getLangOpts().SYCLIsDevice) {
       llvm::Metadata *AttrMDArgs[] = {
-          llvm::ConstantAsMetadata::get(Builder.getInt(*A->getZDimVal())),
-          llvm::ConstantAsMetadata::get(Builder.getInt(*A->getYDimVal())),
-          llvm::ConstantAsMetadata::get(Builder.getInt(*A->getXDimVal()))};
+          llvm::ConstantAsMetadata::get(Builder.getInt32(A->getZDimVal())),
+          llvm::ConstantAsMetadata::get(Builder.getInt32(A->getYDimVal())),
+          llvm::ConstantAsMetadata::get(Builder.getInt32(A->getXDimVal()))};
       Fn->setMetadata("max_work_group_size",
                       llvm::MDNode::get(Context, AttrMDArgs));
     }
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -252,13 +252,13 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
     bool HasMaxWorkGroupSize = false;
     bool HasMinWorkGroupPerCU = false;
     if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
-      auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() *
-                        (*MWGS->getYDimVal()).getExtValue() *
-                        (*MWGS->getXDimVal()).getExtValue();
-      if (MaxThreads > 0) {
-        addNVVMMetadata(F, "maxntidx", MaxThreads);
-        HasMaxWorkGroupSize = true;
-      }
+      HasMaxWorkGroupSize = true;
+      // We must index-flip between SYCL's notation, X,Y,Z (aka dim0,dim1,dim2)
+      // with the fastest-moving dimension rightmost, to CUDA's, where X is the
+      // fastest-moving dimension.
+      addNVVMMetadata(F, "maxntidx", MWGS->getZDimVal());
+      addNVVMMetadata(F, "maxntidy", MWGS->getYDimVal());
+      addNVVMMetadata(F, "maxntidz", MWGS->getXDimVal());
     }
 
     auto attrValue = [&](Expr *E) {
diff --git a/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp
@@ -4,7 +4,7 @@
 // compute unit and maximum work groups per multi-processor attributes, that
 // correspond to CUDA's launch bounds. Expect max_work_group_size,
 // min_work_groups_per_cu and max_work_groups_per_mp that are mapped to
-// maxntidx, minctasm, and maxclusterrank NVVM annotations respectively.
+// maxntid[xyz], minctasm, and maxclusterrank NVVM annotations respectively.
 
 #include "sycl.hpp"
 
@@ -13,24 +13,24 @@ queue q;
 
 class Foo {
 public:
-  [[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2),
+  [[intel::max_work_group_size(2, 4, 8), intel::min_work_groups_per_cu(2),
     intel::max_work_groups_per_mp(4)]] void
   operator()() const {}
 };
 
 template <int N> class Functor {
 public:
-  [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N),
+  [[intel::max_work_group_size(N, 4, 8), intel::min_work_groups_per_cu(N),
     intel::max_work_groups_per_mp(N)]] void
   operator()() const {}
 };
 
 template <int N>
-[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N),
+[[intel::max_work_group_size(N, 4, 8), intel::min_work_groups_per_cu(N),
   intel::max_work_groups_per_mp(N)]] void
 zoo() {}
 
-[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2),
+[[intel::max_work_group_size(2, 4, 8), intel::min_work_groups_per_cu(2),
   intel::max_work_groups_per_mp(4)]] void
 bar() {}
 
@@ -42,7 +42,7 @@ int main() {
 
     // Test attribute is applied on lambda.
     h.single_task<class kernel_name2>(
-        [] [[intel::max_work_group_size(8, 8, 8),
+        [] [[intel::max_work_group_size(2, 4, 8),
              intel::min_work_groups_per_cu(2),
              intel::max_work_groups_per_mp(4)]] () {});
 
@@ -65,41 +65,61 @@ int main() {
 // CHECK: define dso_local void @{{.*}}kernel_name4() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]]
 // CHECK: define dso_local void @{{.*}}kernel_name5() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM_2:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM_2]] !max_work_group_size ![[MWGS_3:[0-9]+]]
 
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name1, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name1, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name2, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name2, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}main{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 384}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidz", i32 6}
 // CHECK: {{.*}}@{{.*}}kernel_name3, !"minctasm", i32 6}
 // CHECK: {{.*}}@{{.*}}kernel_name3, !"maxclusterrank", i32 6}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 384}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidz", i32 6}
 // CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minctasm", i32 6}
 // CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxclusterrank", i32 6}
-// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name4, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name4, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}bar{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 1024}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidz", i32 16}
 // CHECK: {{.*}}@{{.*}}kernel_name5, !"minctasm", i32 16}
 // CHECK: {{.*}}@{{.*}}kernel_name5, !"maxclusterrank", i32 16}
-// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 1024}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidz", i32 16}
 // CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"minctasm", i32 16}
 // CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxclusterrank", i32 16}
 
 // CHECK: ![[MWGPC]] = !{i32 2}
 // CHECK: ![[MWGPM]] = !{i32 4}
-// CHECK: ![[MWGS]] = !{i32 8, i32 8, i32 8}
+// CHECK: ![[MWGS]] = !{i32 8, i32 4, i32 2}
 // CHECK: ![[MWGPC_MWGPM]] = !{i32 6}
-// CHECK: ![[MWGS_2]] = !{i32 8, i32 8, i32 6}
+// CHECK: ![[MWGS_2]] = !{i32 8, i32 4, i32 6}
 // CHECK: ![[MWGPC_MWGPM_2]] = !{i32 16}
-// CHECK: ![[MWGS_3]] = !{i32 8, i32 8, i32 16}
+// CHECK: ![[MWGS_3]] = !{i32 8, i32 4, i32 16}