intel · martygrant · Jul 4, 2024 · Jul 3, 2024 · Jul 4, 2024 · Jul 4, 2024
@@ -1705,20 +1705,14 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr {
   let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
   let Subjects = SubjectList<[Function], ErrorDiag>;
   let AdditionalMembers = [{
-    std::optional<llvm::APSInt> getXDimVal() const {
-      if (const auto *CE = dyn_cast<ConstantExpr>(getXDim()))
-        return CE->getResultAsAPSInt();
-      return std::nullopt;
+    unsigned getXDimVal() const {
+      return cast<ConstantExpr>(getXDim())->getResultAsAPSInt().getExtValue();
     }
-    std::optional<llvm::APSInt> getYDimVal() const {
-      if (const auto *CE = dyn_cast<ConstantExpr>(getYDim()))
-        return CE->getResultAsAPSInt();
-      return std::nullopt;
+    unsigned getYDimVal() const {
+      return cast<ConstantExpr>(getYDim())->getResultAsAPSInt().getExtValue();
     }
-    std::optional<llvm::APSInt> getZDimVal() const {
-      if (const auto *CE = dyn_cast<ConstantExpr>(getZDim()))
-        return CE->getResultAsAPSInt();
-      return std::nullopt;
+    unsigned getZDimVal() const {
+      return cast<ConstantExpr>(getZDim())->getResultAsAPSInt().getExtValue();
     }
   }];
   let Documentation = [SYCLIntelMaxWorkGroupSizeAttrDocs];

@@ -826,9 +826,9 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
     // Attributes arguments (first and third) are reversed on SYCLDevice.
     if (getLangOpts().SYCLIsDevice) {
       llvm::Metadata *AttrMDArgs[] = {
-          llvm::ConstantAsMetadata::get(Builder.getInt(*A->getZDimVal())),
-          llvm::ConstantAsMetadata::get(Builder.getInt(*A->getYDimVal())),
-          llvm::ConstantAsMetadata::get(Builder.getInt(*A->getXDimVal()))};
+          llvm::ConstantAsMetadata::get(Builder.getInt32(A->getZDimVal())),
+          llvm::ConstantAsMetadata::get(Builder.getInt32(A->getYDimVal())),
+          llvm::ConstantAsMetadata::get(Builder.getInt32(A->getXDimVal()))};
       Fn->setMetadata("max_work_group_size",
                       llvm::MDNode::get(Context, AttrMDArgs));
     }

@@ -252,13 +252,13 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
     bool HasMaxWorkGroupSize = false;
     bool HasMinWorkGroupPerCU = false;
     if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
-      auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() *
-                        (*MWGS->getYDimVal()).getExtValue() *
-                        (*MWGS->getXDimVal()).getExtValue();
-      if (MaxThreads > 0) {
-        addNVVMMetadata(F, "maxntidx", MaxThreads);
-        HasMaxWorkGroupSize = true;
-      }
+      HasMaxWorkGroupSize = true;
+      // Flip the indices when converting from SYCL's notation, X,Y,Z (aka
+      // dim0,dim1,dim2) with the fastest-moving dimension rightmost, to
+      // CUDA's, where X is the fastest-moving dimension.
+      addNVVMMetadata(F, "maxntidx", MWGS->getZDimVal());
+      addNVVMMetadata(F, "maxntidy", MWGS->getYDimVal());
+      addNVVMMetadata(F, "maxntidz", MWGS->getXDimVal());
     }
 
     auto attrValue = [&](Expr *E) {

@@ -4,7 +4,7 @@
 // compute unit and maximum work groups per multi-processor attributes, that
 // correspond to CUDA's launch bounds. Expect max_work_group_size,
 // min_work_groups_per_cu and max_work_groups_per_mp that are mapped to
-// maxntidx, minctasm, and maxclusterrank NVVM annotations respectively.
+// maxntid[xyz], minctasm, and maxclusterrank NVVM annotations respectively.
 
 #include "sycl.hpp"
 
@@ -13,24 +13,24 @@ queue q;
 
 class Foo {
 public:
-  [[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2),
+  [[intel::max_work_group_size(2, 4, 8), intel::min_work_groups_per_cu(2),
     intel::max_work_groups_per_mp(4)]] void
   operator()() const {}
 };
 
 template <int N> class Functor {
 public:
-  [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N),
+  [[intel::max_work_group_size(N, 4, 8), intel::min_work_groups_per_cu(N),
     intel::max_work_groups_per_mp(N)]] void
   operator()() const {}
 };
 
 template <int N>
-[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N),
+[[intel::max_work_group_size(N, 4, 8), intel::min_work_groups_per_cu(N),
   intel::max_work_groups_per_mp(N)]] void
 zoo() {}
 
-[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2),
+[[intel::max_work_group_size(2, 4, 8), intel::min_work_groups_per_cu(2),
   intel::max_work_groups_per_mp(4)]] void
 bar() {}
 
@@ -42,7 +42,7 @@ int main() {
 
     // Test attribute is applied on lambda.
     h.single_task<class kernel_name2>(
-        [] [[intel::max_work_group_size(8, 8, 8),
+        [] [[intel::max_work_group_size(2, 4, 8),
              intel::min_work_groups_per_cu(2),
              intel::max_work_groups_per_mp(4)]] () {});
 
@@ -65,41 +65,61 @@ int main() {
 // CHECK: define dso_local void @{{.*}}kernel_name4() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]]
 // CHECK: define dso_local void @{{.*}}kernel_name5() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM_2:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM_2]] !max_work_group_size ![[MWGS_3:[0-9]+]]
 
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name1, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name1, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name2, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name2, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}main{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 384}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidz", i32 6}
 // CHECK: {{.*}}@{{.*}}kernel_name3, !"minctasm", i32 6}
 // CHECK: {{.*}}@{{.*}}kernel_name3, !"maxclusterrank", i32 6}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 384}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidz", i32 6}
 // CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minctasm", i32 6}
 // CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxclusterrank", i32 6}
-// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name4, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name4, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}bar{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 1024}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidz", i32 16}
 // CHECK: {{.*}}@{{.*}}kernel_name5, !"minctasm", i32 16}
 // CHECK: {{.*}}@{{.*}}kernel_name5, !"maxclusterrank", i32 16}
-// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 1024}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidz", i32 16}
 // CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"minctasm", i32 16}
 // CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxclusterrank", i32 16}
 
 // CHECK: ![[MWGPC]] = !{i32 2}
 // CHECK: ![[MWGPM]] = !{i32 4}
-// CHECK: ![[MWGS]] = !{i32 8, i32 8, i32 8}
+// CHECK: ![[MWGS]] = !{i32 8, i32 4, i32 2}
 // CHECK: ![[MWGPC_MWGPM]] = !{i32 6}
-// CHECK: ![[MWGS_2]] = !{i32 8, i32 8, i32 6}
+// CHECK: ![[MWGS_2]] = !{i32 8, i32 4, i32 6}
 // CHECK: ![[MWGPC_MWGPM_2]] = !{i32 16}
-// CHECK: ![[MWGS_3]] = !{i32 8, i32 8, i32 16}
+// CHECK: ![[MWGS_3]] = !{i32 8, i32 4, i32 16}