intel
diff --git a/‎clang/include/clang/Basic/Attr.td
Lines changed: 6 additions & 12 deletions b/‎clang/include/clang/Basic/Attr.td
Lines changed: 6 additions & 12 deletions
diff --git a/‎clang/include/clang/Basic/AttrDocs.td
Lines changed: 42 additions & 35 deletions b/‎clang/include/clang/Basic/AttrDocs.td
Lines changed: 42 additions & 35 deletions
diff --git a/‎clang/lib/CodeGen/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎clang/lib/CodeGen/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎clang/lib/CodeGen/CodeGenFunction.cpp
Lines changed: 3 additions & 3 deletions b/‎clang/lib/CodeGen/CodeGenFunction.cpp
Lines changed: 3 additions & 3 deletions
diff --git a/‎clang/lib/CodeGen/Targets/NVPTX.cpp
Lines changed: 8 additions & 8 deletions b/‎clang/lib/CodeGen/Targets/NVPTX.cpp
Lines changed: 8 additions & 8 deletions
diff --git a/‎clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp
Lines changed: 49 additions & 29 deletions b/‎clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp
Lines changed: 49 additions & 29 deletions
@@ -1704,20 +1704,14 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr {
   let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
   let Subjects = SubjectList<[Function], ErrorDiag>;
   let AdditionalMembers = [{
-    std::optional<llvm::APSInt> getXDimVal() const {
-      if (const auto *CE = dyn_cast<ConstantExpr>(getXDim()))
-        return CE->getResultAsAPSInt();
-      return std::nullopt;
+    unsigned getXDimVal() const {
+      return cast<ConstantExpr>(getXDim())->getResultAsAPSInt().getExtValue();
     }
-    std::optional<llvm::APSInt> getYDimVal() const {
-      if (const auto *CE = dyn_cast<ConstantExpr>(getYDim()))
-        return CE->getResultAsAPSInt();
-      return std::nullopt;
+    unsigned getYDimVal() const {
+      return cast<ConstantExpr>(getYDim())->getResultAsAPSInt().getExtValue();
     }
-    std::optional<llvm::APSInt> getZDimVal() const {
-      if (const auto *CE = dyn_cast<ConstantExpr>(getZDim()))
-        return CE->getResultAsAPSInt();
-      return std::nullopt;
+    unsigned getZDimVal() const {
+      return cast<ConstantExpr>(getZDim())->getResultAsAPSInt().getExtValue();
     }
   }];
   let Documentation = [SYCLIntelMaxWorkGroupSizeAttrDocs];
 
@@ -2980,18 +2980,32 @@ argument to **clEnqueueNDRangeKernel** (in OpenCL) or to
 **parallel_for** in SYCL. This allows the compiler to optimize the
 generated code appropriately for the kernel to which attribute is applied.
 
-While semantic of this attribute is the same between OpenCL and SYCL,
-spelling is a bit different: 
-
-SYCL 2020 describes the ``[[sycl::reqd_work_group_size(X, Y, Z)]]`` spelling
-in detail. This attribute indicates that the kernel must be launched with the
-specified work-group size. The order of the arguments matches the constructor
-of the group class. Each argument to the attribute must be an integral constant
-expression. The dimensionality of the attribute variant used must match the
-dimensionality of the work-group used to invoke the kernel. This spelling
-allows the Y and Z arguments to be optional. If not provided by the user, the
-value of Y and Z defaults to 1. See section 5.8.1 Kernel Attributes for more
-details.
+The arguments to ``reqd_work_group_size`` are ordered based on which index
+increments the fastest. In OpenCL, the first argument is the index that
+increments the fastest. In SYCL, the last argument is the index that increments
+the fastest.
+
+In OpenCL C, this attribute is available with the GNU spelling
+(``__attribute__((reqd_work_group_size(X, Y, Z)))``) and all three arguments
+are required. See section 6.7.2 Optional Attribute Qualifiers of OpenCL 1.2
+specification for details.
+
+.. code-block:: c++
+
+  __kernel __attribute__((reqd_work_group_size(8, 16, 32))) void test() {}
+
+In SYCL, the attribute accepts either one, two, or three arguments; in each
+form, the last (or only) argument is the index that increments fastest. The
+number of arguments passed to the attribute must match the dimensionality of
+the kernel the attribute is applied to.
+
+SYCL 2020 describes the ``[[sycl::reqd_work_group_size(dim0, dim1, dim2)]]``
+spelling in detail. This attribute indicates that the kernel must be launched
+with the specified work-group size. The order of the arguments matches the
+constructor of the ``range`` class. Each argument to the attribute must be an
+integral constant expression. The dimensionality of the attribute variant used
+must match the dimensionality of the work-group used to invoke the kernel. See
+section 5.8.1 Kernel Attributes for more details.
 
 In SYCL 1.2.1 mode, the ``cl::reqd_work_group_size`` and
 ``sycl::reqd_work_group_size`` attributes are propagated from the function they
@@ -3016,18 +3030,9 @@ attributes are not propagated to the kernel.
   template <int N, int N1, int N2>
   [[sycl::reqd_work_group_size(N, N1, N2)]] void func() {}
 
-The ``[[cl::reqd_work_group_size(X, Y, Z)]]`` and
-``__attribute__((reqd_work_group_size(X, Y, Z)))`` spellings are both
+The ``[[cl::reqd_work_group_size(dim0, dim1, dim2)]]`` and
+``__attribute__((reqd_work_group_size(dim0, dim1, dim2)))`` spellings are both
 deprecated in SYCL 2020.
-
-In OpenCL C, this attribute is available with the GNU spelling
-(``__attribute__((reqd_work_group_size(X, Y, Z)))``), see section
-6.7.2 Optional Attribute Qualifiers of OpenCL 1.2 specification for details.
-
-.. code-block:: c++
-
-  __kernel __attribute__((reqd_work_group_size(8, 16, 32))) void test() {}
-
   }];
 }
 
@@ -3041,6 +3046,15 @@ unsigned. The number of dimensional values defined provide additional
 information to the compiler on the dimensionality most likely to be used when
 launching the kernel at runtime.
 
+The arguments to ``work_group_size_hint`` are ordered based on which index
+increments the fastest. In OpenCL, the first argument is the index that
+increments the fastest. In SYCL, the last argument is the index that increments
+the fastest.
+
+In OpenCL C, this attribute is available with the GNU spelling
+(``__attribute__((work_group_size_hint(X, Y, Z)))``) and all three arguments
+are required.
+
 The GNU spelling is deprecated in SYCL mode.
 
 .. code-block:: c++
@@ -3052,15 +3066,6 @@ The GNU spelling is deprecated in SYCL mode.
     [[sycl::work_group_size_hint(2, 2, 2)]] void operator()() const {}
   };
 
-The arguments to ``work_group_size_hint`` are ordered based on which index
-increments the fastest. In OpenCL, the first argument is the index that
-increments the fastest, and in SYCL, the last argument is the index that
-increments the fastest.
-
-In OpenCL C, this attribute is available with the GNU spelling
-(``__attribute__((work_group_size_hint(X, Y, Z)))``) and all 
-three arguments are required.
-
 In SYCL, the attribute accepts either one, two, or three arguments; in each
 form, the last (or only) argument is the index that increments fastest. The
 number of arguments passed to the attribute must match the dimensionality of
@@ -3077,9 +3082,11 @@ def SYCLIntelMaxWorkGroupSizeAttrDocs : Documentation {
   let Heading = "intel::max_work_group_size";
   let Content = [{
 Applies to a device function/lambda function. Indicates the maximum dimensions
-of a work group. Values must be positive integers. This is similar to
-reqd_work_group_size, but allows work groups that are smaller or equal to the
-specified sizes.
+of a work group. Values must be positive integers. This attribute behaves
+similarly to ``reqd_work_group_size``, but allows work groups that are smaller
+or equal to the specified sizes. The dimensionality behaves the same as with
+the SYCL ``reqd_work_group_size`` attribute, but *all* dimensions must be
+provided.
 
 In SYCL 1.2.1 mode, the ``intel::max_work_group_size`` attribute is propagated
 from the function it is applied to onto the kernel which calls the function.
 
@@ -29,6 +29,7 @@ set(LLVM_LINK_COMPONENTS
   ScalarOpts
   Support
   SYCLLowerIR
+  SYCLNativeCPUUtils
   Target
   TargetParser
   TransformUtils
 
@@ -826,9 +826,9 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
     // Attributes arguments (first and third) are reversed on SYCLDevice.
     if (getLangOpts().SYCLIsDevice) {
       llvm::Metadata *AttrMDArgs[] = {
-          llvm::ConstantAsMetadata::get(Builder.getInt(*A->getZDimVal())),
-          llvm::ConstantAsMetadata::get(Builder.getInt(*A->getYDimVal())),
-          llvm::ConstantAsMetadata::get(Builder.getInt(*A->getXDimVal()))};
+          llvm::ConstantAsMetadata::get(Builder.getInt32(A->getZDimVal())),
+          llvm::ConstantAsMetadata::get(Builder.getInt32(A->getYDimVal())),
+          llvm::ConstantAsMetadata::get(Builder.getInt32(A->getXDimVal()))};
       Fn->setMetadata("max_work_group_size",
                       llvm::MDNode::get(Context, AttrMDArgs));
     }
 
@@ -252,13 +252,13 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
     bool HasMaxWorkGroupSize = false;
     bool HasMinWorkGroupPerCU = false;
     if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
-      auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() *
-                        (*MWGS->getYDimVal()).getExtValue() *
-                        (*MWGS->getXDimVal()).getExtValue();
-      if (MaxThreads > 0) {
-        addNVVMMetadata(F, "maxntidx", MaxThreads);
-        HasMaxWorkGroupSize = true;
-      }
+      HasMaxWorkGroupSize = true;
+      // We must index-flip between SYCL's notation, X,Y,Z (aka dim0,dim1,dim2)
+      // with the fastest-moving dimension rightmost, to CUDA's, where X is the
+      // fastest-moving dimension.
+      addNVVMMetadata(F, "maxntidx", MWGS->getZDimVal());
+      addNVVMMetadata(F, "maxntidy", MWGS->getYDimVal());
+      addNVVMMetadata(F, "maxntidz", MWGS->getXDimVal());
     }
 
     auto attrValue = [&](Expr *E) {
@@ -275,7 +275,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
             << MWGPCU << 0;
       } else {
         // The value is guaranteed to be > 0, pass it to the metadata.
-        addNVVMMetadata(F, "minnctapersm", attrValue(MWGPCU->getValue()));
+        addNVVMMetadata(F, "minctasm", attrValue(MWGPCU->getValue()));
         HasMinWorkGroupPerCU = true;
       }
     }
 
@@ -4,7 +4,7 @@
 // compute unit and maximum work groups per multi-processor attributes, that
 // correspond to CUDA's launch bounds. Expect max_work_group_size,
 // min_work_groups_per_cu and max_work_groups_per_mp that are mapped to
-// maxntidx, minnctapersm, maxclusterrank PTX directives respectively.
+// maxntid[xyz], minctasm, and maxclusterrank NVVM annotations respectively.
 
 #include "sycl.hpp"
 
@@ -13,24 +13,24 @@ queue q;
 
 class Foo {
 public:
-  [[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2),
+  [[intel::max_work_group_size(2, 4, 8), intel::min_work_groups_per_cu(2),
     intel::max_work_groups_per_mp(4)]] void
   operator()() const {}
 };
 
 template <int N> class Functor {
 public:
-  [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N),
+  [[intel::max_work_group_size(N, 4, 8), intel::min_work_groups_per_cu(N),
     intel::max_work_groups_per_mp(N)]] void
   operator()() const {}
 };
 
 template <int N>
-[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N),
+[[intel::max_work_group_size(N, 4, 8), intel::min_work_groups_per_cu(N),
   intel::max_work_groups_per_mp(N)]] void
 zoo() {}
 
-[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2),
+[[intel::max_work_group_size(2, 4, 8), intel::min_work_groups_per_cu(2),
   intel::max_work_groups_per_mp(4)]] void
 bar() {}
 
@@ -42,7 +42,7 @@ int main() {
 
     // Test attribute is applied on lambda.
     h.single_task<class kernel_name2>(
-        [] [[intel::max_work_group_size(8, 8, 8),
+        [] [[intel::max_work_group_size(2, 4, 8),
              intel::min_work_groups_per_cu(2),
              intel::max_work_groups_per_mp(4)]] () {});
 
@@ -65,41 +65,61 @@ int main() {
 // CHECK: define dso_local void @{{.*}}kernel_name4() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]]
 // CHECK: define dso_local void @{{.*}}kernel_name5() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM_2:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM_2]] !max_work_group_size ![[MWGS_3:[0-9]+]]
 
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 512}
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"minnctapersm", i32 2}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidz", i32 2}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name1, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 512}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minnctapersm", i32 2}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidz", i32 2}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 512}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"minnctapersm", i32 2}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidz", i32 2}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name2, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 512}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"minnctapersm", i32 2}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidz", i32 2}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 384}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"minnctapersm", i32 6}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidz", i32 6}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"minctasm", i32 6}
 // CHECK: {{.*}}@{{.*}}kernel_name3, !"maxclusterrank", i32 6}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 384}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minnctapersm", i32 6}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidz", i32 6}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minctasm", i32 6}
 // CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxclusterrank", i32 6}
-// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 512}
-// CHECK: {{.*}}@{{.*}}kernel_name4, !"minnctapersm", i32 2}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidz", i32 2}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name4, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 512}
-// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"minnctapersm", i32 2}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidz", i32 2}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 1024}
-// CHECK: {{.*}}@{{.*}}kernel_name5, !"minnctapersm", i32 16}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidz", i32 16}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"minctasm", i32 16}
 // CHECK: {{.*}}@{{.*}}kernel_name5, !"maxclusterrank", i32 16}
-// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 1024}
-// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"minnctapersm", i32 16}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidz", i32 16}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"minctasm", i32 16}
 // CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxclusterrank", i32 16}
 
 // CHECK: ![[MWGPC]] = !{i32 2}
 // CHECK: ![[MWGPM]] = !{i32 4}
-// CHECK: ![[MWGS]] = !{i32 8, i32 8, i32 8}
+// CHECK: ![[MWGS]] = !{i32 8, i32 4, i32 2}
 // CHECK: ![[MWGPC_MWGPM]] = !{i32 6}
-// CHECK: ![[MWGS_2]] = !{i32 8, i32 8, i32 6}
+// CHECK: ![[MWGS_2]] = !{i32 8, i32 4, i32 6}
 // CHECK: ![[MWGPC_MWGPM_2]] = !{i32 16}
-// CHECK: ![[MWGS_3]] = !{i32 8, i32 8, i32 16}
+// CHECK: ![[MWGS_3]] = !{i32 8, i32 4, i32 16}