intel · bader · Aug 5, 2024 · Jul 18, 2024 · Jul 18, 2024 · Jul 22, 2024
@@ -56,6 +56,7 @@
 #include "llvm/SYCLLowerIR/RecordSYCLAspectNames.h"
 #include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h"
 #include "llvm/SYCLLowerIR/SYCLConditionalCallOnDevice.h"
+#include "llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateJointMatrixUsage.h"
 #include "llvm/SYCLLowerIR/SYCLVirtualFunctionsAnalysis.h"
@@ -1151,6 +1152,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
       // and before cleaning up metadata)
       MPM.addPass(RecordSYCLAspectNamesPass());
 
+      if (TargetTriple.isNVPTX())
+        MPM.addPass(SYCLCreateNVVMAnnotationsPass());
+
       // Remove SYCL metadata added by the frontend, like sycl_aspects
       // Note, this pass should be at the end of the pipeline
       MPM.addPass(CleanupSYCLMetadataPass());

@@ -291,80 +291,6 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
           addNVVMMetadata(F, "grid_constant", GridConstantParamIdxs);
       }
     }
-    bool HasMaxWorkGroupSize = false;
-    bool HasMinWorkGroupPerCU = false;
-    if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
-      HasMaxWorkGroupSize = true;
-      // We must index-flip between SYCL's notation, X,Y,Z (aka dim0,dim1,dim2)
-      // with the fastest-moving dimension rightmost, to CUDA's, where X is the
-      // fastest-moving dimension.
-      addNVVMMetadata(F, "maxntidx", MWGS->getZDimVal());
-      addNVVMMetadata(F, "maxntidy", MWGS->getYDimVal());
-      addNVVMMetadata(F, "maxntidz", MWGS->getXDimVal());
-    }
-
-    if (const auto *RWGS = FD->getAttr<SYCLReqdWorkGroupSizeAttr>()) {
-      llvm::SmallVector<std::optional<int64_t>, 3> Ops;
-      // Index-flip and pad out any missing elements. Note the misleading
-      // nomenclature of the methods: getXDimVal doesn't return the X dimension;
-      // it returns the left-most dimension (dim0). This could correspond to
-      // CUDA's X, Y, or Z, depending on the number of operands provided.
-      if (auto Dim0 = RWGS->getXDimVal())
-        Ops.push_back(Dim0->getExtValue());
-      if (auto Dim1 = RWGS->getYDimVal())
-        Ops.push_back(Dim1->getExtValue());
-      if (auto Dim2 = RWGS->getZDimVal())
-        Ops.push_back(Dim2->getExtValue());
-      std::reverse(Ops.begin(), Ops.end());
-      Ops.append(3 - Ops.size(), std::nullopt);
-
-      // Work-group sizes (in NVVM annotations) must be positive and less than
-      // INT32_MAX, whereas SYCL can allow for larger work-group sizes (see
-      // -fno-sycl-id-queries-fit-in-int). If any dimension is too large for
-      // NVPTX, don't emit any annotation at all.
-      if (llvm::all_of(Ops, [](std::optional<int64_t> V) {
-            return !V || llvm::isUInt<31>(*V);
-          })) {
-        if (auto X = Ops[0])
-          addNVVMMetadata(F, "reqntidx", *X);
-        if (auto Y = Ops[1])
-          addNVVMMetadata(F, "reqntidy", *Y);
-        if (auto Z = Ops[2])
-          addNVVMMetadata(F, "reqntidz", *Z);
-      }
-    }
-
-    auto attrValue = [&](Expr *E) {
-      const auto *CE = cast<ConstantExpr>(E);
-      std::optional<llvm::APInt> Val = CE->getResultAsAPSInt();
-      return Val->getZExtValue();
-    };
-
-    if (const auto *MWGPCU =
-            FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
-      if (!HasMaxWorkGroupSize && FD->hasAttr<OpenCLKernelAttr>()) {
-        M.getDiags().Report(D->getLocation(),
-                            diag::warn_launch_bounds_missing_attr)
-            << MWGPCU << 0;
-      } else {
-        // The value is guaranteed to be > 0, pass it to the metadata.
-        addNVVMMetadata(F, "minctasm", attrValue(MWGPCU->getValue()));
-        HasMinWorkGroupPerCU = true;
-      }
-    }
-
-    if (const auto *MWGPMP =
-            FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
-      if ((!HasMaxWorkGroupSize || !HasMinWorkGroupPerCU) &&
-          FD->hasAttr<OpenCLKernelAttr>()) {
-        M.getDiags().Report(D->getLocation(),
-                            diag::warn_launch_bounds_missing_attr)
-            << MWGPMP << 1;
-      } else {
-        // The value is guaranteed to be > 0, pass it to the metadata.
-        addNVVMMetadata(F, "maxclusterrank", attrValue(MWGPMP->getValue()));
-      }
-    }
   }
 
   // Perform special handling in CUDA mode.

@@ -4089,11 +4089,17 @@ bool static check32BitInt(const Expr *E, Sema &S, llvm::APSInt &I,
 
 void Sema::AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(
     Decl *D, const AttributeCommonInfo &CI, Expr *E) {
-  if (Context.getLangOpts().SYCLIsDevice &&
-      !Context.getTargetInfo().getTriple().isNVPTX()) {
-    Diag(E->getBeginLoc(), diag::warn_launch_bounds_is_cuda_specific)
-        << CI << E->getSourceRange();
-    return;
+  if (Context.getLangOpts().SYCLIsDevice) {
+    if (!Context.getTargetInfo().getTriple().isNVPTX()) {
+      Diag(E->getBeginLoc(), diag::warn_launch_bounds_is_cuda_specific)
+          << CI << E->getSourceRange();
+      return;
+    }
+
+    if (!D->hasAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
+      Diag(CI.getLoc(), diag::warn_launch_bounds_missing_attr) << CI << 0;
+      return;
+    }
   }
   if (!E->isValueDependent()) {
     // Validate that we have an integer constant expression and then store the
@@ -4154,6 +4160,12 @@ void Sema::AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
           << CudaArchToString(SM) << CI << E->getSourceRange();
       return;
     }
+
+    if (!D->hasAttr<SYCLIntelMaxWorkGroupSizeAttr>() ||
+        !D->hasAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
+      Diag(CI.getLoc(), diag::warn_launch_bounds_missing_attr) << CI << 1;
+      return;
+    }
   }
   if (!E->isValueDependent()) {
     // Validate that we have an integer constant expression and then store the

@@ -48,37 +48,6 @@ int main() {
 // CHECK: define dso_local void @{{.*}}kernel_name2() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]]
 // CHECK: define dso_local void @{{.*}}kernel_name3() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM]] !max_work_group_size ![[MWGS_2:[0-9]+]]
 
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 8}
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidy", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidz", i32 2}
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"minctasm", i32 2}
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 8}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidy", i32 4}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidz", i32 2}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minctasm", i32 2}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 8}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidy", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidz", i32 2}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"minctasm", i32 2}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 8}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidy", i32 4}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidz", i32 2}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"minctasm", i32 2}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 8}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidy", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidz", i32 6}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"minctasm", i32 6}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxclusterrank", i32 6}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 8}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidy", i32 4}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidz", i32 6}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minctasm", i32 6}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxclusterrank", i32 6}
-
 // CHECK: ![[MWGPC]] = !{i32 2}
 // CHECK: ![[MWGPM]] = !{i32 4}
 // CHECK: ![[MWGS]] = !{i32 8, i32 4, i32 2}

@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-NVPTX
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx64-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-NVPTX
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx64-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 
 #include "sycl.hpp"
 
@@ -123,55 +123,6 @@ int main() {
 // CHECK: define {{.*}} void @{{.*}}kernel_name22() #0 {{.*}} !work_group_num_dim ![[NDRWGS1D:[0-9]+]] !reqd_work_group_size ![[WGSIZE1D22:[0-9]+]]
 // CHECK: define {{.*}} void @{{.*}}kernel_name24() #0 {{.*}} !work_group_num_dim ![[NDRWGS1D:[0-9]+]] !reqd_work_group_size ![[WGSIZE1D2:[0-9]+]]
 
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name1, !"reqntidx", i32 16}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name1, !"reqntidy", i32 16}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name1, !"reqntidz", i32 32}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name3, !"reqntidx", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name3, !"reqntidy", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name3, !"reqntidz", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name4, !"reqntidx", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name4, !"reqntidy", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name4, !"reqntidz", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name6, !"reqntidx", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name6, !"reqntidy", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name6, !"reqntidz", i32 1}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name7, !"reqntidx", i32 16}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name7, !"reqntidy", i32 16}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name7, !"reqntidz", i32 32}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name9, !"reqntidx", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name9, !"reqntidy", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name9, !"reqntidz", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name10, !"reqntidx", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name10, !"reqntidy", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name10, !"reqntidz", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name12, !"reqntidx", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name12, !"reqntidy", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name12, !"reqntidz", i32 1}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name13, !"reqntidx", i32 16}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name13, !"reqntidy", i32 32}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name13, !"reqntidz"
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name15, !"reqntidx", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name15, !"reqntidy", i32 8}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name15, !"reqntidz"
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name16, !"reqntidx", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name16, !"reqntidy", i32 2}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name16, !"reqntidz"
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name18, !"reqntidx", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name18, !"reqntidy", i32 1}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name18, !"reqntidz"
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name19, !"reqntidx", i32 32}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name19, !"reqntidy",
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name19, !"reqntidz",
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name21, !"reqntidx", i32 8}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name21, !"reqntidy",
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name21, !"reqntidz",
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name22, !"reqntidx", i32 2}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name22, !"reqntidy",
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name22, !"reqntidz",
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name24, !"reqntidx", i32 1}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name24, !"reqntidy",
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name24, !"reqntidz",
-
 // CHECK: ![[NDRWGS3D]] = !{i32 3}
 // CHECK: ![[WGSIZE3D32]] = !{i32 16, i32 16, i32 32}
 // CHECK: ![[WGSIZE3D88]] = !{i32 8, i32 8, i32 8}

@@ -48,9 +48,14 @@ int main() {
               intel::min_work_groups_per_cu(4),
               intel::min_work_groups_per_cu(8)]] () { volatile int A = 42; });
 
-    // expected-error@+2 {{'min_work_groups_per_cu' attribute requires a non-negative integral compile time constant expression}}
+    // expected-warning@+2 {{'min_work_groups_per_cu' attribute ignored, as it requires: maximum work group size to be also specified}}
     cgh.single_task<class T5>(
-        [=] [[intel::min_work_groups_per_cu(-8)]] () { volatile int A = 42; });
+        [=] [[intel::min_work_groups_per_cu(8)]] () { volatile int A = 42; });
+
+    // expected-error@+3 {{'min_work_groups_per_cu' attribute requires a non-negative integral compile time constant expression}}
+    cgh.single_task<class T5>(
+        [=] [[intel::max_work_group_size(1, 1, 256),
+              intel::min_work_groups_per_cu(-8)]] () { volatile int A = 42; });
   });
 
   Q.submit([&](sycl::handler &cgh) {

@@ -15,9 +15,9 @@ template <int N1, int N2, int N3> class Functor {
   operator()() const {}
 };
 
-// expected-warning@+1 {{'max_work_groups_per_mp' attribute ignored, as it requires: maximum work group size and minimum work groups per compute unit to be also specified}}
 template <int N1, int N2> class Functor_2 {
 public:
+// expected-warning@+2 {{'max_work_groups_per_mp' attribute ignored, as it requires: maximum work group size and minimum work groups per compute unit to be also specified}}
   [[intel::max_work_group_size(1, 1, N1),
     intel::max_work_groups_per_mp(N2)]] void
   operator()() const {}
@@ -32,7 +32,7 @@ int main() {
               intel::min_work_groups_per_cu(2),
               intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; });
 
-    // expected-warning@+2 {{'max_work_groups_per_mp' attribute ignored, as it requires: maximum work group size and minimum work groups per compute unit to be also specified}}
+    // expected-warning@+3 {{'max_work_groups_per_mp' attribute ignored, as it requires: maximum work group size and minimum work groups per compute unit to be also specified}}
     cgh.single_task<class T2>(
         [=] [[intel::max_work_group_size(1, 1, 256),
               intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; });

@@ -0,0 +1,30 @@
+//===- SYCLCreateNVVMAnnotations.h - SYCLCreateNVVMAnnotationsPass --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers function metadata to NVVM annotations
+//
+//===----------------------------------------------------------------------===//
+//
+#ifndef LLVM_SYCL_CREATE_NVVM_ANNOTATIONS_H
+#define LLVM_SYCL_CREATE_NVVM_ANNOTATIONS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class SYCLCreateNVVMAnnotationsPass
+    : public PassInfoMixin<SYCLCreateNVVMAnnotationsPass> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
+
+  static bool isRequired() { return true; }
+};
+
+} // namespace llvm
+
+#endif // LLVM_SYCL_CREATE_NVVM_ANNOTATIONS_H
@@ -128,6 +128,7 @@
 #include "llvm/SYCLLowerIR/RecordSYCLAspectNames.h"
 #include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h"
 #include "llvm/SYCLLowerIR/SYCLConditionalCallOnDevice.h"
+#include "llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateJointMatrixUsage.h"
 #include "llvm/SYCLLowerIR/SYCLVirtualFunctionsAnalysis.h"

@@ -160,6 +160,7 @@ MODULE_PASS("sycl-propagate-joint-matrix-usage", SYCLPropagateJointMatrixUsagePa
 MODULE_PASS("sycl-add-opt-level-attribute", SYCLAddOptLevelAttributePass())
 MODULE_PASS("compile-time-properties", CompileTimePropertiesPass())
 MODULE_PASS("cleanup-sycl-metadata", CleanupSYCLMetadataPass())
+MODULE_PASS("sycl-create-nvvm-annotations", SYCLCreateNVVMAnnotationsPass())
 MODULE_PASS("lower-slm-reservation-calls", ESIMDLowerSLMReservationCalls())
 MODULE_PASS("record-sycl-aspect-names", RecordSYCLAspectNamesPass())
 MODULE_PASS("sycl-virtual-functions-analysis",

@@ -60,6 +60,7 @@ add_llvm_component_library(LLVMSYCLLowerIR
   SpecConstants.cpp
   SYCLAddOptLevelAttribute.cpp
   SYCLConditionalCallOnDevice.cpp
+  SYCLCreateNVVMAnnotations.cpp
   SYCLDeviceLibReqMask.cpp
   SYCLDeviceRequirements.cpp
   SYCLKernelParamOptInfo.cpp

@@ -12,6 +12,7 @@
 #include "llvm/SYCLLowerIR/DeviceGlobals.h"
 #include "llvm/SYCLLowerIR/ESIMD/ESIMDUtils.h"
 #include "llvm/SYCLLowerIR/HostPipes.h"
+#include "llvm/SYCLLowerIR/TargetHelpers.h"
 
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/StringMap.h"
@@ -367,7 +368,8 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) {
     SmallVector<StringRef, 3> ValStrs;
     Attr.getValueAsString().split(ValStrs, ',');
 
-    assert(ValStrs.size() <= 3 &&
+    size_t NumDims = ValStrs.size();
+    assert(NumDims <= 3 &&
            "sycl-work-group-size and sycl-work-group-size-hint currently only "
            "support up to three values");
 
@@ -384,6 +386,16 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) {
     for (StringRef ValStr : ValStrs)
       MDVals.push_back(ConstantAsMetadata::get(
           Constant::getIntegerValue(SizeTTy, APInt(SizeTBitSize, ValStr, 10))));
+    while (MDVals.size() < 3)
+      MDVals.push_back(ConstantAsMetadata::get(
+          Constant::getIntegerValue(SizeTTy, APInt(SizeTBitSize, 1, 10))));
+
+    if (NumDims < 3) {
+      if (!F.hasMetadata("work_group_num_dim"))
+        F.setMetadata("work_group_num_dim",
+                      MDNode::get(Ctx, ConstantAsMetadata::get(ConstantInt::get(
+                                           Type::getInt32Ty(Ctx), NumDims))));
+    }
 
     const char *MDName = (AttrKindStr == "sycl-work-group-size")
                              ? "reqd_work_group_size"
@@ -588,9 +600,13 @@ PreservedAnalyses CompileTimePropertiesPass::run(Module &M,
   }
 
   // Process all properties on kernels.
+  TargetHelpers::KernelCache HIPCUDAKCache;
+  HIPCUDAKCache.populateKernels(M);
+
   for (Function &F : M) {
     // Only consider kernels.
-    if (F.getCallingConv() != CallingConv::SPIR_KERNEL)
+    if (F.getCallingConv() != CallingConv::SPIR_KERNEL &&
+        !HIPCUDAKCache.isKernel(F))
       continue;
 
     // Compile time properties on kernel arguments