[NVPTX][AMDGPU] Move annotation creation out of clang (#14634)

frasercrmck · web-flow · commit 3d73d9b367a8 · 2024-08-05T10:03:53.000-07:00
This patch refactors the way we lower SYCL attributes and properties to
NVVM annotations, through function attributes and metadata. It unifies
the flow better with the SPIR-V paths at the same time.

Previously we had:

1. Clang handling function attributes in two places:
1. CodeGenFunction did generic lowering of attributes to function
metadata
2. NVPTXTargetCodeGenInfo did its own additional lowering of attributes
to NVVM annotations
2. Kernel properties being handled in clang. NVPTXTargetCodeGenInfo
lowered kernel properties, which had already been converted to function
attributes, to NVVM annotations.
3. Kernel properties for HIP/CUDA *not* being turned into function
metadata. Because function metadata is how the ComputeModuleRuntimeInfo
library creates runtime info, this meant that target backends couldn't
act on properties, as they were lost during lowering.

Fundamentally, clang is not the correct place for handling SYCL kernel
properties as it unnecessarily touches the front-end for what is really
library code.

With this patch, we now have:

1. Clang handles function attributes in one place, lowering to function
metadata
2. SYCLLowerIR lowers kernel properties to the same function metadata:
now working for HIP and CUDA kernels too. Because function metadata is
consistently present for kernel properties, they behave as expected on
HIP/CUDA targets now.
3. A new pass converts function metadata to NVVM annotations, replacing
the code we've taken out of clang.
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
@@ -56,6 +56,7 @@
 #include "llvm/SYCLLowerIR/RecordSYCLAspectNames.h"
 #include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h"
 #include "llvm/SYCLLowerIR/SYCLConditionalCallOnDevice.h"
+#include "llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateJointMatrixUsage.h"
 #include "llvm/SYCLLowerIR/SYCLVirtualFunctionsAnalysis.h"
@@ -1151,6 +1152,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
       // and before cleaning up metadata)
       MPM.addPass(RecordSYCLAspectNamesPass());
 
+      if (TargetTriple.isNVPTX())
+        MPM.addPass(SYCLCreateNVVMAnnotationsPass());
+
       // Remove SYCL metadata added by the frontend, like sycl_aspects
       // Note, this pass should be at the end of the pipeline
       MPM.addPass(CleanupSYCLMetadataPass());
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -291,80 +291,6 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
           addNVVMMetadata(F, "grid_constant", GridConstantParamIdxs);
       }
     }
-    bool HasMaxWorkGroupSize = false;
-    bool HasMinWorkGroupPerCU = false;
-    if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
-      HasMaxWorkGroupSize = true;
-      // We must index-flip between SYCL's notation, X,Y,Z (aka dim0,dim1,dim2)
-      // with the fastest-moving dimension rightmost, to CUDA's, where X is the
-      // fastest-moving dimension.
-      addNVVMMetadata(F, "maxntidx", MWGS->getZDimVal());
-      addNVVMMetadata(F, "maxntidy", MWGS->getYDimVal());
-      addNVVMMetadata(F, "maxntidz", MWGS->getXDimVal());
-    }
-
-    if (const auto *RWGS = FD->getAttr<SYCLReqdWorkGroupSizeAttr>()) {
-      llvm::SmallVector<std::optional<int64_t>, 3> Ops;
-      // Index-flip and pad out any missing elements. Note the misleading
-      // nomenclature of the methods: getXDimVal doesn't return the X dimension;
-      // it returns the left-most dimension (dim0). This could correspond to
-      // CUDA's X, Y, or Z, depending on the number of operands provided.
-      if (auto Dim0 = RWGS->getXDimVal())
-        Ops.push_back(Dim0->getExtValue());
-      if (auto Dim1 = RWGS->getYDimVal())
-        Ops.push_back(Dim1->getExtValue());
-      if (auto Dim2 = RWGS->getZDimVal())
-        Ops.push_back(Dim2->getExtValue());
-      std::reverse(Ops.begin(), Ops.end());
-      Ops.append(3 - Ops.size(), std::nullopt);
-
-      // Work-group sizes (in NVVM annotations) must be positive and less than
-      // INT32_MAX, whereas SYCL can allow for larger work-group sizes (see
-      // -fno-sycl-id-queries-fit-in-int). If any dimension is too large for
-      // NVPTX, don't emit any annotation at all.
-      if (llvm::all_of(Ops, [](std::optional<int64_t> V) {
-            return !V || llvm::isUInt<31>(*V);
-          })) {
-        if (auto X = Ops[0])
-          addNVVMMetadata(F, "reqntidx", *X);
-        if (auto Y = Ops[1])
-          addNVVMMetadata(F, "reqntidy", *Y);
-        if (auto Z = Ops[2])
-          addNVVMMetadata(F, "reqntidz", *Z);
-      }
-    }
-
-    auto attrValue = [&](Expr *E) {
-      const auto *CE = cast<ConstantExpr>(E);
-      std::optional<llvm::APInt> Val = CE->getResultAsAPSInt();
-      return Val->getZExtValue();
-    };
-
-    if (const auto *MWGPCU =
-            FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
-      if (!HasMaxWorkGroupSize && FD->hasAttr<OpenCLKernelAttr>()) {
-        M.getDiags().Report(D->getLocation(),
-                            diag::warn_launch_bounds_missing_attr)
-            << MWGPCU << 0;
-      } else {
-        // The value is guaranteed to be > 0, pass it to the metadata.
-        addNVVMMetadata(F, "minctasm", attrValue(MWGPCU->getValue()));
-        HasMinWorkGroupPerCU = true;
-      }
-    }
-
-    if (const auto *MWGPMP =
-            FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
-      if ((!HasMaxWorkGroupSize || !HasMinWorkGroupPerCU) &&
-          FD->hasAttr<OpenCLKernelAttr>()) {
-        M.getDiags().Report(D->getLocation(),
-                            diag::warn_launch_bounds_missing_attr)
-            << MWGPMP << 1;
-      } else {
-        // The value is guaranteed to be > 0, pass it to the metadata.
-        addNVVMMetadata(F, "maxclusterrank", attrValue(MWGPMP->getValue()));
-      }
-    }
   }
 
   // Perform special handling in CUDA mode.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -4089,11 +4089,17 @@ bool static check32BitInt(const Expr *E, Sema &S, llvm::APSInt &I,
 
 void Sema::AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(
     Decl *D, const AttributeCommonInfo &CI, Expr *E) {
-  if (Context.getLangOpts().SYCLIsDevice &&
-      !Context.getTargetInfo().getTriple().isNVPTX()) {
-    Diag(E->getBeginLoc(), diag::warn_launch_bounds_is_cuda_specific)
-        << CI << E->getSourceRange();
-    return;
+  if (Context.getLangOpts().SYCLIsDevice) {
+    if (!Context.getTargetInfo().getTriple().isNVPTX()) {
+      Diag(E->getBeginLoc(), diag::warn_launch_bounds_is_cuda_specific)
+          << CI << E->getSourceRange();
+      return;
+    }
+
+    if (!D->hasAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
+      Diag(CI.getLoc(), diag::warn_launch_bounds_missing_attr) << CI << 0;
+      return;
+    }
   }
   if (!E->isValueDependent()) {
     // Validate that we have an integer constant expression and then store the
@@ -4154,6 +4160,12 @@ void Sema::AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
           << CudaArchToString(SM) << CI << E->getSourceRange();
       return;
     }
+
+    if (!D->hasAttr<SYCLIntelMaxWorkGroupSizeAttr>() ||
+        !D->hasAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
+      Diag(CI.getLoc(), diag::warn_launch_bounds_missing_attr) << CI << 1;
+      return;
+    }
   }
   if (!E->isValueDependent()) {
     // Validate that we have an integer constant expression and then store the
diff --git a/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp
@@ -48,37 +48,6 @@ int main() {
 // CHECK: define dso_local void @{{.*}}kernel_name2() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]]
 // CHECK: define dso_local void @{{.*}}kernel_name3() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM]] !max_work_group_size ![[MWGS_2:[0-9]+]]
 
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 8}
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidy", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidz", i32 2}
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"minctasm", i32 2}
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 8}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidy", i32 4}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidz", i32 2}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minctasm", i32 2}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 8}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidy", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidz", i32 2}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"minctasm", i32 2}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 8}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidy", i32 4}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidz", i32 2}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"minctasm", i32 2}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 8}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidy", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidz", i32 6}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"minctasm", i32 6}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxclusterrank", i32 6}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 8}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidy", i32 4}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidz", i32 6}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minctasm", i32 6}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxclusterrank", i32 6}
-
 // CHECK: ![[MWGPC]] = !{i32 2}
 // CHECK: ![[MWGPM]] = !{i32 4}
 // CHECK: ![[MWGS]] = !{i32 8, i32 4, i32 2}
diff --git a/clang/test/CodeGenSYCL/reqd-work-group-size.cpp b/clang/test/CodeGenSYCL/reqd-work-group-size.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-NVPTX
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx64-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-NVPTX
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx64-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
 
 #include "sycl.hpp"
 
@@ -123,55 +123,6 @@ int main() {
 // CHECK: define {{.*}} void @{{.*}}kernel_name22() #0 {{.*}} !work_group_num_dim ![[NDRWGS1D:[0-9]+]] !reqd_work_group_size ![[WGSIZE1D22:[0-9]+]]
 // CHECK: define {{.*}} void @{{.*}}kernel_name24() #0 {{.*}} !work_group_num_dim ![[NDRWGS1D:[0-9]+]] !reqd_work_group_size ![[WGSIZE1D2:[0-9]+]]
 
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name1, !"reqntidx", i32 16}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name1, !"reqntidy", i32 16}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name1, !"reqntidz", i32 32}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name3, !"reqntidx", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name3, !"reqntidy", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name3, !"reqntidz", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name4, !"reqntidx", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name4, !"reqntidy", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name4, !"reqntidz", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name6, !"reqntidx", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name6, !"reqntidy", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name6, !"reqntidz", i32 1}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name7, !"reqntidx", i32 16}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name7, !"reqntidy", i32 16}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name7, !"reqntidz", i32 32}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name9, !"reqntidx", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name9, !"reqntidy", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name9, !"reqntidz", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name10, !"reqntidx", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name10, !"reqntidy", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name10, !"reqntidz", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name12, !"reqntidx", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name12, !"reqntidy", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name12, !"reqntidz", i32 1}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name13, !"reqntidx", i32 16}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name13, !"reqntidy", i32 32}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name13, !"reqntidz"
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name15, !"reqntidx", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name15, !"reqntidy", i32 8}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name15, !"reqntidz"
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name16, !"reqntidx", i32 2}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name16, !"reqntidy", i32 2}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name16, !"reqntidz"
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name18, !"reqntidx", i32 8}
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name18, !"reqntidy", i32 1}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name18, !"reqntidz"
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name19, !"reqntidx", i32 32}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name19, !"reqntidy",
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name19, !"reqntidz",
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name21, !"reqntidx", i32 8}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name21, !"reqntidy",
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name21, !"reqntidz",
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name22, !"reqntidx", i32 2}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name22, !"reqntidy",
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name22, !"reqntidz",
-// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name24, !"reqntidx", i32 1}
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name24, !"reqntidy",
-// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name24, !"reqntidz",
-
 // CHECK: ![[NDRWGS3D]] = !{i32 3}
 // CHECK: ![[WGSIZE3D32]] = !{i32 16, i32 16, i32 32}
 // CHECK: ![[WGSIZE3D88]] = !{i32 8, i32 8, i32 8}
diff --git a/clang/test/SemaSYCL/lb_sm_70.cpp b/clang/test/SemaSYCL/lb_sm_70.cpp
@@ -48,9 +48,14 @@ int main() {
               intel::min_work_groups_per_cu(4),
               intel::min_work_groups_per_cu(8)]] () { volatile int A = 42; });
 
-    // expected-error@+2 {{'min_work_groups_per_cu' attribute requires a non-negative integral compile time constant expression}}
+    // expected-warning@+2 {{'min_work_groups_per_cu' attribute ignored, as it requires: maximum work group size to be also specified}}
     cgh.single_task<class T5>(
-        [=] [[intel::min_work_groups_per_cu(-8)]] () { volatile int A = 42; });
+        [=] [[intel::min_work_groups_per_cu(8)]] () { volatile int A = 42; });
+
+    // expected-error@+3 {{'min_work_groups_per_cu' attribute requires a non-negative integral compile time constant expression}}
+    cgh.single_task<class T5>(
+        [=] [[intel::max_work_group_size(1, 1, 256),
+              intel::min_work_groups_per_cu(-8)]] () { volatile int A = 42; });
   });
 
   Q.submit([&](sycl::handler &cgh) {
diff --git a/clang/test/SemaSYCL/lb_sm_90.cpp b/clang/test/SemaSYCL/lb_sm_90.cpp
@@ -15,9 +15,9 @@ template <int N1, int N2, int N3> class Functor {
   operator()() const {}
 };
 
-// expected-warning@+1 {{'max_work_groups_per_mp' attribute ignored, as it requires: maximum work group size and minimum work groups per compute unit to be also specified}}
 template <int N1, int N2> class Functor_2 {
 public:
+// expected-warning@+2 {{'max_work_groups_per_mp' attribute ignored, as it requires: maximum work group size and minimum work groups per compute unit to be also specified}}
   [[intel::max_work_group_size(1, 1, N1),
     intel::max_work_groups_per_mp(N2)]] void
   operator()() const {}
@@ -32,7 +32,7 @@ int main() {
               intel::min_work_groups_per_cu(2),
               intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; });
 
-    // expected-warning@+2 {{'max_work_groups_per_mp' attribute ignored, as it requires: maximum work group size and minimum work groups per compute unit to be also specified}}
+    // expected-warning@+3 {{'max_work_groups_per_mp' attribute ignored, as it requires: maximum work group size and minimum work groups per compute unit to be also specified}}
     cgh.single_task<class T2>(
         [=] [[intel::max_work_group_size(1, 1, 256),
               intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; });
diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h b/llvm/include/llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h
@@ -0,0 +1,30 @@
+//===- SYCLCreateNVVMAnnotations.h - SYCLCreateNVVMAnnotationsPass --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers function metadata to NVVM annotations
+//
+//===----------------------------------------------------------------------===//
+//
+#ifndef LLVM_SYCL_CREATE_NVVM_ANNOTATIONS_H
+#define LLVM_SYCL_CREATE_NVVM_ANNOTATIONS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class SYCLCreateNVVMAnnotationsPass
+    : public PassInfoMixin<SYCLCreateNVVMAnnotationsPass> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
+
+  static bool isRequired() { return true; }
+};
+
+} // namespace llvm
+
+#endif // LLVM_SYCL_CREATE_NVVM_ANNOTATIONS_H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
@@ -128,6 +128,7 @@
 #include "llvm/SYCLLowerIR/RecordSYCLAspectNames.h"
 #include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h"
 #include "llvm/SYCLLowerIR/SYCLConditionalCallOnDevice.h"
+#include "llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateJointMatrixUsage.h"
 #include "llvm/SYCLLowerIR/SYCLVirtualFunctionsAnalysis.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
@@ -160,6 +160,7 @@ MODULE_PASS("sycl-propagate-joint-matrix-usage", SYCLPropagateJointMatrixUsagePa
 MODULE_PASS("sycl-add-opt-level-attribute", SYCLAddOptLevelAttributePass())
 MODULE_PASS("compile-time-properties", CompileTimePropertiesPass())
 MODULE_PASS("cleanup-sycl-metadata", CleanupSYCLMetadataPass())
+MODULE_PASS("sycl-create-nvvm-annotations", SYCLCreateNVVMAnnotationsPass())
 MODULE_PASS("lower-slm-reservation-calls", ESIMDLowerSLMReservationCalls())
 MODULE_PASS("record-sycl-aspect-names", RecordSYCLAspectNamesPass())
 MODULE_PASS("sycl-virtual-functions-analysis",
diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt
@@ -60,6 +60,7 @@ add_llvm_component_library(LLVMSYCLLowerIR
   SpecConstants.cpp
   SYCLAddOptLevelAttribute.cpp
   SYCLConditionalCallOnDevice.cpp
+  SYCLCreateNVVMAnnotations.cpp
   SYCLDeviceLibReqMask.cpp
   SYCLDeviceRequirements.cpp
   SYCLKernelParamOptInfo.cpp
diff --git a/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp b/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp
@@ -12,6 +12,7 @@
 #include "llvm/SYCLLowerIR/DeviceGlobals.h"
 #include "llvm/SYCLLowerIR/ESIMD/ESIMDUtils.h"
 #include "llvm/SYCLLowerIR/HostPipes.h"
+#include "llvm/SYCLLowerIR/TargetHelpers.h"
 
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/StringMap.h"
@@ -367,7 +368,8 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) {
     SmallVector<StringRef, 3> ValStrs;
     Attr.getValueAsString().split(ValStrs, ',');
 
-    assert(ValStrs.size() <= 3 &&
+    size_t NumDims = ValStrs.size();
+    assert(NumDims <= 3 &&
            "sycl-work-group-size and sycl-work-group-size-hint currently only "
            "support up to three values");
 
@@ -384,6 +386,16 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) {
     for (StringRef ValStr : ValStrs)
       MDVals.push_back(ConstantAsMetadata::get(
           Constant::getIntegerValue(SizeTTy, APInt(SizeTBitSize, ValStr, 10))));
+    while (MDVals.size() < 3)
+      MDVals.push_back(ConstantAsMetadata::get(
+          Constant::getIntegerValue(SizeTTy, APInt(SizeTBitSize, 1, 10))));
+
+    if (NumDims < 3) {
+      if (!F.hasMetadata("work_group_num_dim"))
+        F.setMetadata("work_group_num_dim",
+                      MDNode::get(Ctx, ConstantAsMetadata::get(ConstantInt::get(
+                                           Type::getInt32Ty(Ctx), NumDims))));
+    }
 
     const char *MDName = (AttrKindStr == "sycl-work-group-size")
                              ? "reqd_work_group_size"
@@ -588,9 +600,13 @@ PreservedAnalyses CompileTimePropertiesPass::run(Module &M,
   }
 
   // Process all properties on kernels.
+  TargetHelpers::KernelCache HIPCUDAKCache;
+  HIPCUDAKCache.populateKernels(M);
+
   for (Function &F : M) {
     // Only consider kernels.
-    if (F.getCallingConv() != CallingConv::SPIR_KERNEL)
+    if (F.getCallingConv() != CallingConv::SPIR_KERNEL &&
+        !HIPCUDAKCache.isKernel(F))
       continue;
 
     // Compile time properties on kernel arguments
diff --git a/llvm/lib/SYCLLowerIR/SYCLCreateNVVMAnnotations.cpp b/llvm/lib/SYCLLowerIR/SYCLCreateNVVMAnnotations.cpp
diff --git a/llvm/test/SYCLLowerIR/CompileTimePropertiesPass/kernel-properties.ll b/llvm/test/SYCLLowerIR/CompileTimePropertiesPass/kernel-properties.ll
diff --git a/llvm/test/SYCLLowerIR/nvvm-annotations.ll b/llvm/test/SYCLLowerIR/nvvm-annotations.ll
diff --git a/sycl/test-e2e/Basic/work_group_size_prop.cpp b/sycl/test-e2e/Basic/work_group_size_prop.cpp
diff --git a/sycl/test-e2e/Graph/Explicit/work_group_size_prop.cpp b/sycl/test-e2e/Graph/Explicit/work_group_size_prop.cpp
diff --git a/sycl/test-e2e/Graph/RecordReplay/work_group_size_prop.cpp b/sycl/test-e2e/Graph/RecordReplay/work_group_size_prop.cpp