intel
diff --git a/‎llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp
Lines changed: 8 additions & 0 deletions b/‎llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp
Lines changed: 8 additions & 0 deletions
diff --git a/‎llvm/test/tools/sycl-post-link/emit_intel_reqd_sub_group_size.ll
Lines changed: 24 additions & 0 deletions b/‎llvm/test/tools/sycl-post-link/emit_intel_reqd_sub_group_size.ll
Lines changed: 24 additions & 0 deletions
diff --git a/‎sycl/test-e2e/SubGroup/attributes.cpp
Lines changed: 9 additions & 134 deletions b/‎sycl/test-e2e/SubGroup/attributes.cpp
Lines changed: 9 additions & 134 deletions
diff --git a/‎sycl/test-e2e/SubGroup/attributes_cuda_hip.cpp
Lines changed: 15 additions & 0 deletions b/‎sycl/test-e2e/SubGroup/attributes_cuda_hip.cpp
Lines changed: 15 additions & 0 deletions
diff --git a/‎sycl/test-e2e/SubGroup/attributes_helper.hpp
Lines changed: 165 additions & 0 deletions b/‎sycl/test-e2e/SubGroup/attributes_helper.hpp
Lines changed: 165 additions & 0 deletions
diff --git a/‎unified-runtime/source/adapters/cuda/kernel.cpp
Lines changed: 9 additions & 4 deletions b/‎unified-runtime/source/adapters/cuda/kernel.cpp
Lines changed: 9 additions & 4 deletions
@@ -314,6 +314,14 @@ PropSetRegTy computeModuleProperties(const Module &M,
                     KernelReqdWorkGroupSize);
       }
 
+      if (auto ReqdSubGroupSize = getKernelSingleEltMetadata<uint32_t>(
+              Func, "intel_reqd_sub_group_size")) {
+        // intel_reqd_sub_group_size is stored as i32.
+        MetadataNames.push_back(Func.getName().str() + "@reqd_sub_group_size");
+        PropSet.add(PropSetRegTy::SYCL_PROGRAM_METADATA, MetadataNames.back(),
+                    *ReqdSubGroupSize);
+      }
+
       if (auto WorkGroupNumDim = getKernelSingleEltMetadata<uint32_t>(
               Func, "work_group_num_dim")) {
         MetadataNames.push_back(Func.getName().str() + "@work_group_num_dim");
 
@@ -0,0 +1,24 @@
+; This test checks that the sycl-post-link tool correctly handles
+; intel_reqd_sub_group_size metadata.
+
+; RUN: sycl-post-link -properties -emit-program-metadata -device-globals -S < %s -o %t.files.table
+; RUN: FileCheck %s -input-file=%t.files.table --check-prefixes CHECK-TABLE
+; RUN: FileCheck %s -input-file=%t.files_0.prop --match-full-lines --check-prefixes CHECK-PROP
+
+target triple = "amdgcn-amd-amdhsa"
+
+!0 = !{i32 64}
+
+define weak_odr amdgpu_kernel void @_ZTS7Kernel1(float %arg1) !intel_reqd_sub_group_size !0 {
+  call void @foo(float %arg1)
+  ret void
+}
+
+declare void @foo(float)
+
+; CHECK-PROP: [SYCL/program metadata]
+; CHECK-PROP-NEXT: _ZTS7Kernel1@reqd_sub_group_size=1|64
+
+; CHECK-TABLE: [Code|Properties]
+; CHECK-TABLE-NEXT: {{.*}}files_0.prop
+; CHECK-TABLE-EMPTY:
@@ -1,10 +1,13 @@
-// TODO: Despite using a supported required subgroup size compile_sub_group_size
-// reports as 0 on cuda and hip
-// XFAIL: target-nvidia || target-amd
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14357
+// UNSUPPORTED: target-amd, target-nvidia
+// UNSUPPORTED-INTENDED: This test is not meant to be run on CUDA/HIP; those
+// backends are covered by `attributes_cuda_hip.cpp` instead. The split is
+// needed because the CI builds each test only once for all available devices,
+// which does not work here: GPU targets check the required sub-group size at
+// compile time and error out if it is not correct.
 
 // RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
 // RUN: %{run} %t.out
+
 //==------- attributes.cpp - SYCL sub_group attributes test ----*- C++ -*---==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -13,134 +16,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "helper.hpp"
-
-#define KERNEL_FUNCTOR_WITH_SIZE(SIZE)                                         \
-  class KernelFunctor##SIZE {                                                  \
-  public:                                                                      \
-    [[sycl::reqd_sub_group_size(SIZE)]] void                                   \
-    operator()(sycl::nd_item<1> Item) const {                                  \
-      const auto GID = Item.get_global_id();                                   \
-    }                                                                          \
-  };
-
-KERNEL_FUNCTOR_WITH_SIZE(1);
-KERNEL_FUNCTOR_WITH_SIZE(2);
-KERNEL_FUNCTOR_WITH_SIZE(4);
-KERNEL_FUNCTOR_WITH_SIZE(8);
-KERNEL_FUNCTOR_WITH_SIZE(16);
-KERNEL_FUNCTOR_WITH_SIZE(32);
-KERNEL_FUNCTOR_WITH_SIZE(64);
-
-#undef KERNEL_FUNCTOR_WITH_SIZE
-
-inline uint32_t flp2(uint32_t X) {
-  X = X | (X >> 1);
-  X = X | (X >> 2);
-  X = X | (X >> 4);
-  X = X | (X >> 8);
-  X = X | (X >> 16);
-  return X - (X >> 1);
-}
-
-template <typename Fn> inline void submit(sycl::queue &Q) {
-  Q.submit([](sycl::handler &cgh) {
-    Fn F;
-    cgh.parallel_for(sycl::nd_range<1>{64, 16}, F);
-  });
-}
-
-int main() {
-  queue Queue;
-  device Device = Queue.get_device();
-
-  try {
-    const auto SGSizes = Device.get_info<info::device::sub_group_sizes>();
-
-    for (const auto SGSize : SGSizes) {
-      // Get the previous power of 2
-      auto ReqdSize = flp2(SGSize);
-
-      std::cout << "Run for " << ReqdSize << " required workgroup size.\n";
-
-      // Store the `sycl::kernel` into a vector because `sycl::kernel`
-      // doesn't have default constructor
-      std::vector<sycl::kernel> TheKernel;
-
-      switch (ReqdSize) {
-      case 64: {
-        auto KernelID = sycl::get_kernel_id<KernelFunctor64>();
-        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-            Queue.get_context(), {KernelID});
-        TheKernel.push_back(KB.get_kernel(KernelID));
-        submit<KernelFunctor64>(Queue);
-        break;
-      }
-      case 32: {
-        auto KernelID = sycl::get_kernel_id<KernelFunctor32>();
-        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-            Queue.get_context(), {KernelID});
-        TheKernel.push_back(KB.get_kernel(KernelID));
-        submit<KernelFunctor32>(Queue);
-        break;
-      }
-      case 16: {
-        auto KernelID = sycl::get_kernel_id<KernelFunctor16>();
-        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-            Queue.get_context(), {KernelID});
-        TheKernel.push_back(KB.get_kernel(KernelID));
-        submit<KernelFunctor16>(Queue);
-        break;
-      }
-      case 8: {
-        auto KernelID = sycl::get_kernel_id<KernelFunctor8>();
-        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-            Queue.get_context(), {KernelID});
-        TheKernel.push_back(KB.get_kernel(KernelID));
-        submit<KernelFunctor8>(Queue);
-        break;
-      }
-      case 4: {
-        auto KernelID = sycl::get_kernel_id<KernelFunctor4>();
-        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-            Queue.get_context(), {KernelID});
-        TheKernel.push_back(KB.get_kernel(KernelID));
-        submit<KernelFunctor4>(Queue);
-        break;
-      }
-      case 2: {
-        auto KernelID = sycl::get_kernel_id<KernelFunctor2>();
-        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-            Queue.get_context(), {KernelID});
-        TheKernel.push_back(KB.get_kernel(KernelID));
-        submit<KernelFunctor2>(Queue);
-        break;
-      }
-      case 1: {
-        auto KernelID = sycl::get_kernel_id<KernelFunctor1>();
-        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-            Queue.get_context(), {KernelID});
-        TheKernel.push_back(KB.get_kernel(KernelID));
-        submit<KernelFunctor1>(Queue);
-        break;
-      }
-      default:
-        throw sycl::exception(sycl::errc::feature_not_supported,
-                              "sub-group size is not supported");
-      }
-
-      auto Kernel = TheKernel[0];
-
-      auto Res = Kernel.get_info<
-          sycl::info::kernel_device_specific::compile_sub_group_size>(Device);
-
-      exit_if_not_equal<size_t>(Res, ReqdSize, "compile_sub_group_size");
-    }
-  } catch (exception e) {
-    std::cout << "SYCL exception caught: " << e.what();
-    return 1;
-  }
+#include "attributes_helper.hpp"
 
-  std::cout << "Test passed.\n";
-  return 0;
-}
+int main() { return runTests(); }
@@ -0,0 +1,15 @@
+// REQUIRES: cuda || hip
+// RUN: %{build} -DBUILD_FOR_GPU -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+//==- attributes_cuda_hip.cpp - SYCL sub_group attributes test -*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "attributes_helper.hpp"
+
+int main() { return runTests(); }
@@ -0,0 +1,165 @@
+//==- attributes_helper.hpp - SYCL sub_group attributes helper -*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helper.hpp"
+
+#define KERNEL_FUNCTOR_WITH_SIZE(SIZE)                                         \
+  class KernelFunctor##SIZE {                                                  \
+  public:                                                                      \
+    [[sycl::reqd_sub_group_size(SIZE)]] void                                   \
+    operator()(sycl::nd_item<1> Item) const {                                  \
+      const auto GID = Item.get_global_id();                                   \
+    }                                                                          \
+  };
+
+// Dummy kernel, so we get the types and can keep later code straight-lined.
+#define DUMMY_KERNEL_FUNCTOR(SIZE)                                             \
+  class KernelFunctor##SIZE {                                                  \
+  public:                                                                      \
+    void operator()(sycl::nd_item<1> Item) const {                             \
+      const auto GID = Item.get_global_id();                                   \
+    }                                                                          \
+  };
+
+#ifdef BUILD_FOR_GPU
+DUMMY_KERNEL_FUNCTOR(1);
+DUMMY_KERNEL_FUNCTOR(2);
+DUMMY_KERNEL_FUNCTOR(4);
+DUMMY_KERNEL_FUNCTOR(8);
+DUMMY_KERNEL_FUNCTOR(16);
+KERNEL_FUNCTOR_WITH_SIZE(32);
+DUMMY_KERNEL_FUNCTOR(64);
+#else
+KERNEL_FUNCTOR_WITH_SIZE(1);
+KERNEL_FUNCTOR_WITH_SIZE(2);
+KERNEL_FUNCTOR_WITH_SIZE(4);
+KERNEL_FUNCTOR_WITH_SIZE(8);
+KERNEL_FUNCTOR_WITH_SIZE(16);
+KERNEL_FUNCTOR_WITH_SIZE(32);
+KERNEL_FUNCTOR_WITH_SIZE(64);
+#endif
+
+#undef KERNEL_FUNCTOR_WITH_SIZE
+
+inline uint32_t flp2(uint32_t X) {
+  X = X | (X >> 1);
+  X = X | (X >> 2);
+  X = X | (X >> 4);
+  X = X | (X >> 8);
+  X = X | (X >> 16);
+  return X - (X >> 1);
+}
+
+template <typename Fn> inline void submit(sycl::queue &Q) {
+  Q.submit([](sycl::handler &cgh) {
+    Fn F;
+    cgh.parallel_for(sycl::nd_range<1>{64, 16}, F);
+  });
+}
+
+// For each sub-group size the device reports, queries compile_sub_group_size
+// of the matching kernel. Returns 0 on success, 1 on a caught SYCL exception.
+int runTests() {
+  queue Queue;
+  device Device = Queue.get_device();
+
+  try {
+    const auto SGSizes = Device.get_info<info::device::sub_group_sizes>();
+
+    for (const auto SGSize : SGSizes) {
+      // Round down to a power of 2 (the largest power of 2 <= SGSize).
+      auto ReqdSize = flp2(SGSize);
+
+      std::cout << "Run for " << ReqdSize << " required sub-group size.\n";
+
+      // Store the `sycl::kernel` in a vector because `sycl::kernel` has no
+      // default constructor and is only known inside the switch below.
+      std::vector<sycl::kernel> TheKernel;
+
+      switch (ReqdSize) {
+      case 64: {
+        auto KernelID = sycl::get_kernel_id<KernelFunctor64>();
+        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+            Queue.get_context(), {KernelID});
+        TheKernel.push_back(KB.get_kernel(KernelID));
+        submit<KernelFunctor64>(Queue);
+        break;
+      }
+      case 32: {
+        auto KernelID = sycl::get_kernel_id<KernelFunctor32>();
+        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+            Queue.get_context(), {KernelID});
+        TheKernel.push_back(KB.get_kernel(KernelID));
+        submit<KernelFunctor32>(Queue);
+        break;
+      }
+      case 16: {
+        auto KernelID = sycl::get_kernel_id<KernelFunctor16>();
+        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+            Queue.get_context(), {KernelID});
+        TheKernel.push_back(KB.get_kernel(KernelID));
+        submit<KernelFunctor16>(Queue);
+        break;
+      }
+      case 8: {
+        auto KernelID = sycl::get_kernel_id<KernelFunctor8>();
+        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+            Queue.get_context(), {KernelID});
+        TheKernel.push_back(KB.get_kernel(KernelID));
+        submit<KernelFunctor8>(Queue);
+        break;
+      }
+      case 4: {
+        auto KernelID = sycl::get_kernel_id<KernelFunctor4>();
+        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+            Queue.get_context(), {KernelID});
+        TheKernel.push_back(KB.get_kernel(KernelID));
+        submit<KernelFunctor4>(Queue);
+        break;
+      }
+      case 2: {
+        auto KernelID = sycl::get_kernel_id<KernelFunctor2>();
+        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+            Queue.get_context(), {KernelID});
+        TheKernel.push_back(KB.get_kernel(KernelID));
+        submit<KernelFunctor2>(Queue);
+        break;
+      }
+      case 1: {
+        auto KernelID = sycl::get_kernel_id<KernelFunctor1>();
+        auto KB = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
+            Queue.get_context(), {KernelID});
+        TheKernel.push_back(KB.get_kernel(KernelID));
+        submit<KernelFunctor1>(Queue);
+        break;
+      }
+      default:
+        throw sycl::exception(sycl::errc::feature_not_supported,
+                              "sub-group size is not supported");
+      }
+
+      auto Res = TheKernel[0].get_info<
+          sycl::info::kernel_device_specific::compile_sub_group_size>(Device);
+
+#ifdef BUILD_FOR_GPU
+      // GPU builds only attach reqd_sub_group_size to the size-32 kernel; for
+      // the dummy kernels expect 0 so the code paths don't diverge.
+      if (ReqdSize != 32)
+        ReqdSize = 0;
+#endif
+
+      exit_if_not_equal<size_t>(Res, ReqdSize, "compile_sub_group_size");
+    }
+  } catch (const exception &e) {
+    std::cout << "SYCL exception caught: " << e.what() << "\n";
+    return 1;
+  }
+
+  std::cout << "Test passed.\n";
+  return 0;
+}
@@ -339,10 +339,15 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
     return ReturnValue(0);
   }
   case UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL: {
-    // Return value of 0 => unspecified or "auto" sub-group size
-    // Correct for now, since warp size may be read from special register
-    // TODO: Return warp size once default is primary sub-group size
-    // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX
+    const auto &KernelReqdSubGroupSizeMap =
+        hKernel->getProgram()->KernelReqdSubGroupSizeMD;
+    // If present, return the value of intel_reqd_sub_group_size metadata, if
+    // not: 0, which stands for unspecified or auto sub-group size.
+    if (auto KernelReqdSubGroupSize =
+            KernelReqdSubGroupSizeMap.find(hKernel->getName());
+        KernelReqdSubGroupSize != KernelReqdSubGroupSizeMap.end())
+      return ReturnValue(KernelReqdSubGroupSize->second);
+
     return ReturnValue(0);
   }
   default: