[OpenMP] Use generic IR for the OpenMP DeviceRTL #119091


Merged
merged 2 commits into llvm:main
Dec 25, 2024

Conversation

jhuber6 (Contributor) commented Dec 7, 2024

Summary:
We previously built this for every single architecture to deal with incompatibility. This patch updates it to use the 'generic' IR that libc and other projects use. Who knows if this will have any side effects; it is probably worth more testing, but it passes the tests I expect to pass on my side.
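In concrete terms, the switch to generic IR replaces preprocessor guards such as #if __CUDA_ARCH__ >= 700, which bake one architecture into the emitted bitcode, with __nvvm_reflect queries that survive into the IR and are folded to constants once the real subtarget is known at (LTO) link time. A minimal sketch of the pattern, mirroring the Reduction.cpp change in the diff below; the helper name useVoltaSyncPath is hypothetical, for illustration only:

// Sketch: one generic bitcode library can serve every NVPTX GPU because the
// architecture check is deferred to the NVVMReflect pass rather than being
// decided by the preprocessor when the DeviceRTL itself is built.
static bool useVoltaSyncPath() {
#if __has_builtin(__nvvm_reflect)
  // Folded to a compile-time constant once the subtarget is known.
  return __nvvm_reflect("__CUDA_ARCH") >= 700;
#else
  return false;
#endif
}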

@llvmbot added the clang, clang:driver, and offload labels on Dec 7, 2024
llvmbot (Member) commented Dec 7, 2024

@llvm/pr-subscribers-clang-driver
@llvm/pr-subscribers-offload

@llvm/pr-subscribers-clang

Author: Joseph Huber (jhuber6)


Full diff: https://github.com/llvm/llvm-project/pull/119091.diff

4 Files Affected:

  • (modified) clang/lib/Driver/ToolChains/CommonArgs.cpp (+1-2)
  • (modified) clang/lib/Driver/ToolChains/Cuda.cpp (-1)
  • (modified) offload/DeviceRTL/CMakeLists.txt (+16-54)
  • (modified) offload/DeviceRTL/src/Reduction.cpp (+47-47)
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 7d3d7f8f03c491..3dd90ecf8bca4c 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2800,8 +2800,7 @@ void tools::addOpenMPDeviceRTL(const Driver &D,
                         : options::OPT_libomptarget_nvptx_bc_path_EQ;
 
   StringRef ArchPrefix = Triple.isAMDGCN() ? "amdgpu" : "nvptx";
-  std::string LibOmpTargetName =
-      ("libomptarget-" + ArchPrefix + "-" + BitcodeSuffix + ".bc").str();
+  std::string LibOmpTargetName = ("libomptarget-" + ArchPrefix + ".bc").str();
 
   // First check whether user specifies bc library
   if (const Arg *A = DriverArgs.getLastArg(LibomptargetBCPathOpt)) {
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 102794829795da..214f1e5d83478f 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -851,7 +851,6 @@ void CudaToolChain::addClangTargetOptions(
   HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
 
   StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
-  assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
   assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
           DeviceOffloadingKind == Action::OFK_Cuda) &&
          "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 1bf3eb9da38aa1..cda633c41062b6 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -42,38 +42,6 @@ set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR})
 set(include_directory ${devicertl_base_directory}/include)
 set(source_directory ${devicertl_base_directory}/src)
 
-set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
-                             "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx950;gfx1010"
-                             "gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035"
-                             "gfx1036;gfx1100;gfx1101;gfx1102;gfx1103;gfx1150"
-                             "gfx1151;gfx1152;gfx1153")
-set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
-                            "sm_70;sm_72;sm_75;sm_80;sm_86;sm_87;sm_89;sm_90")
-set(all_gpu_architectures
-    "${all_amdgpu_architectures};${all_nvptx_architectures}")
-
-set(LIBOMPTARGET_DEVICE_ARCHITECTURES "all" CACHE STRING
-    "List of device architectures to be used to compile the OpenMP DeviceRTL.")
-
-if(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "all")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_gpu_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "amdgpu")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_amdgpu_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "nvptx")
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_nvptx_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
-       LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "native")
-  if(NOT LIBOMPTARGET_NVPTX_ARCH AND NOT LIBOMPTARGET_AMDGPU_ARCH)
-    message(FATAL_ERROR
-      "Could not find 'amdgpu-arch' and 'nvptx-arch' tools required for 'auto'")
-  elseif(NOT LIBOMPTARGET_FOUND_NVIDIA_GPU AND NOT LIBOMPTARGET_FOUND_AMDGPU_GPU)
-    message(FATAL_ERROR "No AMD or NVIDIA GPU found on the system when using 'auto'")
-  endif()
-  set(LIBOMPTARGET_DEVICE_ARCHITECTURES
-      "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
-endif()
-list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
-
 set(include_files
   ${include_directory}/Allocator.h
   ${include_directory}/Configuration.h
@@ -141,20 +109,21 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
 
 # first create an object target
 add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
-function(compileDeviceRTLLibrary target_cpu target_name target_triple)
+function(compileDeviceRTLLibrary target_name target_triple)
   set(target_bc_flags ${ARGN})
 
   set(bc_files "")
   foreach(src ${src_files})
     get_filename_component(infile ${src} ABSOLUTE)
     get_filename_component(outfile ${src} NAME)
-    set(outfile "${outfile}-${target_cpu}.bc")
+    set(outfile "${outfile}-${target_name}.bc")
     set(depfile "${outfile}.d")
 
     add_custom_command(OUTPUT ${outfile}
       COMMAND ${CLANG_TOOL}
       ${bc_flags}
-      --offload-arch=${target_cpu}
+      -fopenmp-targets=${target_triple}
+      -Xopenmp-target=${target_triple} -march=
       ${target_bc_flags}
       -MD -MF ${depfile}
       ${infile} -o ${outfile}
@@ -177,7 +146,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
     list(APPEND bc_files ${outfile})
   endforeach()
 
-  set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
+  set(bclib_name "libomptarget-${target_name}.bc")
 
   # Link to a bitcode library.
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
@@ -217,7 +186,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
       APPEND)
   endif()
 
-  set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc")
+  set(bclib_target_name "omptarget-${target_name}-bc")
   add_custom_target(${bclib_target_name} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
 
   # Copy library to destination.
@@ -239,7 +208,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
   # Package the bitcode in the bitcode and embed it in an ELF for the static library
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
       COMMAND ${PACKAGER_TOOL} -o ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
-        "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=${target_cpu},kind=openmp"
+        "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=generic,kind=openmp"
       DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
       COMMENT "Packaging LLVM offloading binary ${bclib_name}.out"
   )
@@ -249,14 +218,14 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
       APPEND)
   endif()
 
-  set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}-${target_cpu}.o")
+  set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}.o")
   add_custom_command(OUTPUT ${output_name}
     COMMAND ${CLANG_TOOL} --std=c++17 -c -nostdlib
             -Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
             -o ${output_name}
             ${source_directory}/Stub.cpp
     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name} ${source_directory}/Stub.cpp
-    COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}-${target_cpu}.o"
+    COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}.o"
     VERBATIM
   )
   if(TARGET clang)
@@ -269,11 +238,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
   set_property(TARGET omptarget.devicertl.all_objs APPEND PROPERTY IMPORTED_OBJECTS ${output_name})
 
   if (CMAKE_EXPORT_COMPILE_COMMANDS)
-    set(ide_target_name omptarget-ide-${target_name}-${target_cpu})
+    set(ide_target_name omptarget-ide-${target_name})
     add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
     target_compile_options(${ide_target_name} PRIVATE
-      -fopenmp --offload-arch=${target_cpu} -fopenmp-cuda-mode
-      -mllvm -openmp-opt-disable
+      -fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
+      -fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
       -foffload-lto -fvisibility=hidden --offload-device-only
       -nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
     )
@@ -288,18 +257,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
   endif()
 endfunction()
 
-# Generate a Bitcode library for all the gpu architectures the user requested.
-add_custom_target(omptarget.devicertl.nvptx)
 add_custom_target(omptarget.devicertl.amdgpu)
-foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
-  if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
-  elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
-  else()
-    message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
-  endif()
-endforeach()
+compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
+
+add_custom_target(omptarget.devicertl.nvptx)
+compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
 
 # Archive all the object files generated above into a static library
 add_library(omptarget.devicertl STATIC)
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 57df159d3f28e5..34aee7bf2e37c1 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -44,7 +44,6 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
 static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                           ShuffleReductFnTy shflFct) {
   uint32_t size, remote_id, physical_lane_id;
@@ -63,7 +62,6 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
   } while (logical_lane_id % 2 == 0 && size > 1);
   return (logical_lane_id == 0);
 }
-#endif
 
 static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
@@ -74,49 +72,52 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   uint32_t NumThreads = omp_get_num_threads();
   if (NumThreads == 1)
     return 1;
-    /*
-     * This reduce function handles reduction within a team. It handles
-     * parallel regions in both L1 and L2 parallelism levels. It also
-     * supports Generic, SPMD, and NoOMP modes.
-     *
-     * 1. Reduce within a warp.
-     * 2. Warp master copies value to warp 0 via shared memory.
-     * 3. Warp 0 reduces to a single value.
-     * 4. The reduced value is available in the thread that returns 1.
-     */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
-      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();
-
-  // Volta execution model:
-  // For the Generic execution mode a parallel region either has 1 thread and
-  // beyond that, always a multiple of 32. For the SPMD execution mode we may
-  // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
-
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
+  /*
+   * This reduce function handles reduction within a team. It handles
+   * parallel regions in both L1 and L2 parallelism levels. It also
+   * supports Generic, SPMD, and NoOMP modes.
+   *
+   * 1. Reduce within a warp.
+   * 2. Warp master copies value to warp 0 via shared memory.
+   * 3. Warp 0 reduces to a single value.
+   * 4. The reduced value is available in the thread that returns 1.
+   */
+
+#if __has_builtin(__nvvm_reflect)
+  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
+    uint32_t WarpsNeeded =
+        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+    uint32_t WarpId = mapping::getWarpIdInBlock();
+
+    // Volta execution model:
+    // For the Generic execution mode a parallel region either has 1 thread and
+    // beyond that, always a multiple of 32. For the SPMD execution mode we may
+    // have any number of threads.
+    if ((NumThreads % mapping::getWarpSize() == 0) ||
+        (WarpId < WarpsNeeded - 1))
+      gpu_regular_warp_reduce(reduce_data, shflFct);
+    else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+      gpu_irregular_warp_reduce(
+          reduce_data, shflFct,
+          /*LaneCount=*/NumThreads % mapping::getWarpSize(),
+          /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());
+
+    // When we have more than [mapping::getWarpSize()] number of threads
+    // a block reduction is performed here.
+    //
+    // Only L1 parallel region can enter this if condition.
+    if (NumThreads > mapping::getWarpSize()) {
+      // Gather all the reduced values from each warp
+      // to the first warp.
+      cpyFct(reduce_data, WarpsNeeded);
+
+      if (WarpId == 0)
+        gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                  BlockThreadId);
+    }
+    return BlockThreadId == 0;
   }
-  return BlockThreadId == 0;
-#else
+#endif
   __kmpc_impl_lanemask_t Liveness = mapping::activemask();
   if (Liveness == lanes::All) // Full warp
     gpu_regular_warp_reduce(reduce_data, shflFct);
@@ -150,10 +151,9 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
     return BlockThreadId == 0;
   }
 
-  // Get the OMP thread Id. This is different from BlockThreadId in the case of
-  // an L2 parallel region.
+  // Get the OMP thread Id. This is different from BlockThreadId in the case
+  // of an L2 parallel region.
   return BlockThreadId == 0;
-#endif // __CUDA_ARCH__ >= 700
 }
 
 uint32_t roundToWarpsize(uint32_t s) {

github-actions bot commented Dec 7, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

set(depfile "${outfile}.d")

add_custom_command(OUTPUT ${outfile}
  COMMAND ${CLANG_TOOL}
  ${bc_flags}
-  --offload-arch=${target_cpu}
+  -fopenmp-targets=${target_triple}
+  -Xopenmp-target=${target_triple} -march=
Contributor:

and nothing after -march=?

jhuber6 (author):

Intentional

Contributor:

So that means there is no arch?

shiltian (Contributor) left a comment:

I like this method, but just out of curiosity, did we use anything in the AMDGPU implementation that has target-dependent lowering in the front end? If not, this is totally fine, I'd say.

jhuber6 (author) commented Dec 7, 2024

> I like this method, but just out of curiosity, did we use anything in the AMDGPU implementation that has target-dependent lowering in the front end? If not, this is totally fine, I'd say.

We used to use __AMDGCN_WAVEFRONT_SIZE, but that was removed for unrelated reasons.

gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
BlockThreadId);
#if __has_builtin(__nvvm_reflect)
if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
Contributor:

I'll try to make an AMDGPU counterpart for this one, though it doesn't look necessary for the purposes of the OpenMP device runtime.

jhuber6 (author):

I believe @AlexVlx also has interest in that space, so best consult with him as well.

shiltian (Contributor) commented Dec 7, 2024

> I like this method, but just out of curiosity, did we use anything in the AMDGPU implementation that has target-dependent lowering in the front end? If not, this is totally fine, I'd say.
>
> We used to use __AMDGCN_WAVEFRONT_SIZE, but that was removed for unrelated reasons.

And now we don't have any target-specific parts in the AMDGPU implementation? Quite surprised, but happy to hear that.

shiltian commented Dec 7, 2024

Worth a bullet point in the release notes, for both clang and openmp/offload.

@llvmbot added the openmp:libomp and openmp:libomptarget labels on Dec 7, 2024
jhuber6 (author) commented Dec 7, 2024

@hidekisaito Might be relevant to your patch.

shiltian left a comment:

It seems like there are some test failures, but I think the PR looks good.

jhuber6 (author) commented Dec 7, 2024

@ronlieb This will probably conflict a lot with downstream, should probably wait until I've talked it over with others before trying to merge it in AOMP.

ronlieb (Contributor) commented Dec 7, 2024

> @ronlieb This will probably conflict a lot with downstream, should probably wait until I've talked it over with others before trying to merge it in AOMP.

Yes please, let's wait until the internal Monday meeting to discuss. Also, I would like to help with some precommit performance testing using a downstream build.

Comment on lines +262 to +265
compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)

add_custom_target(omptarget.devicertl.nvptx)
compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
Member:

Can we have an option to disable either of amdgpu/nvptx specifically? If anything, because it feels weird having a choice between amdgpu and nvptx plugins, but no choice between devicertl variants.

jhuber6 (author):

I didn't feel like it was strictly necessary, since if you can build one you can build the other. The only thing you save is disk space, but if you feel like it's really needed I can re-use the architectures thing or something.

Member:

I don't exactly feel strongly about it. It's just that we already provide a switch for amdgpu/nvptx in Gentoo, so it only feels natural for this to respect it.

Also, on a semi-related matter: is there a reason we're installing both separate .bc files and libomptarget.devicertl.a?

jhuber6 (author):

This is really over-complicated, but right now we only use the .bc file for non-LTO NVPTX compilations, where it gets put through each TU in a broken way via -mlink-builtin-bitcode. This is because it would get really slow if we didn't optimize out calls to the runtime.

Member:

Ok, so it's not accidental/obsolete — that's all I needed to know :-).

Another thing I've noticed is that clang needs libomptarget.devicertl.a even when no GPU target is actually used. I've worked around that for now by creating an empty .a file locally, but perhaps that should also be handled better somehow (I can make a PR later if you tell me how you'd like it handled).

jhuber6 (author):

I suppose that happens for the CPU targets? Those aren't really used anywhere real, they're just for testing, but even so we probably shouldn't leave them broken. We link the device RTL unconditionally because the link step shouldn't need to know the architectures that were used to compile it. Because these are static libraries, they're not extracted if they aren't needed, so it doesn't hurt anything if they're unused. I guess you can have a situation where the user doesn't build this intentionally, but that's non-default behavior, so I've never given it much thought.

Member:

Yeah, precisely. I've noticed this while building with -DLIBOMPTARGET_BUILD_DEVICERTL_BCLIB=OFF -DLIBOMPTARGET_PLUGINS_TO_BUILD=host to run tests on CPU only.

arsenm (Contributor) commented Dec 8, 2024

This should probably retain separate wave32/wave64 builds. Additionally, it should have extension points for subtarget-specific implementations.

jhuber6 (author) commented Dec 8, 2024

> This should probably retain separate wave32/wave64 builds. Additionally, it should have extension points for subtarget-specific implementations.

That's what Shilei was talking about, since we have __nvvm_reflect for that on NVPTX.

arsenm commented Dec 8, 2024

> This should probably retain separate wave32/wave64 builds. Additionally, it should have extension points for subtarget-specific implementations.
>
> That's what Shilei was talking about, since we have __nvvm_reflect for that on NVPTX.

First, I will always consider NVVM reflect a giant hack. NVVM reflect cannot actually deal with the full range of wave-size issues. It is an incompatible ABI, and the code should never be intermixed.

jhuber6 (author) commented Dec 8, 2024

> First, I will always consider NVVM reflect a giant hack. NVVM reflect cannot actually deal with the full range of wave-size issues. It is an incompatible ABI, and the code should never be intermixed.

It's a hack, but still better than whatever it is AMD does currently.

shiltian commented Dec 8, 2024

> This should probably retain separate wave32/wave64 builds. Additionally, it should have extension points for subtarget-specific implementations.

Does wave32/wave64 affect the lowering from front-end source code to middle-end IR if we don't use certain functions, such as wave-level primitives?

arsenm commented Dec 8, 2024

> Does wave32/wave64 affect the lowering from front-end source code to middle-end IR if we don't use certain functions, such as wave-level primitives?

Yes. This is a fixed ABI setting that can change the binary function signature (which then has knock-on optimization effects). The IR is a representation of what goes into a final .o. It is not really intended to enable new features or make the program portable.
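A minimal sketch of the ABI hazard described here, assuming a lane-mask type derived from the wavefront size; the macro mirrors the removed __AMDGCN_WAVEFRONT_SIZE mentioned earlier, and the names are illustrative only:

#include <stdint.h>

// On wave64 targets the execution mask is 64 bits wide; on wave32 it is 32.
// Any type derived from the wavefront size changes the binary signature of
// every function that takes or returns it.
#if defined(__AMDGCN_WAVEFRONT_SIZE) && __AMDGCN_WAVEFRONT_SIZE == 64
using lanemask_t = uint64_t; // wave64 ABI
#else
using lanemask_t = uint32_t; // wave32 ABI
#endif

// Generic bitcode must fix lanemask_t before the subtarget is known, so
// linking it into a build with the other wave size mismatches this signature.
lanemask_t activemask();

This is why a reflection-style query cannot fully paper over wave size: the choice leaks into type layouts and function signatures, not just control flow.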

shiltian commented Dec 8, 2024

> The IR is a representation of what goes into a final .o. It is not really intended to enable new features or make the program portable.

My $0.02 is that it depends on how you think about it. If we eliminate the 'unwanted' parts (especially those that could potentially affect the ABI) at the very beginning of the middle end, it will not have optimization effects.

In addition, the compiler backend lowering is essentially doing something like:

if (target == A)
  lower Inst to I1
else if (has_feature_that_only_compiler_can_call("..."))
  lower Inst to I2
...

Speaking of "portability", the compiler itself is "portable". The reflection is just giving the end user an interface to do similar things:

if (__builtin_get_target() == A)
  do something in one way
else if (has_feature_that_user_can_call("..."))
  do something in another way
...

Of course the interface(s) have to be carefully designed and all the queried information can't be changed during optimization.

thesamesam (Member) left a comment:

LGTM (thanks for pinging mgorny and me), notwithstanding mgorny's comment, but we could handle that as a followup too.

Summary:
We previously built this for every single architecture to deal with
incompatibility. This patch updates it to use the 'generic' IR that
`libc` and other projects use. Who knows if this will have any
side-effects, probably worth testing more but it passes the tests I
expect to pass on my side.

update with comment

release notes

fix
mgorny (Member) commented Dec 23, 2024

@jhuber6, since I think this PR makes #119006 obsolete, should I merge that one while this one's still in the works, or close it?

jhuber6 (author) commented Dec 23, 2024

> @jhuber6, since I think this PR makes #119006 obsolete, should I merge that one while this one's still in the works, or close it?

I wouldn't say it's high priority. I would've landed this already if it weren't for the downstream fork being so divergent that this isn't an easy apply.


jhuber6 merged commit 34f8573 into llvm:main on Dec 25, 2024
9 checks passed
searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Dec 25, 2024
merge and lands downstream version
34f8573 [OpenMP] Use generic IR for the OpenMP DeviceRTL (llvm#119091)

Change-Id: Iae07e43219bb424945ae20102f3f690249a73454
Labels
clang:driver ('clang' and 'clang++' user-facing binaries. Not 'clang-cl'), clang (Clang issues not falling into any other category), offload, openmp:libomp (OpenMP host runtime), openmp:libomptarget (OpenMP offload runtime)
7 participants