[OpenMP] Use generic IR for the OpenMP DeviceRTL #119091
@llvm/pr-subscribers-clang-driver @llvm/pr-subscribers-clang

Author: Joseph Huber (jhuber6)

Changes

Summary: We previously built this for every single architecture to deal with incompatibility. This patch updates it to use the 'generic' IR that `libc` and other projects use. Who knows if this will have any side-effects, probably worth testing more but it passes the tests I expect to pass on my side.

Full diff: https://github.com/llvm/llvm-project/pull/119091.diff

4 Files Affected:
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 7d3d7f8f03c491..3dd90ecf8bca4c 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2800,8 +2800,7 @@ void tools::addOpenMPDeviceRTL(const Driver &D,
: options::OPT_libomptarget_nvptx_bc_path_EQ;
StringRef ArchPrefix = Triple.isAMDGCN() ? "amdgpu" : "nvptx";
- std::string LibOmpTargetName =
- ("libomptarget-" + ArchPrefix + "-" + BitcodeSuffix + ".bc").str();
+ std::string LibOmpTargetName = ("libomptarget-" + ArchPrefix + ".bc").str();
// First check whether user specifies bc library
if (const Arg *A = DriverArgs.getLastArg(LibomptargetBCPathOpt)) {
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 102794829795da..214f1e5d83478f 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -851,7 +851,6 @@ void CudaToolChain::addClangTargetOptions(
HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
- assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
DeviceOffloadingKind == Action::OFK_Cuda) &&
"Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 1bf3eb9da38aa1..cda633c41062b6 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -42,38 +42,6 @@ set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR})
set(include_directory ${devicertl_base_directory}/include)
set(source_directory ${devicertl_base_directory}/src)
-set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
- "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx950;gfx1010"
- "gfx1012;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035"
- "gfx1036;gfx1100;gfx1101;gfx1102;gfx1103;gfx1150"
- "gfx1151;gfx1152;gfx1153")
-set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
- "sm_70;sm_72;sm_75;sm_80;sm_86;sm_87;sm_89;sm_90")
-set(all_gpu_architectures
- "${all_amdgpu_architectures};${all_nvptx_architectures}")
-
-set(LIBOMPTARGET_DEVICE_ARCHITECTURES "all" CACHE STRING
- "List of device architectures to be used to compile the OpenMP DeviceRTL.")
-
-if(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "all")
- set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_gpu_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "amdgpu")
- set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_amdgpu_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "nvptx")
- set(LIBOMPTARGET_DEVICE_ARCHITECTURES ${all_nvptx_architectures})
-elseif(LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "auto" OR
- LIBOMPTARGET_DEVICE_ARCHITECTURES STREQUAL "native")
- if(NOT LIBOMPTARGET_NVPTX_ARCH AND NOT LIBOMPTARGET_AMDGPU_ARCH)
- message(FATAL_ERROR
- "Could not find 'amdgpu-arch' and 'nvptx-arch' tools required for 'auto'")
- elseif(NOT LIBOMPTARGET_FOUND_NVIDIA_GPU AND NOT LIBOMPTARGET_FOUND_AMDGPU_GPU)
- message(FATAL_ERROR "No AMD or NVIDIA GPU found on the system when using 'auto'")
- endif()
- set(LIBOMPTARGET_DEVICE_ARCHITECTURES
- "${LIBOMPTARGET_NVPTX_DETECTED_ARCH_LIST};${LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST}")
-endif()
-list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
-
set(include_files
${include_directory}/Allocator.h
${include_directory}/Configuration.h
@@ -141,20 +109,21 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
# first create an object target
add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
-function(compileDeviceRTLLibrary target_cpu target_name target_triple)
+function(compileDeviceRTLLibrary target_name target_triple)
set(target_bc_flags ${ARGN})
set(bc_files "")
foreach(src ${src_files})
get_filename_component(infile ${src} ABSOLUTE)
get_filename_component(outfile ${src} NAME)
- set(outfile "${outfile}-${target_cpu}.bc")
+ set(outfile "${outfile}-${target_name}.bc")
set(depfile "${outfile}.d")
add_custom_command(OUTPUT ${outfile}
COMMAND ${CLANG_TOOL}
${bc_flags}
- --offload-arch=${target_cpu}
+ -fopenmp-targets=${target_triple}
+ -Xopenmp-target=${target_triple} -march=
${target_bc_flags}
-MD -MF ${depfile}
${infile} -o ${outfile}
@@ -177,7 +146,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
list(APPEND bc_files ${outfile})
endforeach()
- set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
+ set(bclib_name "libomptarget-${target_name}.bc")
# Link to a bitcode library.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
@@ -217,7 +186,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
APPEND)
endif()
- set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc")
+ set(bclib_target_name "omptarget-${target_name}-bc")
add_custom_target(${bclib_target_name} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
# Copy library to destination.
@@ -239,7 +208,7 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
# Package the bitcode in the bitcode and embed it in an ELF for the static library
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
COMMAND ${PACKAGER_TOOL} -o ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
- "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=${target_cpu},kind=openmp"
+ "--image=file=${CMAKE_CURRENT_BINARY_DIR}/${bclib_name},${target_feature},triple=${target_triple},arch=generic,kind=openmp"
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
COMMENT "Packaging LLVM offloading binary ${bclib_name}.out"
)
@@ -249,14 +218,14 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
APPEND)
endif()
- set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}-${target_cpu}.o")
+ set(output_name "${CMAKE_CURRENT_BINARY_DIR}/devicertl-${target_name}.o")
add_custom_command(OUTPUT ${output_name}
COMMAND ${CLANG_TOOL} --std=c++17 -c -nostdlib
-Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name}
-o ${output_name}
${source_directory}/Stub.cpp
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/packaged_${bclib_name} ${source_directory}/Stub.cpp
- COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}-${target_cpu}.o"
+ COMMENT "Embedding LLVM offloading binary in devicertl-${target_name}.o"
VERBATIM
)
if(TARGET clang)
@@ -269,11 +238,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
set_property(TARGET omptarget.devicertl.all_objs APPEND PROPERTY IMPORTED_OBJECTS ${output_name})
if (CMAKE_EXPORT_COMPILE_COMMANDS)
- set(ide_target_name omptarget-ide-${target_name}-${target_cpu})
+ set(ide_target_name omptarget-ide-${target_name})
add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
target_compile_options(${ide_target_name} PRIVATE
- -fopenmp --offload-arch=${target_cpu} -fopenmp-cuda-mode
- -mllvm -openmp-opt-disable
+ -fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
+ -fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
-foffload-lto -fvisibility=hidden --offload-device-only
-nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
)
@@ -288,18 +257,11 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
endif()
endfunction()
-# Generate a Bitcode library for all the gpu architectures the user requested.
-add_custom_target(omptarget.devicertl.nvptx)
add_custom_target(omptarget.devicertl.amdgpu)
-foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
- if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
- compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
- elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
- compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
- else()
- message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
- endif()
-endforeach()
+compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
+
+add_custom_target(omptarget.devicertl.nvptx)
+compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
# Archive all the object files generated above into a static library
add_library(omptarget.devicertl STATIC)
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 57df159d3f28e5..34aee7bf2e37c1 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -44,7 +44,6 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
}
}
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
ShuffleReductFnTy shflFct) {
uint32_t size, remote_id, physical_lane_id;
@@ -63,7 +62,6 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
} while (logical_lane_id % 2 == 0 && size > 1);
return (logical_lane_id == 0);
}
-#endif
static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
ShuffleReductFnTy shflFct,
@@ -74,49 +72,52 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
uint32_t NumThreads = omp_get_num_threads();
if (NumThreads == 1)
return 1;
- /*
- * This reduce function handles reduction within a team. It handles
- * parallel regions in both L1 and L2 parallelism levels. It also
- * supports Generic, SPMD, and NoOMP modes.
- *
- * 1. Reduce within a warp.
- * 2. Warp master copies value to warp 0 via shared memory.
- * 3. Warp 0 reduces to a single value.
- * 4. The reduced value is available in the thread that returns 1.
- */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- uint32_t WarpsNeeded =
- (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
- uint32_t WarpId = mapping::getWarpIdInBlock();
-
- // Volta execution model:
- // For the Generic execution mode a parallel region either has 1 thread and
- // beyond that, always a multiple of 32. For the SPMD execution mode we may
- // have any number of threads.
- if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
- gpu_regular_warp_reduce(reduce_data, shflFct);
- else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
- gpu_irregular_warp_reduce(reduce_data, shflFct,
- /*LaneCount=*/NumThreads % mapping::getWarpSize(),
- /*LaneId=*/mapping::getThreadIdInBlock() %
- mapping::getWarpSize());
-
- // When we have more than [mapping::getWarpSize()] number of threads
- // a block reduction is performed here.
- //
- // Only L1 parallel region can enter this if condition.
- if (NumThreads > mapping::getWarpSize()) {
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
-
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
- BlockThreadId);
+ /*
+ * This reduce function handles reduction within a team. It handles
+ * parallel regions in both L1 and L2 parallelism levels. It also
+ * supports Generic, SPMD, and NoOMP modes.
+ *
+ * 1. Reduce within a warp.
+ * 2. Warp master copies value to warp 0 via shared memory.
+ * 3. Warp 0 reduces to a single value.
+ * 4. The reduced value is available in the thread that returns 1.
+ */
+
+#if __has_builtin(__nvvm_reflect)
+ if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
+ uint32_t WarpsNeeded =
+ (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+ uint32_t WarpId = mapping::getWarpIdInBlock();
+
+ // Volta execution model:
+ // For the Generic execution mode a parallel region either has 1 thread and
+ // beyond that, always a multiple of 32. For the SPMD execution mode we may
+ // have any number of threads.
+ if ((NumThreads % mapping::getWarpSize() == 0) ||
+ (WarpId < WarpsNeeded - 1))
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+ else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+ gpu_irregular_warp_reduce(
+ reduce_data, shflFct,
+ /*LaneCount=*/NumThreads % mapping::getWarpSize(),
+ /*LaneId=*/mapping::getThreadIdInBlock() % mapping::getWarpSize());
+
+ // When we have more than [mapping::getWarpSize()] number of threads
+ // a block reduction is performed here.
+ //
+ // Only L1 parallel region can enter this if condition.
+ if (NumThreads > mapping::getWarpSize()) {
+ // Gather all the reduced values from each warp
+ // to the first warp.
+ cpyFct(reduce_data, WarpsNeeded);
+
+ if (WarpId == 0)
+ gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+ BlockThreadId);
+ }
+ return BlockThreadId == 0;
}
- return BlockThreadId == 0;
-#else
+#endif
__kmpc_impl_lanemask_t Liveness = mapping::activemask();
if (Liveness == lanes::All) // Full warp
gpu_regular_warp_reduce(reduce_data, shflFct);
@@ -150,10 +151,9 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
return BlockThreadId == 0;
}
- // Get the OMP thread Id. This is different from BlockThreadId in the case of
- // an L2 parallel region.
+ // Get the OMP thread Id. This is different from BlockThreadId in the case
+ // of an L2 parallel region.
return BlockThreadId == 0;
-#endif // __CUDA_ARCH__ >= 700
}
uint32_t roundToWarpsize(uint32_t s) {
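A minimal sketch of the pattern the Reduction.cpp hunk above switches to (the function name here is invented for illustration; the __has_builtin(__nvvm_reflect) guard and the __nvvm_reflect("__CUDA_ARCH") query are the ones the patch actually uses):

// Both paths stay in the single generic bitcode image. When the user compiles
// for a concrete -march, the NVPTX reflection pass folds the query to a
// constant and later optimizations delete the dead branch, so no per-arch
// DeviceRTL build is needed.
static int pickReductionPath() {
#if __has_builtin(__nvvm_reflect)
  if (__nvvm_reflect("__CUDA_ARCH") >= 700)
    return 1; // Volta-and-newer strategy
#endif
  return 0;   // pre-Volta / generic fallback
}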
✅ With the latest revision this PR passed the C/C++ code formatter.
    set(depfile "${outfile}.d")

    add_custom_command(OUTPUT ${outfile}
      COMMAND ${CLANG_TOOL}
      ${bc_flags}
-     --offload-arch=${target_cpu}
+     -fopenmp-targets=${target_triple}
+     -Xopenmp-target=${target_triple} -march=
And nothing after -march=?
Intentional
So that means there is no arch?
Force-pushed from 0df5574 to 4c710e4.
I like this method, but just out of curiosity, did we use anything in the AMDGPU implementation that has target-dependent lowering in the front end? If not, this is totally fine, I'd say.
We used to use the
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
#if __has_builtin(__nvvm_reflect)
  if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
I'll try to make an AMDGPU counterpart for this one, though it doesn't look necessary for the purposes of the OpenMP device runtime.
I believe @AlexVlx also has interest in that space, so best consult with him as well.
And now we don't have any target-specific part in the AMDGPU implementation? Quite surprised, but happy to hear that.
Worth a bullet point in the release notes, for both clang and openmp/offload.
Force-pushed from 67e1c65 to ae742d6.
Force-pushed from f868bbf to 51c5926.
@hidekisaito Might be relevant to your patch.
It seems like there are some test failures, but I think the PR looks good.
@ronlieb This will probably conflict a lot with downstream; we should probably wait until I've talked it over with others before trying to merge it into AOMP.
Yes please, let's wait until the internal Monday meeting to discuss. Also, I would like to help with some precommit performance testing using a downstream build.
compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)

add_custom_target(omptarget.devicertl.nvptx)
compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
Can we have an option to disable either of amdgpu/nvptx specifically? If anything, because it feels weird having a choice between amdgpu and nvptx plugins, but no choice between devicertl variants.
I didn't feel like it was strictly necessary since if you can build one you can build the other. The only thing you save is disk space, but if you feel like it's really needed I can re-use the architectures thing or something.
I don't exactly feel strongly about it. It's just that we already provide a switch for amdgpu/nvptx in Gentoo, so it only feels natural for this to respect it.

Also, on a semi-related matter: is there a reason we're installing both the separate .bc files and libomptarget.devicertl.a?
This is really over-complicated, but right now we only use the .bc file for non-LTO NVPTX compilations, where it gets put through each TU in a broken way via -mlink-builtin-bitcode. This is because it would get really slow if we didn't optimize out calls to the runtime.
Ok, so it's not accidental/obsolete; that's all I needed to know :-).

Another thing I've noticed is that clang needs libomptarget.devicertl.a even when no GPU target is actually used. I've worked around that for now by creating an empty .a file locally, but perhaps that should also be handled better somehow (I can make a PR later if you tell me how you'd like it handled).
I suppose that happens for the CPU targets? Those aren't really used anywhere real, they're just for testing, but even so we probably shouldn't leave them broken. We link the device RTL unconditionally because the link step shouldn't need to know the architectures that were used to compile it. Because these are static libraries they're not extracted if they aren't needed, so it doesn't hurt anything if they're unused. But I guess you can have a situation where the user doesn't build this intentionally, but that's non-default behavior so I've never given it much thought.
Yeah, precisely. I've noticed this while building with -DLIBOMPTARGET_BUILD_DEVICERTL_BCLIB=OFF -DLIBOMPTARGET_PLUGINS_TO_BUILD=host to run tests on CPU only.
This probably should retain separate wave32/wave64 builds. Additionally, it should have extension points for subtarget-specific implementations.
That's what Shilei was talking about, since we have
First, I will always consider NVVM reflect a giant hack. NVVM reflect cannot actually deal with the full range of wave-size issues. It is an incompatible ABI, and the code should never be intermixed.
It's a hack, but still better than whatever it is AMD does currently.
Does wave32/wave64 affect the lowering from front-end source code to middle-end IR if we don't use certain functions, such as wave-level primitives?
Yes. This is a fixed ABI setting that can change the binary function signature (which then has knock-on optimization effects). The IR is a representation of what goes into the final .o. It is not really intended to enable new features or make the program portable.
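As a hypothetical illustration of that point (the names and the wave64 macro below are invented for this example, not taken from the DeviceRTL): a value sized to the wavefront changes the binary signature of anything that passes or returns it, which is why wave32 and wave64 cannot share one image the way an architecture-version check can.

#include <cstdint>

#ifdef EXAMPLE_WAVE64
using example_lanemask_t = uint64_t; // wave64: 64 active-lane bits
#else
using example_lanemask_t = uint32_t; // wave32: 32 active-lane bits
#endif

// Same source, two incompatible binary signatures depending on the wave size
// chosen at build time -- an ABI property, not just an optimization detail.
extern "C" example_lanemask_t example_activemask();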
My $0.02 is that it depends on how you think about it. If we eliminate the "unwanted" parts (especially those that could potentially affect the ABI) at the very beginning of the middle end, there will be no optimization effects. In addition, the compiler backend's lowering is essentially already doing something of this kind internally.
Speaking of "portability", the compiler itself is "portable"; the reflection just gives the end user an interface to do similar things from source (see the sketch below).
Of course, the interface(s) have to be carefully designed, and none of the queried information can change during optimization.
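A rough, invented sketch of that idea (nothing here is a real LLVM or DeviceRTL API; __example_reflect is a made-up stand-in for a reflect-style query): the backend already branches on subtarget features when it picks a lowering, and a reflect call exposes the same kind of compile-time branch to source code, to be folded to a constant once the real target is fixed.

// Hypothetical stand-in for a reflect-style query; not a real intrinsic.
extern "C" int __example_reflect(const char *Query);

int pickPath() {
  // Once the query is folded to a constant, one arm becomes dead code and
  // only the lowering for the chosen target remains in the final image.
  return __example_reflect("EXAMPLE_ARCH") >= 700 ? /*newer-target path*/ 1
                                                  : /*fallback path*/ 0;
}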
Force-pushed from 54660d4 to ccbbc8c.
LGTM (thanks for pinging mgorny and I), notwithstanding mgorny's comment, but we could handle that as a followup too.
Summary: We previously built this for every single architecture to deal with incompatibility. This patch updates it to use the 'generic' IR that `libc` and other projects use. Who knows if this will have any side-effects, probably worth testing more but it passes the tests I expect to pass on my side.

update with comment release notes fix
Force-pushed from ccbbc8c to 767d34a.
merge and lands downstream version 34f8573 [OpenMP] Use generic IR for the OpenMP DeviceRTL (llvm#119091) Change-Id: Iae07e43219bb424945ae20102f3f690249a73454
Summary:
We previously built this for every single architecture to deal with incompatibility. This patch updates it to use the 'generic' IR that `libc` and other projects use. Who knows if this will have any side-effects, probably worth testing more but it passes the tests I expect to pass on my side.