Skip to content

Commit fe6e75c

Browse files
jhuber6 authored and ronlieb committed
[openmp] - Use mlink-builtin-bitcode for libomptarget.
- No longer use llvm-link for linking of ocml.bc and ockl.bc for libomptarget. - Created Platform.h which uses aliases so clang can emit libdevice control constants with ODRLinkage to allow usage of mlink-builtin-bitcode. - Move libm inside of DeviceRTL which takes care of some undefined symbols left in user applications. - Added Platform.h to hostexec_stubs which resolved an -O0 compilation issue. - prep-libomptarget-bc has a new function that only marks ockl_dm_alloc and ockl_dm_dealloc as linkonce_odr to properly use our version of ockl_devmem_request to leverage hostexec. - Remove --attributor-enable=module from DeviceRTL as it was causing LibM sin/cos functions to be over-optimized, causing NaN values. Change-Id: I4fc7023a61a3aa407b0f71e95a689dbda2aab15b
1 parent 42b6a1d commit fe6e75c

File tree

23 files changed

+398
-355
lines changed

23 files changed

+398
-355
lines changed

clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,12 @@ void AMDGPUOpenMPToolChain::addClangTargetOptions(
455455
if (DriverArgs.hasArg(options::OPT_nogpulib))
456456
return;
457457

458+
for (auto BCFile : getDeviceLibs(DriverArgs)) {
459+
CC1Args.push_back(BCFile.ShouldInternalize ? "-mlink-builtin-bitcode"
460+
: "-mlink-bitcode-file");
461+
CC1Args.push_back(DriverArgs.MakeArgString(BCFile.Path));
462+
}
463+
458464
ArgStringList LibraryPaths;
459465

460466
// Find in --hip-device-lib-path and HIP_LIBRARY_PATH.

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 3 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -9198,17 +9198,17 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
91989198
const char *LinkingOutput) const {
91999199
bool isAMDGPU = false;
92009200
auto offloadTC = C.getOffloadToolChains(Action::OFK_OpenMP);
9201-
const auto openMPTCs = llvm::make_range(offloadTC.first, offloadTC.second);
9201+
const auto OpenMPTCs = llvm::make_range(offloadTC.first, offloadTC.second);
92029202
const ToolChain *TC;
9203-
for (auto &I : openMPTCs) {
9203+
for (auto &I : OpenMPTCs) {
92049204
TC = I.second;
92059205
if (TC->getTriple().isAMDGPU()) {
92069206
isAMDGPU = true;
92079207
break;
92089208
}
92099209
}
92109210

9211-
if (!openMPTCs.empty() &&
9211+
if (!OpenMPTCs.empty() &&
92129212
Args.hasFlag(options::OPT_opaque_offload_linker,
92139213
options::OPT_no_opaque_offload_linker, isAMDGPU)) {
92149214
ConstructOpaqueJob(C, JA, Output, Inputs, Args, TC->getTriple(),
@@ -9233,35 +9233,6 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
92339233
"--cuda-path=" + CudaInstallation.getInstallPath()));
92349234
break;
92359235
}
9236-
if (TC->getTriple().isAMDGPU()) {
9237-
RocmInstallationDetector RocmInstallation(D, TheTriple, Args, true,
9238-
true);
9239-
const llvm::Triple triple = TC->getTriple();
9240-
const auto GPUArch = TC->getTargetID().str();
9241-
const auto ArchKind = llvm::AMDGPU::parseArchAMDGCN(TC->getTargetID());
9242-
9243-
bool AsanGpuRT = Args.hasFlag(options::OPT_fgpu_sanitize,
9244-
options::OPT_fno_gpu_sanitize, true);
9245-
9246-
llvm::SmallVector<std::string, 12> BCLibs =
9247-
amdgpu::dlr::getCommonDeviceLibNames(
9248-
Args, D, GPUArch, /* isOpenMP */ true, RocmInstallation);
9249-
9250-
SmallVector<std::string> subarchs;
9251-
addSubArchsWithTargetID(C, Args, triple, subarchs);
9252-
9253-
std::set<std::string> bitcodeTarget;
9254-
for (const auto &sa : subarchs) {
9255-
bitcodeTarget.insert("openmp-" + triple.str() + "-" +
9256-
getProcessorFromTargetID(triple, sa).str());
9257-
}
9258-
9259-
for (StringRef prefix : bitcodeTarget)
9260-
for (auto BCLib : BCLibs)
9261-
CmdArgs.push_back(Args.MakeArgString("--bitcode-library=" + prefix +
9262-
"=" + BCLib));
9263-
break;
9264-
}
92659236
}
92669237
}
92679238

clang/lib/Headers/__clang_cuda_math.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,22 @@
2424
#pragma push_macro("__DEVICE__")
2525
#ifdef __OPENMP_NVPTX__
2626
#if defined(__cplusplus)
27+
#ifdef __BUILD_MATH_BUILTINS_LIB__
28+
#include <limits.h>
29+
#define HUGE_VALF (__builtin_huge_valf())
30+
#define HUGE_VAL (__builtin_huge_val())
31+
#define __DEVICE__ extern "C" __attribute__((always_inline, nothrow))
32+
#else
2733
#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
34+
#endif // __BUILD_MATH_BUILTINS_LIB__
2835
#else
2936
// Use __BUILD_MATH_BUILTINS_LIB__ to build device specific libm-nvptx.bc
3037
// for FORTRAN bitcode linking since FORTRAN cannot use c headers.
3138
#ifdef __BUILD_MATH_BUILTINS_LIB__
3239
#include <limits.h>
3340
#define HUGE_VALF (__builtin_huge_valf())
3441
#define HUGE_VAL (__builtin_huge_val())
35-
#define __DEVICE__ extern __attribute__((always_inline, nothrow, cold, weak))
42+
#define __DEVICE__ extern __attribute__((always_inline, nothrow))
3643
#else
3744
#define __DEVICE__ static __attribute__((always_inline, nothrow))
3845
#endif // __BUILD_MATH_BUILTINS_LIB__

clang/lib/Headers/__clang_hip_math.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,17 @@
2929

3030
#ifdef __OPENMP_AMDGCN__
3131
#if defined(__cplusplus)
32+
#ifdef __BUILD_MATH_BUILTINS_LIB__
33+
#define __DEVICE__ extern "C" __attribute__((always_inline, nothrow))
34+
#define __DEVICE_NOCE__ __DEVICE__
35+
#else
3236
#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
3337
#define __DEVICE_NOCE__ static __attribute__((always_inline, nothrow))
38+
#endif
3439
#else // !defined(__cplusplus), c openmp compilation
3540
// Special case to build c-only device function lib for FORTRAN.
3641
#ifdef __BUILD_MATH_BUILTINS_LIB__
37-
#define __DEVICE__ extern __attribute__((always_inline, nothrow, cold, weak))
42+
#define __DEVICE__ extern __attribute__((always_inline, nothrow))
3843
#define __DEVICE_NOCE__ __DEVICE__
3944
#else
4045
#define __DEVICE__ static __attribute__((always_inline, nothrow))
@@ -1280,7 +1285,7 @@ double __fma_rn(double __x, double __y, double __z) {
12801285
_Generic((__x), float : __signbitf, double : __signbit)(__x)
12811286
#endif // !defined(__cplusplus) && __STDC_VERSION__ >= 201112L
12821287

1283-
#if defined(__cplusplus)
1288+
#if defined(__cplusplus) && !defined(__BUILD_MATH_BUILTINS_LIB__)
12841289
#ifndef __OPENMP_AMDGCN__
12851290
template <class T> __DEVICE__ T min(T __arg1, T __arg2) {
12861291
return (__arg1 < __arg2) ? __arg1 : __arg2;

openmp/libomptarget/CMakeLists.txt

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -189,17 +189,10 @@ set(LIBOMPTARGET_LLVM_LIBRARY_INTDIR "${LIBOMPTARGET_INTDIR}" CACHE STRING
189189

190190
# Build offloading plugins and device RTLs if they are available.
191191
add_subdirectory(plugins-nextgen)
192-
# Currently device libm functions are created by clang headers. These do not
193-
# work for FORTRAN. The libm here builds a libm device library by turning on
194-
# __BUILD_MATH_BUILTINS_LIB__ This converts the math functions in the clang
195-
# headers from static to extern to build device linkable libm for FORTRAN.
196-
# Eventually, we can make a linkable device libm for c and c++ and remove a
197-
# lot of header definitions that get compiled with every offload compilation.
198-
add_subdirectory(libm)
199192

200-
add_subdirectory(hostexec)
201193
add_subdirectory(tools)
202194
set(PREP_TOOL $<TARGET_FILE:prep-libomptarget-bc>)
195+
add_subdirectory(hostexec)
203196
add_subdirectory(DeviceRTL)
204197

205198
# Build target agnostic offloading library.

openmp/libomptarget/DeviceRTL/CMakeLists.txt

Lines changed: 60 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ list(REMOVE_DUPLICATES LIBOMPTARGET_DEVICE_ARCHITECTURES)
9393
set(include_files
9494
${include_directory}/Allocator.h
9595
${include_directory}/Configuration.h
96+
${include_directory}/Platform.h
9697
${include_directory}/Debug.h
9798
${include_directory}/Interface.h
9899
${include_directory}/LibC.h
@@ -111,6 +112,7 @@ set(src_files
111112
${source_directory}/Debug.cpp
112113
${source_directory}/Kernel.cpp
113114
${source_directory}/LibC.cpp
115+
${source_directory}/LibM.cpp
114116
${source_directory}/Mapping.cpp
115117
${source_directory}/Misc.cpp
116118
${source_directory}/Parallelism.cpp
@@ -133,7 +135,7 @@ set(src_files
133135
# propagation. That said, we will run the vectorizer again after the runtime
134136
# has been linked into the user program.
135137
set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512 -mllvm -vectorize-slp=false )
136-
set(link_opt_flags -O3 -openmp-opt-disable -attributor-enable=module -vectorize-slp=false )
138+
set(link_opt_flags -O3 -openmp-opt-disable -vectorize-slp=false )
137139
set(link_export_flag -passes=internalize -internalize-public-api-file=${source_directory}/exports)
138140

139141
# Prepend -I to each list element
@@ -143,9 +145,12 @@ list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")
143145
# Set flags for LLVM Bitcode compilation.
144146
set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
145147
${clang_opt_flags} --offload-device-only
146-
-nocudalib -nogpulib -nostdinc
148+
-nogpuinc -nogpulib
147149
-fopenmp -fopenmp-cuda-mode
148150
-Wno-unknown-cuda-version
151+
-I${CMAKE_BINARY_DIR}/openmp/runtime/src # Need omp.h for LibM.
152+
-I${CMAKE_BINARY_DIR}/projects/openmp/runtime/src # Need omp.h for LibM.
153+
-I${CMAKE_BINARY_DIR}/runtime/src
149154
-DOMPTARGET_DEVICE_RUNTIME
150155
-I${include_directory}
151156
-I${devicertl_base_directory}/../include
@@ -158,82 +163,6 @@ else()
158163
list(APPEND bc_flags -DOMPTARGET_DEBUG=0)
159164
endif()
160165

161-
function(addAMDSpecificBcLibs touch_target gfxname bc_files local_depend_files)
162-
# For amdgpu, the libomptarget bc is "all inclusive".
163-
# During user compilation, the libomptarget bc is essentially the only
164-
# non-user library linked. It is linked once in GPU link phase
165-
# following llvm-link options: --internalize --only-needed
166-
if(NOT amd_device_libs_found)
167-
find_package(AMDDeviceLibs REQUIRED CONFIG
168-
HINTS
169-
${CMAKE_BINARY_DIR}/../../tools/rocm-device-libs
170-
${CMAKE_INSTALL_PREFIX}
171-
PATHS
172-
/opt/rocm)
173-
if(AMDDeviceLibs_DIR)
174-
set(amd_device_libs_found ON)
175-
libomptarget_say("DeviceRTLs ${gfxname}: Getting ROCm device libs from ${AMDDeviceLibs_DIR}")
176-
else()
177-
libomptarget_say("DeviceRTLs ${gfxname}: Not building AMDGCN device RTL: Could not find AMDDeviceLibs package")
178-
return()
179-
endif()
180-
endif()
181-
get_target_property(ockl_bc_file ockl LOCATION)
182-
get_target_property(ocml_bc_file ocml LOCATION)
183-
set(amdgpu_wfsz_is32 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 gfx1100 gfx1101 gfx1102 gfx1103)
184-
string(FIND "${amdgpu_wfsz_is32}" "${gfxname}" is_32bit)
185-
if(NOT is_32bit EQUAL -1)
186-
get_target_property(oclc_wf_bc_file oclc_wavefrontsize64_off LOCATION)
187-
else()
188-
get_target_property(oclc_wf_bc_file oclc_wavefrontsize64_on LOCATION)
189-
endif()
190-
string(LENGTH "${gfxname}" gfxlen)
191-
if(gfxlen EQUAL 6)
192-
string(SUBSTRING ${gfxname} 3 3 gfxnum)
193-
else()
194-
string(SUBSTRING ${gfxname} 3 4 gfxnum)
195-
endif()
196-
get_target_property(oclc_isa_bc_file oclc_isa_version_${gfxnum} LOCATION)
197-
198-
# Add custom target so targets from other directories
199-
# can be added as dependencies to ensure libm
200-
# and libhostexec bc files have been built.
201-
add_custom_target(${touch_target} ALL)
202-
add_dependencies(${touch_target}
203-
libm-target-${gfxname}
204-
libhostexec-${gfxname}.bc
205-
)
206-
207-
# TODO: Add back -amdgpu to the names below (maybe?).
208-
list(APPEND bc_files
209-
${CMAKE_BINARY_DIR}/libm-${gfxname}.bc
210-
# ${CMAKE_BINARY_DIR}/openmp/libomptarget/hostexec/libhostexec-${gfxname}.bc
211-
)
212-
if(OPENMP_STANDALONE_BUILD)
213-
list(APPEND bc_files
214-
${CMAKE_BINARY_DIR}/libomptarget/hostexec/libhostexec-${gfxname}.bc
215-
)
216-
else()
217-
list(APPEND bc_files
218-
${CMAKE_BINARY_DIR}/openmp/libomptarget/hostexec/libhostexec-${gfxname}.bc
219-
)
220-
endif()
221-
if (EXISTS ${CMAKE_BINARY_DIR}/../../tools/ROCMDEVLIBS)
222-
add_dependencies(${touch_target}
223-
ockl ocml oclc_wavefrontsize64_on oclc_wavefrontsize64_off oclc_isa_version_${gfxnum})
224-
endif()
225-
226-
# Add amdgcn-specific bc files to link command
227-
list(APPEND bc_files ${ocml_bc_file} ${ockl_bc_file} ${oclc_wf_bc_file} ${oclc_isa_bc_file})
228-
229-
# Add touch-target-$(target_cpu) to local_depend_files so all $bc_files will exist or be created.
230-
list(APPEND local_depend_files ${touch_target})
231-
232-
# Update these values in the caller:
233-
set (bc_files ${bc_files} PARENT_SCOPE)
234-
set (local_depend_files ${local_depend_files} PARENT_SCOPE)
235-
endfunction()
236-
237166
# first create an object target
238167
add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
239168
function(compileDeviceRTLLibrary target_cpu target_name target_triple)
@@ -257,6 +186,20 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
257186
COMMENT "Building LLVM bitcode ${outfile}"
258187
VERBATIM
259188
)
189+
190+
if(${outfile} MATCHES "State.cpp")
191+
# Run the prep tool on the library to replace internal attribute with linkonce_odr for dm_alloc only.
192+
set(outfile_prep "${outfile}-${target_cpu}-prep.bc")
193+
add_custom_target(${outfile_prep}
194+
COMMAND ${PREP_TOOL} -dm ${outfile}
195+
-o ${outfile_prep}
196+
DEPENDS ${outfile}
197+
COMMENT "Running ${PREP_TOOL} for ${outfile_prep}"
198+
)
199+
add_dependencies(${outfile_prep} ${outfile})
200+
set(outfile ${outfile_prep})
201+
endif()
202+
260203
if("${CLANG_TOOL}" STREQUAL "$<TARGET_FILE:clang>")
261204
# Add a file-level dependency to ensure that clang is up-to-date.
262205
# By default, add_custom_command only builds clang if the
@@ -268,21 +211,34 @@ function(compileDeviceRTLLibrary target_cpu target_name target_triple)
268211
list(APPEND bc_files ${outfile})
269212
endforeach()
270213

271-
set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
272-
273-
set(local_depend_files ${bc_files})
274-
if( ${target_name} STREQUAL "amdgpu" )
275-
addAMDSpecificBcLibs("touch-target-${target_cpu}" ${target_cpu} "${bc_files}" "${local_depend_files}")
214+
# Link in the previously compiled 'hostexec' bitcode directly.
215+
if("${target_name}" STREQUAL "amdgpu")
216+
if(OPENMP_STANDALONE_BUILD)
217+
list(APPEND extra_bc_files
218+
${CMAKE_BINARY_DIR}/libomptarget/hostexec/libhostexec-${target_cpu}.bc
219+
)
220+
else()
221+
list(APPEND extra_bc_files
222+
${CMAKE_BINARY_DIR}/openmp/libomptarget/hostexec/libhostexec-${target_cpu}.bc
223+
)
224+
endif()
225+
add_custom_target(libhostexec-${target_cpu}
226+
DEPENDS ${extra_bc_files}
227+
)
228+
add_dependencies(libhostexec-${target_cpu} libhostexec-${target_cpu}.bc)
229+
set(extra_depends libhostexec-${target_cpu})
276230
endif()
277231

232+
set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
233+
278234
# Link to a bitcode library.
279235
add_custom_target(linked_${bclib_name}
280236
COMMAND ${LINK_TOOL}
281-
-o ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name} ${bc_files}
282-
DEPENDS ${bc_files}
237+
-o ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name} ${extra_bc_files} ${bc_files}
238+
DEPENDS ${bc_files} ${extra_depends}
283239
COMMENT "Linking LLVM bitcode ${bclib_name}"
284240
)
285-
add_dependencies(linked_${bclib_name} ${local_depend_files})
241+
add_dependencies(linked_${bclib_name} ${bc_files} ${extra_depends})
286242
if("${LINK_TOOL}" STREQUAL "$<TARGET_FILE:llvm-link>")
287243
add_dependencies(linked_${bclib_name} llvm-link)
288244
endif()
@@ -411,8 +367,24 @@ add_custom_target(omptarget.devicertl.nvptx)
411367
add_custom_target(omptarget.devicertl.amdgpu)
412368
foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
413369
if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
414-
set(clang_options -DLIBOMPTARGET_BC_TARGET -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=${gpu_arch} -DLIBOMPTARGET_BC_TARGET -D__AMDGCN__ -nogpulib)
415-
compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none ${clang_options})
370+
find_package(AMDDeviceLibs REQUIRED CONFIG
371+
HINTS ${CMAKE_INSTALL_PREFIX}
372+
${CMAKE_BINARY_DIR}/../../tools/rocm-device-libs
373+
PATHS /opt/rocm
374+
)
375+
376+
# Link in the ROCm Device Libraries once the other files have been linked.
377+
get_target_property(ocml_path ocml IMPORTED_LOCATION)
378+
get_target_property(ockl_path ockl IMPORTED_LOCATION)
379+
380+
set(amd_options -Xclang -mcode-object-version=none
381+
-Xclang -mlink-builtin-bitcode -Xclang ${ocml_path}
382+
-Xclang -mlink-builtin-bitcode -Xclang ${ockl_path}
383+
-Wno-linker-warnings # Silence the empty host compilation.
384+
-Xclang -mcode-object-version=none
385+
)
386+
387+
compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa ${amd_options})
416388
elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
417389
compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx61)
418390
else()

openmp/libomptarget/DeviceRTL/include/Debug.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
4343

4444
#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);
4545
#define PRINT(str) PRINTF("%s", str)
46-
4746
///}
4847

4948
#endif

openmp/libomptarget/DeviceRTL/include/DevRTLExtras.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,6 @@ typedef struct {
7979
uintptr_t value;
8080
} omp_alloctrait_t;
8181

82-
// Attribute to keep alive certain definition for the bitcode library.
83-
#ifdef LIBOMPTARGET_BC_TARGET
84-
#define KEEP_ALIVE __attribute__((used, retain))
85-
#else
86-
#define KEEP_ALIVE
87-
#endif
88-
8982
///}
9083

9184
#endif // OPENMP_LIBOMPTARGET_DEVICERTL_INCLUDE_DEVRTLEXTRAS_H

0 commit comments

Comments
 (0)