Skip to content

Commit 6d0e137

Browse files
committed
[libc] Remove OpenMP and build the GPU libc directly
The current `libcgpu.a` is actually an archive of fatbinaries. The host file contains nothing but a section called `LLVM_OFFLOADING` that contains embedded device code. This used to be handled implicitly by borrowing the OpenMP toolchain, which did this packaging internally. Passing the OpenMP flags causes problems when trying to move to testing. This patch pulls this logic out into the CMake and handles it manually. This patch is a lot of noise, but it fundamentally comes down to the following changes:

1. Build the source for every GPU architecture (GPU architectures are generally not backwards compatible).
2. Combine all of these files into a single binary blob.
3. Embed that binary blob into a host file.
4. Package these host files into a `.a` archive.
5. The device code will be extracted and managed by the offloading linker.

Another important point: right now we are maintaining an important distinction with the GPU build. That is, when we build the exported library we will build for many GPU architectures. However, the internal version will only be built for a single GPU architecture, one that was found on the user's system. This is intended to be used for internal testing, very similar to the current path where `libc` is compiled for a single target triple.

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D143089
1 parent 48560e2 commit 6d0e137

File tree

3 files changed

+215
-63
lines changed

3 files changed

+215
-63
lines changed

libc/cmake/modules/LLVMLibCObjectRules.cmake

Lines changed: 193 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -52,19 +52,149 @@ function(_get_common_compile_options output_var flags)
5252
endif()
5353
endif()
5454
if (LIBC_TARGET_ARCHITECTURE_IS_GPU)
55-
list(APPEND compile_options "-fopenmp")
56-
list(APPEND compile_options "-fopenmp-cuda-mode")
57-
foreach(gpu_arch ${LIBC_GPU_ARCHITECTURES})
58-
list(APPEND compile_options "--offload-arch=${gpu_arch}")
59-
endforeach()
6055
list(APPEND compile_options "-nogpulib")
61-
list(APPEND compile_options "-nogpuinc")
6256
list(APPEND compile_options "-fvisibility=hidden")
63-
list(APPEND compile_options "-foffload-lto")
6457
endif()
6558
set(${output_var} ${compile_options} PARENT_SCOPE)
6659
endfunction()
6760

61+
# Builds the entrypoint target for the GPU.
#
# For every architecture in `all_gpu_architectures` the source is compiled to
# LLVM-IR in its own object library named <target_name>.<arch>. The resulting
# images are combined into one binary with `LIBC_CLANG_OFFLOAD_PACKAGER`, and
# that binary is embedded into an otherwise empty host "stub" object library
# named <target_name>. If `LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU` or
# `LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX` is set, an additional
# <target_name>.__internal__ object library is built for
# `LIBC_GPU_TARGET_ARCHITECTURE` only.
#
# Usage:
#     _build_gpu_entrypoint_objects(
#       <target_name>
#       NAME <optional entrypoint name, defaults to the caller's
#             ADD_ENTRYPOINT_OBJ_NAME>
#       SRCS <exactly one .cpp file>
#       HDRS <list of .h files>
#       DEPENDS <list of dependencies>
#       COMPILE_OPTIONS <optional list of special compile options for this target>
#       CXX_STANDARD <optional C++ standard recorded on the created targets>
#       FLAGS <optional list of flags>
#     )
#
# Besides its arguments this function reads the following variables from the
# enclosing scope: `include_dirs`, `fq_deps_list`, `ENTRYPOINT_OBJ_TARGET_TYPE`
# and, when NAME is not given, `ADD_ENTRYPOINT_OBJ_NAME`.
function(_build_gpu_entrypoint_objects fq_target_name)
  cmake_parse_arguments(
    "ADD_GPU_ENTRYPOINT_OBJ"
    "" # No optional arguments
    "NAME;CXX_STANDARD" # Single value arguments
    "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS" # Multi value arguments
    ${ARGN}
  )

  # The host stub is named after the source file and each packaged image is a
  # single object file, so exactly one source file is supported. Fail with a
  # clear message instead of an obscure error further down.
  list(LENGTH ADD_GPU_ENTRYPOINT_OBJ_SRCS num_gpu_srcs)
  if(NOT num_gpu_srcs EQUAL 1)
    message(FATAL_ERROR "_build_gpu_entrypoint_objects(${fq_target_name}) "
                        "requires exactly one source file in SRCS, got "
                        "${num_gpu_srcs}.")
  endif()

  # NAME is optional; fall back to the entrypoint name parsed by the caller.
  if(ADD_GPU_ENTRYPOINT_OBJ_NAME)
    set(entrypoint_name "${ADD_GPU_ENTRYPOINT_OBJ_NAME}")
  else()
    set(entrypoint_name "${ADD_ENTRYPOINT_OBJ_NAME}")
  endif()

  # Start from empty lists so that values of the same name in an enclosing
  # scope cannot leak into the packaging command.
  set(packager_images "")
  set(gpu_target_names "")

  # The packaged version will be built for every target GPU architecture. We do
  # this so we can support multiple accelerators on the same machine.
  foreach(gpu_arch ${all_gpu_architectures})
    set(gpu_target_name ${fq_target_name}.${gpu_arch})
    set(compile_options ${ADD_GPU_ENTRYPOINT_OBJ_COMPILE_OPTIONS})

    # Derive the triple from the specified architecture.
    if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
      set(gpu_target_triple "amdgcn-amd-amdhsa")
      list(APPEND compile_options "-mcpu=${gpu_arch}")
    elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
      set(gpu_target_triple "nvptx64-nvidia-cuda")
      list(APPEND compile_options "-march=${gpu_arch}")
    else()
      message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
    endif()
    list(APPEND compile_options "--target=${gpu_target_triple}")
    list(APPEND compile_options "-emit-llvm")

    # Build the library for this target architecture. We always emit LLVM-IR for
    # packaged GPU binaries.
    add_library(${gpu_target_name}
      EXCLUDE_FROM_ALL
      OBJECT
      ${ADD_GPU_ENTRYPOINT_OBJ_SRCS}
      ${ADD_GPU_ENTRYPOINT_OBJ_HDRS}
    )

    target_compile_options(${gpu_target_name} PRIVATE ${compile_options})
    target_include_directories(${gpu_target_name} PRIVATE ${include_dirs})
    # add_dependencies() rejects an empty dependency list.
    if(ADD_GPU_ENTRYPOINT_OBJ_DEPENDS)
      add_dependencies(${gpu_target_name} ${ADD_GPU_ENTRYPOINT_OBJ_DEPENDS})
    endif()
    target_compile_definitions(${gpu_target_name} PRIVATE LLVM_LIBC_PUBLIC_PACKAGING)

    # Append this target to a list of images to package into a single binary.
    set(input_file $<TARGET_OBJECTS:${gpu_target_name}>)
    list(APPEND packager_images
         --image=file=${input_file},arch=${gpu_arch},triple=${gpu_target_triple})
    list(APPEND gpu_target_names ${gpu_target_name})
  endforeach()

  # After building the target for the desired GPUs we must package the output
  # into a fatbinary, see https://clang.llvm.org/docs/OffloadingDesign.html for
  # more information.
  set(packaged_target_name ${fq_target_name}.__gpu__)
  set(packaged_output_name ${CMAKE_CURRENT_BINARY_DIR}/${fq_target_name}.gpubin)

  add_custom_command(OUTPUT ${packaged_output_name}
                     COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
                             ${packager_images} -o ${packaged_output_name}
                     DEPENDS ${gpu_target_names}
                     COMMENT "Packaging LLVM offloading binary"
                     VERBATIM)
  add_custom_target(${packaged_target_name} DEPENDS ${packaged_output_name})

  # We create an empty 'stub' file for the host to contain the embedded device
  # code. This will be packaged into 'libcgpu.a'.
  # TODO: In the future we will want to combine every architecture for a target
  #       into a single bitcode file and use that. For now we simply build for
  #       every single one and let the offloading linker handle it.
  get_filename_component(stub_filename "${ADD_GPU_ENTRYPOINT_OBJ_SRCS}" NAME)
  set(stub_path "${CMAKE_CURRENT_BINARY_DIR}/${stub_filename}")
  file(WRITE "${stub_path}" "// Empty file.\n")
  # The stub's object file embeds the packaged binary, so it has to be
  # recompiled whenever that binary changes.
  set_source_files_properties("${stub_path}"
    PROPERTIES OBJECT_DEPENDS "${packaged_output_name}")

  add_library(
    ${fq_target_name}
    # We want an object library as the objects will eventually get packaged into
    # an archive (like libcgpu.a).
    EXCLUDE_FROM_ALL
    OBJECT
    "${stub_path}"
  )
  target_compile_options(${fq_target_name} BEFORE PRIVATE
                         ${ADD_GPU_ENTRYPOINT_OBJ_COMPILE_OPTIONS}
                         -DLLVM_LIBC_PUBLIC_PACKAGING
                         -nostdlib -Xclang -fembed-offload-object=${packaged_output_name})
  target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
  # The packaged binary must exist before the stub that embeds it is compiled.
  add_dependencies(${fq_target_name}
                   ${ADD_GPU_ENTRYPOINT_OBJ_DEPENDS} ${packaged_target_name})

  # `ENTRYPOINT_OBJ_TARGET_TYPE` and `fq_deps_list` come from the enclosing
  # scope; they are not arguments of this function.
  set_target_properties(
    ${fq_target_name}
    PROPERTIES
      ENTRYPOINT_NAME "${entrypoint_name}"
      TARGET_TYPE "${ENTRYPOINT_OBJ_TARGET_TYPE}"
      OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
      CXX_STANDARD "${ADD_GPU_ENTRYPOINT_OBJ_CXX_STANDARD}"
      DEPS "${fq_deps_list}"
      FLAGS "${ADD_GPU_ENTRYPOINT_OBJ_FLAGS}"
  )

  # We only build the internal target for a single supported architecture.
  set(internal_target_name ${fq_target_name}.__internal__)
  # NOTE(review): the targets above use the `include_dirs` inherited from the
  # enclosing scope; presumably it holds this same value - confirm.
  set(include_dirs ${LIBC_BUILD_DIR}/include ${LIBC_SOURCE_DIR} ${LIBC_BUILD_DIR})
  if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU OR
     LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
    add_library(
      ${internal_target_name}
      EXCLUDE_FROM_ALL
      OBJECT
      ${ADD_GPU_ENTRYPOINT_OBJ_SRCS}
      ${ADD_GPU_ENTRYPOINT_OBJ_HDRS}
    )
    target_compile_options(${internal_target_name} BEFORE PRIVATE
                           ${ADD_GPU_ENTRYPOINT_OBJ_COMPILE_OPTIONS}
                           --target=${LIBC_GPU_TARGET_TRIPLE})
    if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
      target_compile_options(${internal_target_name} PRIVATE -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE})
    elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
      target_compile_options(${internal_target_name} PRIVATE -march=${LIBC_GPU_TARGET_ARCHITECTURE})
    endif()
    target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
    # add_dependencies() rejects an empty dependency list.
    if(ADD_GPU_ENTRYPOINT_OBJ_DEPENDS)
      add_dependencies(${internal_target_name} ${ADD_GPU_ENTRYPOINT_OBJ_DEPENDS})
    endif()
    set_target_properties(
      ${internal_target_name}
      PROPERTIES
        CXX_STANDARD "${ADD_GPU_ENTRYPOINT_OBJ_CXX_STANDARD}"
        FLAGS "${ADD_GPU_ENTRYPOINT_OBJ_FLAGS}"
    )
    # Expose the single-architecture objects through the public target.
    set_target_properties(
      ${fq_target_name}
      PROPERTIES OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
    )
  endif()
endfunction()
197+
68198
# Rule which is essentially a wrapper over add_library to compile a set of
69199
# sources to object files.
70200
# Usage:
@@ -127,7 +257,6 @@ function(create_object_library fq_target_name)
127257
if(NOT ADD_OBJECT_CXX_STANDARD)
128258
set(ADD_OBJECT_CXX_STANDARD ${CMAKE_CXX_STANDARD})
129259
endif()
130-
131260
set_target_properties(
132261
${fq_target_name}
133262
PROPERTIES
@@ -350,53 +479,67 @@ function(create_entrypoint_object fq_target_name)
350479
endif()
351480
endif()
352481

353-
add_library(
354-
${internal_target_name}
355-
# TODO: We don't need an object library for internal consumption.
356-
# A future change should switch this to a normal static library.
357-
EXCLUDE_FROM_ALL
358-
OBJECT
359-
${ADD_ENTRYPOINT_OBJ_SRCS}
360-
${ADD_ENTRYPOINT_OBJ_HDRS}
361-
)
362-
target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
363-
target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
364-
add_dependencies(${internal_target_name} ${full_deps_list})
365-
set_target_properties(
366-
${internal_target_name}
367-
PROPERTIES
482+
# GPU builds require special handling for the objects because we want to
483+
# export several different targets at once, e.g. for both Nvidia and AMD.
484+
if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
485+
_build_gpu_entrypoint_objects(
486+
${fq_target_name}
487+
SRCS ${ADD_ENTRYPOINT_OBJ_SRCS}
488+
HDRS ${ADD_ENTRYPOINT_OBJ_HDRS}
489+
COMPILE_OPTIONS ${common_compile_options}
490+
DEPENDS ${full_deps_list}
368491
CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
369492
FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
370-
)
493+
)
494+
else()
495+
add_library(
496+
${internal_target_name}
497+
# TODO: We don't need an object library for internal consumption.
498+
# A future change should switch this to a normal static library.
499+
EXCLUDE_FROM_ALL
500+
OBJECT
501+
${ADD_ENTRYPOINT_OBJ_SRCS}
502+
${ADD_ENTRYPOINT_OBJ_HDRS}
503+
)
504+
target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
505+
target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
506+
add_dependencies(${internal_target_name} ${full_deps_list})
507+
set_target_properties(
508+
${internal_target_name}
509+
PROPERTIES
510+
CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
511+
FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
512+
)
371513

372-
add_library(
373-
${fq_target_name}
374-
# We want an object library as the objects will eventually get packaged into
375-
# an archive (like libc.a).
376-
EXCLUDE_FROM_ALL
377-
OBJECT
378-
${ADD_ENTRYPOINT_OBJ_SRCS}
379-
${ADD_ENTRYPOINT_OBJ_HDRS}
380-
)
381-
target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLLVM_LIBC_PUBLIC_PACKAGING)
382-
target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
383-
add_dependencies(${fq_target_name} ${full_deps_list})
514+
add_library(
515+
${fq_target_name}
516+
# We want an object library as the objects will eventually get packaged into
517+
# an archive (like libc.a).
518+
EXCLUDE_FROM_ALL
519+
OBJECT
520+
${ADD_ENTRYPOINT_OBJ_SRCS}
521+
${ADD_ENTRYPOINT_OBJ_HDRS}
522+
)
523+
target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLLVM_LIBC_PUBLIC_PACKAGING)
524+
target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
525+
add_dependencies(${fq_target_name} ${full_deps_list})
384526

385-
set_target_properties(
386-
${fq_target_name}
387-
PROPERTIES
388-
ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
389-
TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
390-
OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
391-
# TODO: We don't need to list internal object files if the internal
392-
# target is a normal static library.
393-
OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
394-
CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
395-
DEPS "${fq_deps_list}"
396-
FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
397-
)
527+
set_target_properties(
528+
${fq_target_name}
529+
PROPERTIES
530+
ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
531+
TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
532+
OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
533+
# TODO: We don't need to list internal object files if the internal
534+
# target is a normal static library.
535+
OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
536+
CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
537+
DEPS "${fq_deps_list}"
538+
FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
539+
)
540+
endif()
398541

399-
if(LLVM_LIBC_ENABLE_LINTING)
542+
if(LLVM_LIBC_ENABLE_LINTING AND TARGET ${internal_target_name})
400543
if(NOT LLVM_LIBC_CLANG_TIDY)
401544
message(FATAL_ERROR "Something is wrong! LLVM_LIBC_ENABLE_LINTING is "
402545
"ON but LLVM_LIBC_CLANG_TIDY is not set.")

libc/cmake/modules/prepare_libc_gpu_build.cmake

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
44
endif()
55

66
# Set up the target architectures to build the GPU libc for.
7-
set(all_gpu_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62;"
8-
"sm_70;sm_72;sm_75;sm_80;sm_86;gfx700;gfx701;gfx801;"
9-
"gfx803;gfx900;gfx902;gfx906;gfx908;gfx90a;gfx90c;"
10-
"gfx940;gfx1010;gfx1030;gfx1031;gfx1032;gfx1033;"
11-
"gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;"
12-
"gfx1103")
7+
# AMDGPU architectures the GPU libc can be built for. The entries are written
# as separate list arguments: joining quoted fragments that end in ';' would
# leave empty elements in the list.
set(all_amdgpu_architectures
  gfx700 gfx701 gfx801 gfx803 gfx900 gfx902 gfx906
  gfx908 gfx90a gfx90c gfx940 gfx1010 gfx1030
  gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036
  gfx1100 gfx1101 gfx1102 gfx1103)
# NVPTX architectures the GPU libc can be built for.
set(all_nvptx_architectures
  sm_35 sm_37 sm_50 sm_52 sm_53 sm_60 sm_61 sm_62
  sm_70 sm_72 sm_75 sm_80 sm_86)
# Every supported GPU architecture, AMDGPU entries first.
set(all_gpu_architectures
  ${all_amdgpu_architectures} ${all_nvptx_architectures})
1315
set(LIBC_GPU_ARCHITECTURES ${all_gpu_architectures} CACHE STRING
1416
"List of GPU architectures to build the libc for.")
1517
if(LIBC_GPU_ARCHITECTURES STREQUAL "all")
@@ -29,6 +31,15 @@ if(NOT LLVM_LIBC_FULL_BUILD)
2931
"GPU.")
3032
endif()
3133

34+
# Identify the program used to package multiple images into a single binary.
# The LLVM build tree is passed as a hint so that a packager located there is
# preferred over one that merely happens to be on the system search path; the
# system locations are still searched as a fallback.
find_program(LIBC_CLANG_OFFLOAD_PACKAGER
  NAMES clang-offload-packager
  HINTS "${LLVM_BINARY_DIR}/bin")
if(NOT LIBC_CLANG_OFFLOAD_PACKAGER)
  message(FATAL_ERROR "Cannot find the 'clang-offload-packager' for the GPU "
                      "build")
endif()
42+
3243
# Identify any locally installed AMD GPUs on the system to use for testing.
3344
find_program(LIBC_AMDGPU_ARCH
3445
NAMES amdgpu-arch

libc/src/__support/common.h

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,17 +29,15 @@
2929
#define LIBC_INLINE inline
3030
#endif
3131

32-
// We use OpenMP to declare these functions on the device.
33-
#define STR(X) #X
34-
#define LLVM_LIBC_DECLARE_DEVICE(name) \
35-
_Pragma(STR(omp declare target to(name) device_type(nohost)))
32+
#if defined(__AMDGPU__) || defined(__NVPTX__)
33+
#define PACKAGE_FOR_GPU
34+
#endif
3635

37-
// GPU targets do not support aliasing and must be declared on the device.
38-
#if defined(LLVM_LIBC_PUBLIC_PACKAGING) && defined(_OPENMP)
36+
// GPU targets do not support aliasing.
37+
#if defined(LLVM_LIBC_PUBLIC_PACKAGING) && defined(PACKAGE_FOR_GPU)
3938
#define LLVM_LIBC_FUNCTION(type, name, arglist) \
4039
LLVM_LIBC_FUNCTION_ATTR decltype(__llvm_libc::name) \
4140
__##name##_impl__ __asm__(#name); \
42-
LLVM_LIBC_DECLARE_DEVICE(__##name##_impl__) \
4341
type __##name##_impl__ arglist
4442
// MacOS needs to be excluded because it does not support aliasing.
4543
#elif defined(LLVM_LIBC_PUBLIC_PACKAGING) && (!defined(__APPLE__))

0 commit comments

Comments
 (0)