[libc] Remove OpenMP and build the GPU libc directly

jhuber6 · jhuber6 · commit 6d0e1373589a · 2023-02-02T09:47:03.000-06:00
The current `libcgpu.a` is actually an archive of fatbinaries. Each host file contains nothing but a section called `LLVM_OFFLOADING` that holds the embedded device code. This used to be handled implicitly by borrowing the OpenMP toolchain, which did this packaging internally. However, passing the OpenMP flags causes problems when trying to move to testing. This patch pulls this logic out into the CMake and handles it manually. This patch is a lot of noise, but it fundamentally comes down to the following changes: 1. Build the source for every GPU architecture (GPU architectures are generally not backwards compatible). 2. Combine all of these files into a single binary blob. 3. Embed that binary blob into a host file. 4. Package these host files into a `.a` archive. 5. The device code will be extracted and managed by the offloading linker. Another important point: right now we are maintaining an important distinction within the GPU build. When we build the exported library, we build for many GPU architectures. However, the internal version will only be built for a single GPU architecture, one that was found on the user's system. This is intended to be used for internal testing, very similar to the current path where `libc` is compiled for a single target triple. Reviewed By: sivachandra Differential Revision: https://reviews.llvm.org/D143089
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -52,19 +52,149 @@ function(_get_common_compile_options output_var flags)
     endif()
   endif()
   if (LIBC_TARGET_ARCHITECTURE_IS_GPU)
-    list(APPEND compile_options "-fopenmp")
-    list(APPEND compile_options "-fopenmp-cuda-mode")
-    foreach(gpu_arch ${LIBC_GPU_ARCHITECTURES})
-      list(APPEND compile_options "--offload-arch=${gpu_arch}")
-    endforeach()
     list(APPEND compile_options "-nogpulib")
-    list(APPEND compile_options "-nogpuinc")
     list(APPEND compile_options "-fvisibility=hidden")
-    list(APPEND compile_options "-foffload-lto")
   endif()
   set(${output_var} ${compile_options} PARENT_SCOPE)
 endfunction()
 
+# Builds the entrypoint target for the GPU.
+#
+# Every architecture in LIBC_GPU_ARCHITECTURES gets an object library
+# '<target_name>.<arch>' holding LLVM-IR. These are bundled into one offloading
+# binary that is embedded into the host stub object '<target_name>'.
+# ADD_ENTRYPOINT_OBJ_NAME, fq_deps_list and ENTRYPOINT_OBJ_TARGET_TYPE are not
+# arguments; they are read from the scope of the caller.
+# Usage:
+#     _build_gpu_entrypoint_objects(
+#       <target_name>
+#       SRCS <list of .cpp files>
+#       HDRS <list of .h files>
+#       DEPENDS <list of dependencies>
+#       COMPILE_OPTIONS <optional list of special compile options for this target>
+#       CXX_STANDARD <C++ standard used to compile the sources>
+#       FLAGS <optional list of flags>
+#     )
+function(_build_gpu_entrypoint_objects fq_target_name)
+  cmake_parse_arguments(
+    "ADD_GPU_ENTRYPOINT_OBJ"
+    "" # No optional arguments
+    "NAME;CXX_STANDARD" # Single value arguments
+    "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS"  # Multi value arguments
+    ${ARGN})
+
+  # Set before the first target is created so that the packaged and the internal
+  # objects are all compiled with the same include paths.
+  set(include_dirs ${LIBC_BUILD_DIR}/include ${LIBC_SOURCE_DIR} ${LIBC_BUILD_DIR})
+
+  # The packaged version will be built for every requested GPU architecture. We
+  # do this so we can support multiple accelerators on the same machine.
+  foreach(gpu_arch ${LIBC_GPU_ARCHITECTURES})
+    set(gpu_target_name ${fq_target_name}.${gpu_arch})
+    set(compile_options ${ADD_GPU_ENTRYPOINT_OBJ_COMPILE_OPTIONS})
+    # Derive the triple from the specified architecture.
+    if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
+      set(gpu_target_triple "amdgcn-amd-amdhsa")
+      list(APPEND compile_options "-mcpu=${gpu_arch}")
+    elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
+      set(gpu_target_triple "nvptx64-nvidia-cuda")
+      list(APPEND compile_options "-march=${gpu_arch}")
+    else()
+      message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
+    endif()
+    list(APPEND compile_options "--target=${gpu_target_triple}" "-emit-llvm")
+
+    # Build the library for this target architecture. We always emit LLVM-IR for
+    # packaged GPU binaries.
+    add_library(${gpu_target_name} EXCLUDE_FROM_ALL OBJECT
+      ${ADD_GPU_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_GPU_ENTRYPOINT_OBJ_HDRS}
+    )
+    target_compile_options(${gpu_target_name} PRIVATE ${compile_options})
+    target_include_directories(${gpu_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${gpu_target_name} ${ADD_GPU_ENTRYPOINT_OBJ_DEPENDS})
+    target_compile_definitions(${gpu_target_name} PRIVATE LLVM_LIBC_PUBLIC_PACKAGING)
+    # The device code is what gets compiled from SRCS, so apply the standard here.
+    set_target_properties(${gpu_target_name} PROPERTIES CXX_STANDARD ${ADD_GPU_ENTRYPOINT_OBJ_CXX_STANDARD})
+
+    # Append this target to a list of images to package into a single binary.
+    set(input_file $<TARGET_OBJECTS:${gpu_target_name}>)
+    list(APPEND packager_images
+         --image=file=${input_file},arch=${gpu_arch},triple=${gpu_target_triple})
+    list(APPEND gpu_target_names ${gpu_target_name})
+    list(APPEND gpu_object_files ${input_file})
+  endforeach()
+
+  # After building the target for the desired GPUs we must package the output
+  # into a fatbinary, see https://clang.llvm.org/docs/OffloadingDesign.html for
+  # more information.
+  set(packaged_target_name ${fq_target_name}.__gpu__)
+  set(packaged_output_name ${CMAKE_CURRENT_BINARY_DIR}/${fq_target_name}.gpubin)
+
+  # Naming an object library in DEPENDS only orders the build, so the object
+  # files are listed too in order to repackage whenever they are rebuilt.
+  add_custom_command(OUTPUT ${packaged_output_name}
+                     COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
+                             ${packager_images} -o ${packaged_output_name}
+                     DEPENDS ${gpu_target_names} ${gpu_object_files}
+                     COMMENT "Packaging LLVM offloading binary")
+  add_custom_target(${packaged_target_name} DEPENDS ${packaged_output_name})
+
+  # We create an empty 'stub' file for the host to contain the embedded device
+  # code. This will be packaged into 'libcgpu.a'.
+  # TODO: In the future we will want to combine every architecture for a target
+  #       into a single bitcode file and use that. For now we simply build for
+  #       every single one and let the offloading linker handle it.
+  get_filename_component(stub_filename ${ADD_GPU_ENTRYPOINT_OBJ_SRCS} NAME)
+  set(stub_file "${CMAKE_CURRENT_BINARY_DIR}/${stub_filename}")
+  file(WRITE "${stub_file}" "// Empty file.\n")
+  # Recompile the stub whenever the binary embedded into it is repackaged.
+  set_source_files_properties("${stub_file}" PROPERTIES
+                              OBJECT_DEPENDS "${packaged_output_name}")
+  # We want an object library as the objects will eventually get packaged into
+  # an archive (like libcgpu.a).
+  add_library(${fq_target_name} EXCLUDE_FROM_ALL OBJECT "${stub_file}")
+  target_compile_options(${fq_target_name} BEFORE PRIVATE
+                         ${ADD_GPU_ENTRYPOINT_OBJ_COMPILE_OPTIONS} -DLLVM_LIBC_PUBLIC_PACKAGING
+                         -nostdlib -Xclang -fembed-offload-object=${packaged_output_name})
+  target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
+  add_dependencies(${fq_target_name} ${ADD_GPU_ENTRYPOINT_OBJ_DEPENDS} ${packaged_target_name})
+
+  set_target_properties(${fq_target_name} PROPERTIES
+    ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
+    TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
+    OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
+    CXX_STANDARD ${ADD_GPU_ENTRYPOINT_OBJ_CXX_STANDARD}
+    DEPS "${fq_deps_list}"
+    FLAGS "${ADD_GPU_ENTRYPOINT_OBJ_FLAGS}"
+  )
+
+  # We only build the internal target for a single supported architecture.
+  set(internal_target_name ${fq_target_name}.__internal__)
+  if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU OR
+     LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+    add_library(${internal_target_name} EXCLUDE_FROM_ALL OBJECT
+      ${ADD_GPU_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_GPU_ENTRYPOINT_OBJ_HDRS}
+    )
+    target_compile_options(${internal_target_name} BEFORE PRIVATE
+                           ${ADD_GPU_ENTRYPOINT_OBJ_COMPILE_OPTIONS} --target=${LIBC_GPU_TARGET_TRIPLE})
+    if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
+      target_compile_options(${internal_target_name} PRIVATE -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE})
+    elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+      target_compile_options(${internal_target_name} PRIVATE -march=${LIBC_GPU_TARGET_ARCHITECTURE})
+    endif()
+    target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${internal_target_name} ${ADD_GPU_ENTRYPOINT_OBJ_DEPENDS})
+    set_target_properties(${internal_target_name} PROPERTIES
+      CXX_STANDARD ${ADD_GPU_ENTRYPOINT_OBJ_CXX_STANDARD}
+      FLAGS "${ADD_GPU_ENTRYPOINT_OBJ_FLAGS}"
+    )
+    set_target_properties(${fq_target_name} PROPERTIES
+      OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>")
+  endif()
+endfunction()
+
 # Rule which is essentially a wrapper over add_library to compile a set of
 # sources to object files.
 # Usage:
@@ -127,7 +257,6 @@ function(create_object_library fq_target_name)
   if(NOT ADD_OBJECT_CXX_STANDARD)
     set(ADD_OBJECT_CXX_STANDARD ${CMAKE_CXX_STANDARD})
   endif()
-  
   set_target_properties(
     ${fq_target_name}
     PROPERTIES
@@ -350,53 +479,67 @@ function(create_entrypoint_object fq_target_name)
     endif()
   endif()
 
-  add_library(
-    ${internal_target_name}
-    # TODO: We don't need an object library for internal consumption.
-    # A future change should switch this to a normal static library.
-    EXCLUDE_FROM_ALL
-    OBJECT
-    ${ADD_ENTRYPOINT_OBJ_SRCS}
-    ${ADD_ENTRYPOINT_OBJ_HDRS}
-  )
-  target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
-  target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
-  add_dependencies(${internal_target_name} ${full_deps_list})
-  set_target_properties(
-    ${internal_target_name}
-    PROPERTIES
+  # GPU builds require special handling for the objects because we want to
+  # export several different targets at once, e.g. for both Nvidia and AMD.
+  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+    _build_gpu_entrypoint_objects(
+      ${fq_target_name}
+      SRCS ${ADD_ENTRYPOINT_OBJ_SRCS}
+      HDRS ${ADD_ENTRYPOINT_OBJ_HDRS}
+      COMPILE_OPTIONS ${common_compile_options}
+      DEPENDS ${full_deps_list}
       CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
       FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
-  )
+    )
+  else()
+    add_library(
+      ${internal_target_name}
+      # TODO: We don't need an object library for internal consumption.
+      # A future change should switch this to a normal static library.
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_ENTRYPOINT_OBJ_HDRS}
+    )
+    target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
+    target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${internal_target_name} ${full_deps_list})
+    set_target_properties(
+      ${internal_target_name}
+      PROPERTIES
+        CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
+        FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
+    )
 
-  add_library(
-    ${fq_target_name}
-    # We want an object library as the objects will eventually get packaged into
-    # an archive (like libc.a).
-    EXCLUDE_FROM_ALL
-    OBJECT
-    ${ADD_ENTRYPOINT_OBJ_SRCS}
-    ${ADD_ENTRYPOINT_OBJ_HDRS}
-  )
-  target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLLVM_LIBC_PUBLIC_PACKAGING)
-  target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
-  add_dependencies(${fq_target_name} ${full_deps_list})
+    add_library(
+      ${fq_target_name}
+      # We want an object library as the objects will eventually get packaged into
+      # an archive (like libc.a).
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_ENTRYPOINT_OBJ_HDRS}
+    )
+    target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLLVM_LIBC_PUBLIC_PACKAGING)
+    target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${fq_target_name} ${full_deps_list})
 
-  set_target_properties(
-    ${fq_target_name}
-    PROPERTIES
-      ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
-      TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
-      OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
-      # TODO: We don't need to list internal object files if the internal
-      # target is a normal static library.
-      OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
-      CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
-      DEPS "${fq_deps_list}"
-      FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
-  )
+    set_target_properties(
+      ${fq_target_name}
+      PROPERTIES
+        ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
+        TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
+        OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
+        # TODO: We don't need to list internal object files if the internal
+        # target is a normal static library.
+        OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
+        CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
+        DEPS "${fq_deps_list}"
+        FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
+    )
+  endif()
 
-  if(LLVM_LIBC_ENABLE_LINTING)
+  if(LLVM_LIBC_ENABLE_LINTING AND TARGET ${internal_target_name})
     if(NOT LLVM_LIBC_CLANG_TIDY)
       message(FATAL_ERROR "Something is wrong!  LLVM_LIBC_ENABLE_LINTING is "
               "ON but LLVM_LIBC_CLANG_TIDY is not set.")
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -4,12 +4,14 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
 endif()
 
 # Set up the target architectures to build the GPU libc for.
-set(all_gpu_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62;"
-                          "sm_70;sm_72;sm_75;sm_80;sm_86;gfx700;gfx701;gfx801;"
-                          "gfx803;gfx900;gfx902;gfx906;gfx908;gfx90a;gfx90c;"
-                          "gfx940;gfx1010;gfx1030;gfx1031;gfx1032;gfx1033;"
-                          "gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;"
-                          "gfx1103")
+# set() joins its arguments with ';', so the strings must not end in one.
+set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
+                             "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030"
+                             "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036"
+                             "gfx1100;gfx1101;gfx1102;gfx1103")
+set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
+                            "sm_70;sm_72;sm_75;sm_80;sm_86")
+set(all_gpu_architectures ${all_amdgpu_architectures} ${all_nvptx_architectures})
 set(LIBC_GPU_ARCHITECTURES ${all_gpu_architectures} CACHE STRING
     "List of GPU architectures to build the libc for.")
 if(LIBC_GPU_ARCHITECTURES STREQUAL "all")
@@ -29,6 +31,15 @@ if(NOT LLVM_LIBC_FULL_BUILD)
                       "GPU.")
 endif()
 
+# Identify the program used to package multiple images into a single binary.
+# HINTS are searched before the system paths, so this build's tool is preferred.
+find_program(LIBC_CLANG_OFFLOAD_PACKAGER
+             NAMES clang-offload-packager
+             HINTS ${LLVM_BINARY_DIR}/bin)
+if(NOT LIBC_CLANG_OFFLOAD_PACKAGER)
+  message(FATAL_ERROR "Cannot find the 'clang-offload-packager' for the GPU build")
+endif()
+
 # Identify any locally installed AMD GPUs on the system to use for testing.
 find_program(LIBC_AMDGPU_ARCH
              NAMES amdgpu-arch
diff --git a/libc/src/__support/common.h b/libc/src/__support/common.h
@@ -29,17 +29,15 @@
 #define LIBC_INLINE inline
 #endif
 
-// We use OpenMP to declare these functions on the device.
-#define STR(X) #X
-#define LLVM_LIBC_DECLARE_DEVICE(name)                                         \
-  _Pragma(STR(omp declare target to(name) device_type(nohost)))
+#if defined(__AMDGPU__) || defined(__NVPTX__)
+#define PACKAGE_FOR_GPU
+#endif
 
-// GPU targets do not support aliasing and must be declared on the device.
-#if defined(LLVM_LIBC_PUBLIC_PACKAGING) && defined(_OPENMP)
+// GPU targets do not support aliasing.
+#if defined(LLVM_LIBC_PUBLIC_PACKAGING) && defined(PACKAGE_FOR_GPU)
 #define LLVM_LIBC_FUNCTION(type, name, arglist)                                \
   LLVM_LIBC_FUNCTION_ATTR decltype(__llvm_libc::name)                          \
       __##name##_impl__ __asm__(#name);                                        \
-  LLVM_LIBC_DECLARE_DEVICE(__##name##_impl__)                                  \
   type __##name##_impl__ arglist
 // MacOS needs to be excluded because it does not support aliasing.
 #elif defined(LLVM_LIBC_PUBLIC_PACKAGING) && (!defined(__APPLE__))