Skip to content

[LinkerWrapper] Support device binaries in multiple link jobs #72442

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8692,12 +8692,10 @@ void OffloadPackager::ConstructJob(Compilation &C, const JobAction &JA,
}
}

// TODO: We need to pass in the full target-id and handle it properly in the
// linker wrapper.
SmallVector<std::string> Parts{
"file=" + File.str(),
"triple=" + TC->getTripleString(),
"arch=" + getProcessorFromTargetID(TC->getTriple(), Arch).str(),
"arch=" + Arch.str(),
"kind=" + Kind.str(),
};

Expand Down
2 changes: 1 addition & 1 deletion clang/test/Driver/amdgpu-openmp-toolchain.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@

// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a:sramecc-:xnack+ \
// RUN: -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID
// CHECK-TARGET-ID: clang-offload-packager{{.*}}arch=gfx90a,kind=openmp,feature=-sramecc,feature=+xnack
// CHECK-TARGET-ID: clang-offload-packager{{.*}}arch=gfx90a:sramecc-:xnack+,kind=openmp,feature=-sramecc,feature=+xnack

// RUN: not %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a,gfx90a:xnack+ \
// RUN: -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID-ERROR
Expand Down
15 changes: 15 additions & 0 deletions clang/test/Driver/linker-wrapper.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
// REQUIRES: nvptx-registered-target
// REQUIRES: amdgpu-registered-target

// An externally visible variable so static libraries extract.
__attribute__((visibility("protected"), used)) int x;

// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.elf.o
// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -o %t.nvptx.bc
// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amdgpu.bc
Expand Down Expand Up @@ -36,6 +39,18 @@

// AMDGPU-LINK: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o

// RUN: clang-offload-packager -o %t-lib.out \
// RUN: --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a
// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
// RUN: llvm-ar rcs %t.a %t.o
// RUN: clang-offload-packager -o %t.out \
// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a:xnack+
// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
// RUN: --linker-path=/usr/bin/ld -- %t.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LINK-ID

// AMDGPU-LINK-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o

// RUN: clang-offload-packager -o %t.out \
// RUN: --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 \
// RUN: --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
Expand Down
93 changes: 56 additions & 37 deletions clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1009,9 +1009,13 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
for (Arg *A : Args)
DAL.append(A);

// Set the subarchitecture and target triple for this compilation.
// Set the subarchitecture and target triple for this compilation. The input
// may be an AMDGPU target-id, so we keep only the processor name that precedes
// the first colon.
const OptTable &Tbl = getOptTable();
DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_arch_EQ),
Args.MakeArgString(
Input.front().getBinary()->getArch().split(':').first));
DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_full_arch_EQ),
Args.MakeArgString(Input.front().getBinary()->getArch()));
DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_triple_EQ),
Args.MakeArgString(Input.front().getBinary()->getTriple()));
Expand Down Expand Up @@ -1041,23 +1045,13 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
/// Transforms all the extracted offloading input files into an image that can
/// be registered by the runtime.
Expected<SmallVector<StringRef>>
linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
linkAndWrapDeviceFiles(SmallVector<SmallVector<OffloadFile>> &LinkerInputFiles,
const InputArgList &Args, char **Argv, int Argc) {
llvm::TimeTraceScope TimeScope("Handle all device input");

DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> InputMap;
for (auto &File : LinkerInputFiles)
InputMap[File].emplace_back(std::move(File));
LinkerInputFiles.clear();

SmallVector<SmallVector<OffloadFile>> InputsForTarget;
for (auto &[ID, Input] : InputMap)
InputsForTarget.emplace_back(std::move(Input));
InputMap.clear();

std::mutex ImageMtx;
DenseMap<OffloadKind, SmallVector<OffloadingImage>> Images;
auto Err = parallelForEachError(InputsForTarget, [&](auto &Input) -> Error {
auto Err = parallelForEachError(LinkerInputFiles, [&](auto &Input) -> Error {
llvm::TimeTraceScope TimeScope("Link device input");

// Each thread needs its own copy of the base arguments to maintain
Expand Down Expand Up @@ -1115,7 +1109,7 @@ linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
TheImage.StringData["triple"] =
Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_triple_EQ));
TheImage.StringData["arch"] =
Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_arch_EQ));
Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_full_arch_EQ));
TheImage.Image = std::move(*FileOrErr);

Images[Kind].emplace_back(std::move(TheImage));
Expand Down Expand Up @@ -1334,7 +1328,8 @@ Expected<bool> getSymbols(StringRef Image, OffloadKind Kind, bool IsArchive,
/// and add it to the list of files to be linked. Files coming from static
/// libraries are only added to the input if they are used by an existing
/// input file.
Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
Expected<SmallVector<SmallVector<OffloadFile>>>
getDeviceInput(const ArgList &Args) {
llvm::TimeTraceScope TimeScope("ExtractDeviceCode");

StringRef Root = Args.getLastArgValue(OPT_sysroot_EQ);
Expand All @@ -1346,7 +1341,7 @@ Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
StringSaver Saver(Alloc);

// Try to extract device code from the linker input files.
SmallVector<OffloadFile> InputFiles;
DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> InputMap;
DenseMap<OffloadFile::TargetID, DenseMap<StringRef, Symbol>> Syms;
bool WholeArchive = false;
for (const opt::Arg *Arg : Args.filtered(
Expand Down Expand Up @@ -1393,36 +1388,60 @@ Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
if (!Binary.getBinary())
continue;

// If we don't have an object file for this architecture do not
// extract.
if (IsArchive && !WholeArchive && !Syms.count(Binary))
continue;

Expected<bool> ExtractOrErr =
getSymbols(Binary.getBinary()->getImage(),
Binary.getBinary()->getOffloadKind(), IsArchive, Saver,
Syms[Binary]);
if (!ExtractOrErr)
return ExtractOrErr.takeError();

Extracted = !WholeArchive && *ExtractOrErr;
// Initialize the map with an empty set of inputs.
OffloadFile::TargetID BinaryID =
OffloadFile::TargetID(Saver.save(Binary.getBinary()->getTriple()),
Saver.save(Binary.getBinary()->getArch()));
if (!InputMap.count(BinaryID))
InputMap[BinaryID] = SmallVector<OffloadFile>();

// We need to compare this binary input with every input architecture
// and copy it in if it's compatible. This allows a single binary to
// participate in multiple link jobs.
DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> NewInputMap;
for (const auto &[ID, Input] : InputMap) {
// If we don't have an object file for this architecture do not
// extract.
if (IsArchive && !WholeArchive && Input.empty())
continue;

// We only add the input if the binary is compatible with the slot.
if (!areTargetsCompatible(Binary, ID))
continue;

Expected<bool> ExtractOrErr = getSymbols(
Binary.getBinary()->getImage(),
Binary.getBinary()->getOffloadKind(), IsArchive, Saver, Syms[ID]);
if (!ExtractOrErr)
return ExtractOrErr.takeError();

Extracted = !WholeArchive && *ExtractOrErr;

if (!IsArchive || WholeArchive || Extracted) {
auto NewBinaryOrErr = Binary.copy();
if (!NewBinaryOrErr)
return NewBinaryOrErr.takeError();
NewInputMap[ID].emplace_back(std::move(*NewBinaryOrErr));
}
}

if (!IsArchive || WholeArchive || Extracted)
InputFiles.emplace_back(std::move(Binary));
for (auto &[NewID, NewInput] : NewInputMap)
InputMap[NewID].append(std::make_move_iterator(NewInput.begin()),
std::make_move_iterator(NewInput.end()));

Binary.takeBinary();
// If we extracted any files we need to check all the symbols again.
if (Extracted)
break;
}
}
}

for (StringRef Library : Args.getAllArgValues(OPT_bitcode_library_EQ)) {
auto FileOrErr = getInputBitcodeLibrary(Library);
if (!FileOrErr)
return FileOrErr.takeError();
InputFiles.push_back(std::move(*FileOrErr));
}
SmallVector<SmallVector<OffloadFile>> InputFiles;
for (auto &[ID, Input] : InputMap)
if (!Input.empty())
InputFiles.emplace_back(std::move(Input));
InputMap.clear();

return std::move(InputFiles);
}
Expand Down
3 changes: 3 additions & 0 deletions clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ def wrapper_jobs : Joined<["--"], "wrapper-jobs=">,
def arch_EQ : Joined<["--"], "arch=">,
Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"<arch>">,
HelpText<"The device subarchitecture">;
// Hidden, device-only option holding the unmodified architecture string of the
// device binary (for AMDGPU this may be a full target-id such as
// "gfx90a:xnack+").
def full_arch_EQ : Joined<["--"], "full-arch=">,
  Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"<arch>">,
  HelpText<"The fully qualified device subarchitecture for AMD's target ID">;
def triple_EQ : Joined<["--"], "triple=">,
Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"<triple>">,
HelpText<"The device target triple">;
Expand Down
33 changes: 33 additions & 0 deletions llvm/include/llvm/Object/OffloadBinary.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#ifndef LLVM_OBJECT_OFFLOADBINARY_H
#define LLVM_OBJECT_OFFLOADBINARY_H

#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
Expand Down Expand Up @@ -156,19 +157,51 @@ class OffloadBinary : public Binary {
/// owns its memory.
class OffloadFile : public OwningBinary<OffloadBinary> {
public:
  /// An ordered pair of the target triple and the architecture.
  using TargetID = std::pair<StringRef, StringRef>;

  /// Takes ownership of the parsed \p Binary and of the \p Buffer handed in
  /// alongside it.
  OffloadFile(std::unique_ptr<OffloadBinary> Binary,
              std::unique_ptr<MemoryBuffer> Buffer)
      : OwningBinary<OffloadBinary>(std::move(Binary), std::move(Buffer)) {}

  /// Creates an independent copy of this file: the underlying bytes are
  /// duplicated into a fresh buffer and parsed again, so the result does not
  /// share storage with the original.
  ///
  /// \returns the new file, or the error reported by OffloadBinary::create if
  /// the duplicated buffer cannot be parsed.
  Expected<OffloadFile> copy() const {
    MemoryBufferRef Ref = getBinary()->getMemoryBufferRef();
    // Carry the buffer identifier over to the copy; otherwise the duplicate
    // would be left with an empty buffer name.
    std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBufferCopy(
        Ref.getBuffer(), Ref.getBufferIdentifier());
    auto NewBinaryOrErr = OffloadBinary::create(*Buffer);
    if (!NewBinaryOrErr)
      return NewBinaryOrErr.takeError();
    return OffloadFile(std::move(*NewBinaryOrErr), std::move(Buffer));
  }

  /// We use the Triple and Architecture pair to group linker inputs together.
  /// This conversion function lets us use these inputs in a hash-map.
  ///
  /// The returned StringRefs come straight from the owned binary's
  /// getTriple() / getArch() accessors.
  operator TargetID() const {
    return std::make_pair(getBinary()->getTriple(), getBinary()->getArch());
  }
};

/// Queries if the target \p LHS is compatible with \p RHS for linking purposes.
///
/// Identical targets are always compatible. Otherwise the triples must match
/// and the target must be AMDGPU, in which case the architectures are treated
/// as target-ids and \p LHS is compatible when it names the same processor as
/// \p RHS and every feature it specifies is also specified by \p RHS.
///
/// \param LHS the (triple, architecture) pair being tested.
/// \param RHS the (triple, architecture) pair to test against.
/// \returns true if \p LHS may be linked together with \p RHS.
inline bool areTargetsCompatible(const OffloadFile::TargetID LHS,
                                 const OffloadFile::TargetID RHS) {
  if (LHS == RHS)
    return true;

  // Binaries built for different triples can never be linked together.
  if (LHS.first != RHS.first)
    return false;

  // If the target is AMD we check the target IDs for compatibility. A target id
  // is a string conforming to the following BNF syntax:
  //
  //  target-id ::= '<arch> ( : <feature> ( '+' | '-' ) )*'
  //
  // This is used to link mutually compatible architectures together.
  llvm::Triple T(LHS.first);
  if (!T.isAMDGPU())
    return false;

  // Separate the processor name from the colon-separated feature list. The
  // processors must match exactly; a plain substring test would wrongly accept
  // e.g. "gfx90" against "gfx906".
  const std::pair<StringRef, StringRef> LHSParts = LHS.second.split(':');
  const std::pair<StringRef, StringRef> RHSParts = RHS.second.split(':');
  if (LHSParts.first != RHSParts.first)
    return false;

  // Returns true if Feature is one of the colon-separated entries of List.
  auto HasFeature = [](StringRef List, StringRef Feature) {
    while (!List.empty()) {
      const std::pair<StringRef, StringRef> Split = List.split(':');
      if (Split.first == Feature)
        return true;
      List = Split.second;
    }
    return false;
  };

  // The targets are compatible if the features of LHS are a subset of the
  // features of RHS.
  StringRef Remaining = LHSParts.second;
  while (!Remaining.empty()) {
    const std::pair<StringRef, StringRef> Split = Remaining.split(':');
    Remaining = Split.second;
    if (Split.first.empty())
      continue;
    if (!HasFeature(RHSParts.second, Split.first))
      return false;
  }
  return true;
}

/// Extracts embedded device offloading code from a memory \p Buffer to a list
/// of \p Binaries.
Error extractOffloadBinaries(MemoryBufferRef Buffer,
Expand Down