Skip to content

Commit 12c90bd

Browse files
authored
[LinkerWrapper] Handle AMDGPU Target-IDs correctly when linking (#78359)
Summary: The linker wrapper's job is to sort various embedded inputs into a list of files that participate in a single link job. So far, this has been completely 1-to-1, that is, each input file participates in exactly one link job. However, support for AMD's target-id requires that one input file may participate in multiple link jobs. For example, if given a `gfx90a` static library and a `gfx90a:xnack+` object file input, we should link the gfx90a` target into the `gfx90a:xnack+` job. These are considered separate CPUs that can be mutually linked more or less. This patch adds the necessary logic to make this happen. It primarily reworks the logic to copy relevant input files into a separate list. So, it moves construction of the final list of link jobs into the extraction phase. We also need to copy the files in the case that it is needed more than once, as the entire workflow expects ownership of said file.
1 parent 110e171 commit 12c90bd

File tree

6 files changed

+126
-38
lines changed

6 files changed

+126
-38
lines changed

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8700,7 +8700,7 @@ void OffloadPackager::ConstructJob(Compilation &C, const JobAction &JA,
87008700
SmallVector<std::string> Parts{
87018701
"file=" + File.str(),
87028702
"triple=" + TC->getTripleString(),
8703-
"arch=" + getProcessorFromTargetID(TC->getTriple(), Arch).str(),
8703+
"arch=" + Arch.str(),
87048704
"kind=" + Kind.str(),
87058705
};
87068706

clang/test/Driver/amdgpu-openmp-toolchain.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565

6666
// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a:sramecc-:xnack+ \
6767
// RUN: -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID
68-
// CHECK-TARGET-ID: clang-offload-packager{{.*}}arch=gfx90a,kind=openmp,feature=-sramecc,feature=+xnack
68+
// CHECK-TARGET-ID: clang-offload-packager{{.*}}arch=gfx90a:sramecc-:xnack+,kind=openmp,feature=-sramecc,feature=+xnack
6969

7070
// RUN: not %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a,gfx90a:xnack+ \
7171
// RUN: -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID-ERROR

clang/test/Driver/linker-wrapper.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
// REQUIRES: nvptx-registered-target
33
// REQUIRES: amdgpu-registered-target
44

5+
// UNSUPPORTED: system-linux
6+
7+
// An externally visible variable so static libraries extract.
8+
__attribute__((visibility("protected"), used)) int x;
9+
510
// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.elf.o
611
// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -o %t.nvptx.bc
712
// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amdgpu.bc
@@ -150,3 +155,19 @@
150155
// RUN: --linker-path=/usr/bin/lld-link -- %t.o -libpath:./ -out:a.exe 2>&1 | FileCheck %s --check-prefix=COFF
151156

152157
// COFF: "/usr/bin/lld-link" {{.*}}.o -libpath:./ -out:a.exe {{.*}}openmp.image.wrapper{{.*}}
158+
159+
// RUN: clang-offload-packager -o %t-lib.out \
160+
// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a
161+
// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
162+
// RUN: llvm-ar rcs %t.a %t.o
163+
// RUN: clang-offload-packager -o %t-on.out \
164+
// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a:xnack+
165+
// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t-on.o -fembed-offload-object=%t-on.out
166+
// RUN: clang-offload-packager -o %t-off.out \
167+
// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a:xnack-
168+
// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t-off.o -fembed-offload-object=%t-off.out
169+
// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
170+
// RUN: --linker-path=/usr/bin/ld -- %t-on.o %t-off.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=AMD-TARGET-ID
171+
172+
// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
173+
// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o

clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Lines changed: 46 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,8 @@ std::unique_ptr<lto::LTO> createLTO(
538538
const ArgList &Args, const std::vector<std::string> &Features,
539539
ModuleHook Hook = [](size_t, const Module &) { return true; }) {
540540
const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
541-
StringRef Arch = Args.getLastArgValue(OPT_arch_EQ);
541+
// We need to remove AMD's target-id from the processor if present.
542+
StringRef Arch = Args.getLastArgValue(OPT_arch_EQ).split(":").first;
542543
lto::Config Conf;
543544
lto::ThinBackend Backend;
544545
// TODO: Handle index-only thin-LTO
@@ -1066,24 +1067,14 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
10661067

10671068
/// Transforms all the extracted offloading input files into an image that can
10681069
/// be registered by the runtime.
1069-
Expected<SmallVector<StringRef>>
1070-
linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
1071-
const InputArgList &Args, char **Argv, int Argc) {
1070+
Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
1071+
SmallVectorImpl<SmallVector<OffloadFile>> &LinkerInputFiles,
1072+
const InputArgList &Args, char **Argv, int Argc) {
10721073
llvm::TimeTraceScope TimeScope("Handle all device input");
10731074

1074-
DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> InputMap;
1075-
for (auto &File : LinkerInputFiles)
1076-
InputMap[File].emplace_back(std::move(File));
1077-
LinkerInputFiles.clear();
1078-
1079-
SmallVector<SmallVector<OffloadFile>> InputsForTarget;
1080-
for (auto &[ID, Input] : InputMap)
1081-
InputsForTarget.emplace_back(std::move(Input));
1082-
InputMap.clear();
1083-
10841075
std::mutex ImageMtx;
10851076
DenseMap<OffloadKind, SmallVector<OffloadingImage>> Images;
1086-
auto Err = parallelForEachError(InputsForTarget, [&](auto &Input) -> Error {
1077+
auto Err = parallelForEachError(LinkerInputFiles, [&](auto &Input) -> Error {
10871078
llvm::TimeTraceScope TimeScope("Link device input");
10881079

10891080
// Each thread needs its own copy of the base arguments to maintain
@@ -1359,8 +1350,9 @@ Expected<bool> getSymbols(StringRef Image, OffloadKind Kind, bool IsArchive,
13591350
/// Search the input files and libraries for embedded device offloading code
13601351
/// and add it to the list of files to be linked. Files coming from static
13611352
/// libraries are only added to the input if they are used by an existing
1362-
/// input file.
1363-
Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
1353+
/// input file. Returns a list of input files intended for a single linking job.
1354+
Expected<SmallVector<SmallVector<OffloadFile>>>
1355+
getDeviceInput(const ArgList &Args) {
13641356
llvm::TimeTraceScope TimeScope("ExtractDeviceCode");
13651357

13661358
StringRef Root = Args.getLastArgValue(OPT_sysroot_EQ);
@@ -1372,7 +1364,7 @@ Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
13721364
StringSaver Saver(Alloc);
13731365

13741366
// Try to extract device code from the linker input files.
1375-
SmallVector<OffloadFile> InputFiles;
1367+
DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> InputFiles;
13761368
DenseMap<OffloadFile::TargetID, DenseMap<StringRef, Symbol>> Syms;
13771369
bool WholeArchive = Args.hasArg(OPT_wholearchive_flag) ? true : false;
13781370
for (const opt::Arg *Arg : Args.filtered(
@@ -1416,25 +1408,39 @@ Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
14161408
while (Extracted) {
14171409
Extracted = false;
14181410
for (OffloadFile &Binary : Binaries) {
1411+
// If the binary was previously extracted it will be set to null.
14191412
if (!Binary.getBinary())
14201413
continue;
14211414

1422-
// If we don't have an object file for this architecture do not
1423-
// extract.
1424-
if (IsArchive && !WholeArchive && !Syms.count(Binary))
1425-
continue;
1426-
1427-
Expected<bool> ExtractOrErr =
1428-
getSymbols(Binary.getBinary()->getImage(),
1429-
Binary.getBinary()->getOffloadKind(), IsArchive, Saver,
1430-
Syms[Binary]);
1431-
if (!ExtractOrErr)
1432-
return ExtractOrErr.takeError();
1433-
1434-
Extracted = !WholeArchive && *ExtractOrErr;
1435-
1436-
if (!IsArchive || WholeArchive || Extracted)
1437-
InputFiles.emplace_back(std::move(Binary));
1415+
SmallVector<OffloadFile::TargetID> CompatibleTargets = {Binary};
1416+
for (const auto &[ID, Input] : InputFiles)
1417+
if (object::areTargetsCompatible(Binary, ID))
1418+
CompatibleTargets.emplace_back(ID);
1419+
1420+
for (const auto &[Index, ID] : llvm::enumerate(CompatibleTargets)) {
1421+
// Only extract an if we have an an object matching this target.
1422+
if (IsArchive && !WholeArchive && !InputFiles.count(ID))
1423+
continue;
1424+
1425+
Expected<bool> ExtractOrErr = getSymbols(
1426+
Binary.getBinary()->getImage(),
1427+
Binary.getBinary()->getOffloadKind(), IsArchive, Saver, Syms[ID]);
1428+
if (!ExtractOrErr)
1429+
return ExtractOrErr.takeError();
1430+
1431+
Extracted = !WholeArchive && *ExtractOrErr;
1432+
1433+
// Skip including the file if it is an archive that does not resolve
1434+
// any symbols.
1435+
if (IsArchive && !WholeArchive && !Extracted)
1436+
continue;
1437+
1438+
// If another target needs this binary it must be copied instead.
1439+
if (Index == CompatibleTargets.size() - 1)
1440+
InputFiles[ID].emplace_back(std::move(Binary));
1441+
else
1442+
InputFiles[ID].emplace_back(std::move(Binary.copy()));
1443+
}
14381444

14391445
// If we extracted any files we need to check all the symbols again.
14401446
if (Extracted)
@@ -1447,10 +1453,14 @@ Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
14471453
auto FileOrErr = getInputBitcodeLibrary(Library);
14481454
if (!FileOrErr)
14491455
return FileOrErr.takeError();
1450-
InputFiles.push_back(std::move(*FileOrErr));
1456+
InputFiles[*FileOrErr].push_back(std::move(*FileOrErr));
14511457
}
14521458

1453-
return std::move(InputFiles);
1459+
SmallVector<SmallVector<OffloadFile>> InputsForTarget;
1460+
for (auto &[ID, Input] : InputFiles)
1461+
InputsForTarget.emplace_back(std::move(Input));
1462+
1463+
return std::move(InputsForTarget);
14541464
}
14551465

14561466
} // namespace

llvm/include/llvm/Object/OffloadBinary.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,19 @@ class OffloadFile : public OwningBinary<OffloadBinary> {
162162
std::unique_ptr<MemoryBuffer> Buffer)
163163
: OwningBinary<OffloadBinary>(std::move(Binary), std::move(Buffer)) {}
164164

165+
/// Make a deep copy of this offloading file.
166+
OffloadFile copy() const {
167+
std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBufferCopy(
168+
getBinary()->getMemoryBufferRef().getBuffer());
169+
170+
// This parsing should never fail because it has already been parsed.
171+
auto NewBinaryOrErr = OffloadBinary::create(*Buffer);
172+
assert(NewBinaryOrErr && "Failed to parse a copy of the binary?");
173+
if (!NewBinaryOrErr)
174+
llvm::consumeError(NewBinaryOrErr.takeError());
175+
return OffloadFile(std::move(*NewBinaryOrErr), std::move(Buffer));
176+
}
177+
165178
/// We use the Triple and Architecture pair to group linker inputs together.
166179
/// This conversion function lets us use these inputs in a hash-map.
167180
operator TargetID() const {
@@ -186,6 +199,18 @@ OffloadKind getOffloadKind(StringRef Name);
186199
/// Convert an offload kind to its string representation.
187200
StringRef getOffloadKindName(OffloadKind Name);
188201

202+
/// If the target is AMD we check the target IDs for mutual compatibility. A
203+
/// target id is a string conforming to the folowing BNF syntax:
204+
///
205+
/// target-id ::= '<arch> ( : <feature> ( '+' | '-' ) )*'
206+
///
207+
/// The features 'xnack' and 'sramecc' are currently supported. These can be in
208+
/// the state of on, off, and any when unspecified. A target marked as any can
209+
/// bind with either on or off. This is used to link mutually compatible
210+
/// architectures together. Returns false in the case of an exact match.
211+
bool areTargetsCompatible(const OffloadFile::TargetID &LHS,
212+
const OffloadFile::TargetID &RHS);
213+
189214
} // namespace object
190215

191216
} // namespace llvm

llvm/lib/Object/OffloadBinary.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,3 +343,35 @@ StringRef object::getImageKindName(ImageKind Kind) {
343343
return "";
344344
}
345345
}
346+
347+
bool object::areTargetsCompatible(const OffloadFile::TargetID &LHS,
348+
const OffloadFile::TargetID &RHS) {
349+
// Exact matches are not considered compatible because they are the same
350+
// target. We are interested in different targets that are compatible.
351+
if (LHS == RHS)
352+
return false;
353+
354+
// The triples must match at all times.
355+
if (LHS.first != RHS.first)
356+
return false;
357+
358+
// Only The AMDGPU target requires additional checks.
359+
llvm::Triple T(LHS.first);
360+
if (!T.isAMDGPU())
361+
return false;
362+
363+
// The base processor must always match.
364+
if (LHS.second.split(":").first != RHS.second.split(":").first)
365+
return false;
366+
367+
// Check combintions of on / off features that must match.
368+
if (LHS.second.contains("xnack+") && RHS.second.contains("xnack-"))
369+
return false;
370+
if (LHS.second.contains("xnack-") && RHS.second.contains("xnack+"))
371+
return false;
372+
if (LHS.second.contains("sramecc-") && RHS.second.contains("sramecc+"))
373+
return false;
374+
if (LHS.second.contains("sramecc+") && RHS.second.contains("sramecc-"))
375+
return false;
376+
return true;
377+
}

0 commit comments

Comments
 (0)