llvm · jhuber6 · Nov 15, 2023 · arsenm · Nov 17, 2023
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -8692,12 +8692,10 @@ void OffloadPackager::ConstructJob(Compilation &C, const JobAction &JA,
       }
     }
 
-    // TODO: We need to pass in the full target-id and handle it properly in the
-    // linker wrapper.
     SmallVector<std::string> Parts{
         "file=" + File.str(),
         "triple=" + TC->getTripleString(),
-        "arch=" + getProcessorFromTargetID(TC->getTriple(), Arch).str(),
+        "arch=" + Arch.str(),
         "kind=" + Kind.str(),
     };
 

diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c
@@ -65,7 +65,7 @@
 
 // RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a:sramecc-:xnack+ \
 // RUN:   -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID
-// CHECK-TARGET-ID: clang-offload-packager{{.*}}arch=gfx90a,kind=openmp,feature=-sramecc,feature=+xnack
+// CHECK-TARGET-ID: clang-offload-packager{{.*}}arch=gfx90a:sramecc-:xnack+,kind=openmp,feature=-sramecc,feature=+xnack
 
 // RUN: not %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a,gfx90a:xnack+ \
 // RUN:   -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID-ERROR

diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
@@ -2,6 +2,9 @@
 // REQUIRES: nvptx-registered-target
 // REQUIRES: amdgpu-registered-target
 
+// An externally visible variable so static libraries extract.
+__attribute__((visibility("protected"), used)) int x;
+
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.elf.o
 // RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -o %t.nvptx.bc
 // RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amdgpu.bc
@@ -36,6 +39,18 @@
 
 // AMDGPU-LINK: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
 
+// RUN: clang-offload-packager -o %t-lib.out \
+// RUN:   --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
+// RUN: llvm-ar rcs %t.a %t.o
+// RUN: clang-offload-packager -o %t.out \
+// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a:xnack+
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
+// RUN:   --linker-path=/usr/bin/ld -- %t.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LINK-ID
+
+// AMDGPU-LINK-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
+
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 \
 // RUN:   --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030

diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -1009,9 +1009,13 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
   for (Arg *A : Args)
     DAL.append(A);
 
-  // Set the subarchitecture and target triple for this compilation.
+  // Set the subarchitecture and target triple for this compilation. The input
+  // may be an AMDGPU target-id so we split off anything before the colon.
   const OptTable &Tbl = getOptTable();
   DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_arch_EQ),
+                   Args.MakeArgString(
+                       Input.front().getBinary()->getArch().split(':').first));
+  DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_full_arch_EQ),
                    Args.MakeArgString(Input.front().getBinary()->getArch()));
   DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_triple_EQ),
                    Args.MakeArgString(Input.front().getBinary()->getTriple()));
@@ -1041,23 +1045,13 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
 /// Transforms all the extracted offloading input files into an image that can
 /// be registered by the runtime.
 Expected<SmallVector<StringRef>>
-linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
+linkAndWrapDeviceFiles(SmallVector<SmallVector<OffloadFile>> &LinkerInputFiles,
                        const InputArgList &Args, char **Argv, int Argc) {
   llvm::TimeTraceScope TimeScope("Handle all device input");
 
-  DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> InputMap;
-  for (auto &File : LinkerInputFiles)
-    InputMap[File].emplace_back(std::move(File));
-  LinkerInputFiles.clear();
-
-  SmallVector<SmallVector<OffloadFile>> InputsForTarget;
-  for (auto &[ID, Input] : InputMap)
-    InputsForTarget.emplace_back(std::move(Input));
-  InputMap.clear();
-
   std::mutex ImageMtx;
   DenseMap<OffloadKind, SmallVector<OffloadingImage>> Images;
-  auto Err = parallelForEachError(InputsForTarget, [&](auto &Input) -> Error {
+  auto Err = parallelForEachError(LinkerInputFiles, [&](auto &Input) -> Error {
     llvm::TimeTraceScope TimeScope("Link device input");
 
     // Each thread needs its own copy of the base arguments to maintain
@@ -1115,7 +1109,7 @@ linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
       TheImage.StringData["triple"] =
           Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_triple_EQ));
       TheImage.StringData["arch"] =
-          Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_arch_EQ));
+          Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_full_arch_EQ));
       TheImage.Image = std::move(*FileOrErr);
 
       Images[Kind].emplace_back(std::move(TheImage));
@@ -1334,7 +1328,8 @@ Expected<bool> getSymbols(StringRef Image, OffloadKind Kind, bool IsArchive,
 /// and add it to the list of files to be linked. Files coming from static
 /// libraries are only added to the input if they are used by an existing
 /// input file.
-Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
+Expected<SmallVector<SmallVector<OffloadFile>>>
+getDeviceInput(const ArgList &Args) {
   llvm::TimeTraceScope TimeScope("ExtractDeviceCode");
 
   StringRef Root = Args.getLastArgValue(OPT_sysroot_EQ);
@@ -1346,7 +1341,7 @@ Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
   StringSaver Saver(Alloc);
 
   // Try to extract device code from the linker input files.
-  SmallVector<OffloadFile> InputFiles;
+  DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> InputMap;
   DenseMap<OffloadFile::TargetID, DenseMap<StringRef, Symbol>> Syms;
   bool WholeArchive = false;
   for (const opt::Arg *Arg : Args.filtered(
@@ -1393,36 +1388,60 @@ Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
         if (!Binary.getBinary())
           continue;
 
-        // If we don't have an object file for this architecture do not
-        // extract.
-        if (IsArchive && !WholeArchive && !Syms.count(Binary))
-          continue;
-
-        Expected<bool> ExtractOrErr =
-            getSymbols(Binary.getBinary()->getImage(),
-                       Binary.getBinary()->getOffloadKind(), IsArchive, Saver,
-                       Syms[Binary]);
-        if (!ExtractOrErr)
-          return ExtractOrErr.takeError();
-
-        Extracted = !WholeArchive && *ExtractOrErr;
+        // Initialize the map with an empty set of inputs.
+        OffloadFile::TargetID BinaryID =
+            OffloadFile::TargetID(Saver.save(Binary.getBinary()->getTriple()),
+                                  Saver.save(Binary.getBinary()->getArch()));
+        if (!InputMap.count(BinaryID))
+          InputMap[BinaryID] = SmallVector<OffloadFile>();
+
+        // We need to compare this binary input with every input architecture
+        // and copy it in if it's compatible. This allows a single binary to
+        // participate in multiple link jobs.
+        DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> NewInputMap;
+        for (const auto &[ID, Input] : InputMap) {
+          // If we don't have an object file for this architecture do not
+          // extract.
+          if (IsArchive && !WholeArchive && Input.empty())
+            continue;
+
+          // We only add the input if the binary is compatible with the slot.
+          if (!areTargetsCompatible(Binary, ID))
+            continue;
+
+          Expected<bool> ExtractOrErr = getSymbols(
+              Binary.getBinary()->getImage(),
+              Binary.getBinary()->getOffloadKind(), IsArchive, Saver, Syms[ID]);
+          if (!ExtractOrErr)
+            return ExtractOrErr.takeError();
+
+          Extracted = !WholeArchive && *ExtractOrErr;
+
+          if (!IsArchive || WholeArchive || Extracted) {
+            auto NewBinaryOrErr = Binary.copy();
+            if (!NewBinaryOrErr)
+              return NewBinaryOrErr.takeError();
+            NewInputMap[ID].emplace_back(std::move(*NewBinaryOrErr));
+          }
+        }
 
-        if (!IsArchive || WholeArchive || Extracted)
-          InputFiles.emplace_back(std::move(Binary));
+        for (auto &[NewID, NewInput] : NewInputMap)
+          InputMap[NewID].append(std::make_move_iterator(NewInput.begin()),
+                                 std::make_move_iterator(NewInput.end()));
 
+        Binary.takeBinary();
         // If we extracted any files we need to check all the symbols again.
         if (Extracted)
           break;
       }
     }
   }
 
-  for (StringRef Library : Args.getAllArgValues(OPT_bitcode_library_EQ)) {
-    auto FileOrErr = getInputBitcodeLibrary(Library);
-    if (!FileOrErr)
-      return FileOrErr.takeError();
-    InputFiles.push_back(std::move(*FileOrErr));
-  }
+  SmallVector<SmallVector<OffloadFile>> InputFiles;
+  for (auto &[ID, Input] : InputMap)
+    if (!Input.empty())
+      InputFiles.emplace_back(std::move(Input));
+  InputMap.clear();
 
   return std::move(InputFiles);
 }

diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
@@ -74,6 +74,9 @@ def wrapper_jobs : Joined<["--"], "wrapper-jobs=">,
 def arch_EQ : Joined<["--"], "arch=">,
   Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"<arch>">,
   HelpText<"The device subarchitecture">;
+def full_arch_EQ : Joined<["--"], "full-arch=">,
+  Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"<arch>">,
+  HelpText<"The fully qualifier device subarchitecture for AMD's target ID">;
 def triple_EQ : Joined<["--"], "triple=">,
   Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"<triple>">,
   HelpText<"The device target triple">;

diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h
@@ -17,6 +17,7 @@
 #ifndef LLVM_OBJECT_OFFLOADBINARY_H
 #define LLVM_OBJECT_OFFLOADBINARY_H
 
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
@@ -156,19 +157,51 @@ class OffloadBinary : public Binary {
 /// owns its memory.
 class OffloadFile : public OwningBinary<OffloadBinary> {
 public:
+  /// An ordered pair of the target triple and the architecture.
   using TargetID = std::pair<StringRef, StringRef>;
 
   OffloadFile(std::unique_ptr<OffloadBinary> Binary,
               std::unique_ptr<MemoryBuffer> Buffer)
       : OwningBinary<OffloadBinary>(std::move(Binary), std::move(Buffer)) {}
 
+  Expected<OffloadFile> copy() const {
+    std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBufferCopy(
+        getBinary()->getMemoryBufferRef().getBuffer());
+    auto NewBinaryOrErr = OffloadBinary::create(*Buffer);
+    if (!NewBinaryOrErr)
+      return NewBinaryOrErr.takeError();
+    return OffloadFile(std::move(*NewBinaryOrErr), std::move(Buffer));
+  }
+
   /// We use the Triple and Architecture pair to group linker inputs together.
   /// This conversion function lets us use these inputs in a hash-map.
   operator TargetID() const {
     return std::make_pair(getBinary()->getTriple(), getBinary()->getArch());
   }
 };
 
+/// Queries if the target \p LHS is compatible with \p RHS for linking purposes.
+inline bool areTargetsCompatible(const OffloadFile::TargetID LHS,
+                                 const OffloadFile::TargetID RHS) {
+  if (LHS == RHS)
+    return true;
+
+  // If the target is AMD we check the target IDs for compatibility. A target id
+  // is a string conforming to the folowing BNF syntax:
+  //
+  //  target-id ::= '<arch> ( : <feature> ( '+' | '-' ) )*'
+  //
+  // This is used to link mutually compatible architectures together.
+  llvm::Triple T(LHS.first);
+  if (!T.isAMDGPU())
+    return false;
+
+  // The targets are compatible if the architecture is a subset of the other.
+  if (RHS.second.contains(LHS.second))
+    return true;
+  return false;
+}
+
 /// Extracts embedded device offloading code from a memory \p Buffer to a list
 /// of \p Binaries.
 Error extractOffloadBinaries(MemoryBufferRef Buffer,