Skip to content

Commit dc6a0b0

Browse files
committed
[HIP] Align device binary
To facilitate faster loading of device binaries and sharing them among processes, the HIP runtime prefers them to be aligned at 4096 bytes. The HIP runtime can load unaligned device binaries; however, aligning them at 4096 bytes results in faster loading and less shared memory usage. This patch adds an option -bundle-align to clang-offload-bundler which allows bundles to be aligned at a specified alignment. The default is 1, which leaves the existing bundle format unchanged (NFC). This patch then aligns the embedded fat binary, and the device binaries inside the fat binary, at 4096 bytes. It has been verified that this change does not cause a significant overall file size increase for typical HIP applications (less than 1%). Differential Revision: https://reviews.llvm.org/D88734
1 parent 04fce15 commit dc6a0b0

File tree

7 files changed

+41
-4
lines changed

7 files changed

+41
-4
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -597,8 +597,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
597597
if (CudaGpuBinary) {
598598
// If fatbin is available from early finalization, create a string
599599
// literal containing the fat binary loaded from the given file.
600-
FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()),
601-
"", FatbinConstantName, 8);
600+
const unsigned HIPCodeObjectAlign = 4096;
601+
FatBinStr =
602+
makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
603+
FatbinConstantName, HIPCodeObjectAlign);
602604
} else {
603605
// If fatbin is not available, create an external symbol
604606
// __hip_fatbin in section .hip_fatbin. The external symbol is supposed

clang/lib/Driver/ToolChains/HIP.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "clang/Driver/Driver.h"
1717
#include "clang/Driver/DriverDiagnostic.h"
1818
#include "clang/Driver/Options.h"
19+
#include "llvm/Support/Alignment.h"
1920
#include "llvm/Support/FileSystem.h"
2021
#include "llvm/Support/Path.h"
2122
#include "llvm/Support/TargetParser.h"
@@ -33,6 +34,7 @@ using namespace llvm::opt;
3334
#endif
3435

3536
namespace {
37+
const unsigned HIPCodeObjectAlign = 4096;
3638

3739
static void addBCLib(const Driver &D, const ArgList &Args,
3840
ArgStringList &CmdArgs, ArgStringList LibraryPaths,
@@ -108,6 +110,8 @@ void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
108110
// for different GPU archs.
109111
ArgStringList BundlerArgs;
110112
BundlerArgs.push_back(Args.MakeArgString("-type=o"));
113+
BundlerArgs.push_back(
114+
Args.MakeArgString("-bundle-align=" + Twine(HIPCodeObjectAlign)));
111115

112116
// ToDo: Remove the dummy host binary entry which is required by
113117
// clang-offload-bundler.
@@ -175,7 +179,8 @@ void AMDGCN::Linker::constructGenerateObjFileFromHIPFatBinary(
175179
ObjStream << " .section .hip_fatbin,\"aMS\",@progbits,1\n";
176180
ObjStream << " .data\n";
177181
ObjStream << " .globl __hip_fatbin\n";
178-
ObjStream << " .p2align 3\n";
182+
ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign))
183+
<< "\n";
179184
ObjStream << "__hip_fatbin:\n";
180185
ObjStream << " .incbin \"" << BundleFile << "\"\n";
181186
ObjStream.flush();

clang/test/CodeGenCUDA/device-stub.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ void use_pointers() {
115115
// ALL: @4 = private unnamed_addr constant [21 x i8] c"ext_constant_var_def\00"
116116
// * constant unnamed string with GPU binary
117117
// CUDA: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
118-
// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
118+
// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",{{.*}}align 4096
119119
// HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
120120
// CUDANORDC-SAME: section ".nv_fatbin", align 8
121121
// CUDARDC-SAME: section "__nv_relfatbin", align 8

clang/test/Driver/clang-offload-bundler.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,16 @@
278278
// RUN: diff %t.empty %t.res.tgt1
279279
// RUN: diff %t.empty %t.res.tgt2
280280

281+
//
282+
// Check -bundle-align option
283+
//
284+
285+
// RUN: clang-offload-bundler -bundle-align=4096 -type=bc -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -inputs=%t.bc,%t.tgt1,%t.tgt2 -outputs=%t.bundle3.bc
286+
// RUN: clang-offload-bundler -type=bc -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -outputs=%t.res.bc,%t.res.tgt1,%t.res.tgt2 -inputs=%t.bundle3.bc -unbundle
287+
// RUN: diff %t.bc %t.res.bc
288+
// RUN: diff %t.tgt1 %t.res.tgt1
289+
// RUN: diff %t.tgt2 %t.res.tgt2
290+
281291
// Some code so that we can create a binary out of this file.
282292
int A = 0;
283293
void test_func(void) {

clang/test/Driver/hip-toolchain-no-rdc.hip

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
//
8282

8383
// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
84+
// CHECK-SAME: "-bundle-align=4096"
8485
// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
8586
// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_A_803]],[[IMG_DEV_A_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"
8687

@@ -143,6 +144,7 @@
143144
//
144145

145146
// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
147+
// CHECK-SAME: "-bundle-align=4096"
146148
// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
147149
// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_B_803]],[[IMG_DEV_B_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"
148150

clang/test/Driver/hip-toolchain-rdc.hip

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,14 @@
88
// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
99
// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \
1010
// RUN: -fuse-ld=lld -fgpu-rdc -nogpuinc \
11+
// RUN: -fhip-dump-offload-linker-script \
1112
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
1213
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
1314
// RUN: 2>&1 | FileCheck %s
1415

16+
// check code object alignment in dumped llvm-mc input
17+
// CHECK: .p2align 12
18+
1519
// emit objects for host side path
1620
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
1721
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
@@ -87,6 +91,7 @@
8791

8892
// combine images generated into hip fat binary object
8993
// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
94+
// CHECK-SAME: "-bundle-align=4096"
9095
// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
9196
// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]"
9297

clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,11 @@ static cl::opt<bool> PrintExternalCommands(
9494
"instead of actually executing them - for testing purposes.\n"),
9595
cl::init(false), cl::cat(ClangOffloadBundlerCategory));
9696

97+
static cl::opt<unsigned>
98+
BundleAlignment("bundle-align",
99+
cl::desc("Alignment of bundle for binary files"),
100+
cl::init(1), cl::cat(ClangOffloadBundlerCategory));
101+
97102
/// Magic string that marks the existence of offloading data.
98103
#define OFFLOAD_BUNDLER_MAGIC_STR "__CLANG_OFFLOAD_BUNDLE__"
99104

@@ -223,6 +228,9 @@ class BinaryFileHandler final : public FileHandler {
223228
StringMap<BundleInfo>::iterator CurBundleInfo;
224229
StringMap<BundleInfo>::iterator NextBundleInfo;
225230

231+
/// Current bundle target to be written.
232+
std::string CurWriteBundleTarget;
233+
226234
public:
227235
BinaryFileHandler() : FileHandler() {}
228236

@@ -337,10 +345,12 @@ class BinaryFileHandler final : public FileHandler {
337345
unsigned Idx = 0;
338346
for (auto &T : TargetNames) {
339347
MemoryBuffer &MB = *Inputs[Idx++];
348+
HeaderSize = alignTo(HeaderSize, BundleAlignment);
340349
// Bundle offset.
341350
Write8byteIntegerToBuffer(OS, HeaderSize);
342351
// Size of the bundle (adds to the next bundle's offset)
343352
Write8byteIntegerToBuffer(OS, MB.getBufferSize());
353+
BundlesInfo[T] = BundleInfo(MB.getBufferSize(), HeaderSize);
344354
HeaderSize += MB.getBufferSize();
345355
// Size of the triple
346356
Write8byteIntegerToBuffer(OS, T.size());
@@ -351,6 +361,7 @@ class BinaryFileHandler final : public FileHandler {
351361
}
352362

353363
Error WriteBundleStart(raw_fd_ostream &OS, StringRef TargetTriple) final {
364+
CurWriteBundleTarget = TargetTriple.str();
354365
return Error::success();
355366
}
356367

@@ -359,6 +370,8 @@ class BinaryFileHandler final : public FileHandler {
359370
}
360371

361372
Error WriteBundle(raw_fd_ostream &OS, MemoryBuffer &Input) final {
373+
auto BI = BundlesInfo[CurWriteBundleTarget];
374+
OS.seek(BI.Offset);
362375
OS.write(Input.getBufferStart(), Input.getBufferSize());
363376
return Error::success();
364377
}

0 commit comments

Comments
 (0)