Skip to content

Commit e58e1b4

Browse files
committed
[clang][AMDGPU] Enable module splitting by default
The default number of partitions is the number of hardware threads available on the machine, capped at 16, as going above 16 is unlikely to be useful in the common case. Adds a `--flto-partitions=` option to override the number of partitions easily (without having to use -Xoffload-linker). Setting it to 1 effectively disables module splitting. Fixes SWDEV-506214
1 parent 61fb954 commit e58e1b4

File tree

8 files changed

+98
-6
lines changed

8 files changed

+98
-6
lines changed

clang/include/clang/Driver/Options.td

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1393,6 +1393,8 @@ def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">,
13931393
HelpText<"Compile HIP source to relocatable">;
13941394
def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
13951395
HelpText<"Do not override toolchain to compile HIP source to relocatable">;
1396+
def flto_partitions_EQ : Joined<["--"], "flto-partitions=">, Group<hip_Group>,
1397+
HelpText<"Number of partitions to use for parallel full LTO codegen. Use 1 to disable partitioning.">;
13961398
}
13971399

13981400
// Clang specific/exclusive options for OpenACC.
@@ -3158,7 +3160,7 @@ def modules_reduced_bmi : Flag<["-"], "fmodules-reduced-bmi">,
31583160
HelpText<"Generate the reduced BMI">,
31593161
MarshallingInfoFlag<FrontendOpts<"GenReducedBMI">>;
31603162

3161-
def experimental_modules_reduced_bmi : Flag<["-"], "fexperimental-modules-reduced-bmi">,
3163+
def experimental_modules_reduced_bmi : Flag<["-"], "fexperimental-modules-reduced-bmi">,
31623164
Group<f_Group>, Visibility<[ClangOption, CC1Option]>, Alias<modules_reduced_bmi>;
31633165

31643166
def fmodules_embed_all_files : Joined<["-"], "fmodules-embed-all-files">,
@@ -7417,7 +7419,7 @@ def fuse_register_sized_bitfield_access: Flag<["-"], "fuse-register-sized-bitfie
74177419
def relaxed_aliasing : Flag<["-"], "relaxed-aliasing">,
74187420
HelpText<"Turn off Type Based Alias Analysis">,
74197421
MarshallingInfoFlag<CodeGenOpts<"RelaxedAliasing">>;
7420-
defm pointer_tbaa: BoolOption<"", "pointer-tbaa", CodeGenOpts<"PointerTBAA">,
7422+
defm pointer_tbaa: BoolOption<"", "pointer-tbaa", CodeGenOpts<"PointerTBAA">,
74217423
DefaultTrue,
74227424
PosFlag<SetTrue, [], [ClangOption], "Enable">,
74237425
NegFlag<SetFalse, [], [ClangOption], "Disable">,

clang/lib/Driver/ToolChains/AMDGPU.cpp

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "llvm/Support/LineIterator.h"
2222
#include "llvm/Support/Path.h"
2323
#include "llvm/Support/Process.h"
24+
#include "llvm/Support/Threading.h"
2425
#include "llvm/Support/VirtualFileSystem.h"
2526
#include "llvm/TargetParser/Host.h"
2627
#include <optional>
@@ -630,8 +631,11 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
630631
getToolChain().AddFilePathLibArgs(Args, CmdArgs);
631632
AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
632633
if (C.getDriver().isUsingLTO()) {
633-
addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],
634-
C.getDriver().getLTOMode() == LTOK_Thin);
634+
const bool ThinLTO = (C.getDriver().getLTOMode() == LTOK_Thin);
635+
addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], ThinLTO);
636+
637+
if (!ThinLTO)
638+
addFullLTOPartitionOption(C.getDriver(), Args, CmdArgs);
635639
} else if (Args.hasArg(options::OPT_mcpu_EQ)) {
636640
CmdArgs.push_back(Args.MakeArgString(
637641
"-plugin-opt=mcpu=" +
@@ -708,6 +712,34 @@ void amdgpu::getAMDGPUTargetFeatures(const Driver &D,
708712
options::OPT_m_amdgpu_Features_Group);
709713
}
710714

715+
static unsigned GetFullLTOPartitions(const Driver &D, const ArgList &Args) {
716+
const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ);
717+
// In the absence of an option, use the number of available threads with a cap
718+
// at 16 partitions. More than 16 partitions rarely benefits code splitting
719+
// and can lead to more empty/small modules each with their own overhead.
720+
if (!A)
721+
return std::max(16u, llvm::hardware_concurrency().compute_thread_count());
722+
int Value;
723+
if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) {
724+
D.Diag(diag::err_drv_invalid_int_value)
725+
<< A->getAsString(Args) << A->getValue();
726+
return 1;
727+
}
728+
729+
return Value;
730+
}
731+
732+
void amdgpu::addFullLTOPartitionOption(const Driver &D,
733+
const llvm::opt::ArgList &Args,
734+
llvm::opt::ArgStringList &CmdArgs) {
735+
// TODO: restrict to gpu-rdc only?
736+
737+
if (unsigned NumParts = GetFullLTOPartitions(D, Args); NumParts > 1) {
738+
CmdArgs.push_back(
739+
Args.MakeArgString("--lto-partitions=" + std::to_string(NumParts)));
740+
}
741+
}
742+
711743
/// AMDGPU Toolchain
712744
AMDGPUToolChain::AMDGPUToolChain(const Driver &D, const llvm::Triple &Triple,
713745
const ArgList &Args)

clang/lib/Driver/ToolChains/AMDGPU.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ void getAMDGPUTargetFeatures(const Driver &D, const llvm::Triple &Triple,
4141
const llvm::opt::ArgList &Args,
4242
std::vector<StringRef> &Features);
4343

44+
void addFullLTOPartitionOption(const Driver &D, const llvm::opt::ArgList &Args,
45+
llvm::opt::ArgStringList &CmdArgs);
4446
} // end namespace amdgpu
4547
} // end namespace tools
4648

clang/lib/Driver/ToolChains/HIPAMD.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
116116

117117
addLinkerCompressDebugSectionsOption(TC, Args, LldArgs);
118118

119+
amdgpu::addFullLTOPartitionOption(D, Args, LldArgs);
120+
119121
// Given that host and device linking happen in separate processes, the device
120122
// linker doesn't always have the visibility as to which device symbols are
121123
// needed by a program, especially for the device symbol dependencies that are

clang/test/Driver/amdgpu-toolchain.c

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,12 @@
1919
// AS_LINK_UR: ld.lld{{.*}} "--no-undefined"{{.*}} "--unresolved-symbols=ignore-all"
2020

2121
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
22-
// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefixes=LTO,MCPU %s
22+
// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=LTO %s
23+
// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
24+
// LTO: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions={{[0-9]+}}"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack"
25+
2326
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
2427
// RUN: -L. -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s
25-
// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
2628
// MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack"
2729

2830
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
@@ -36,3 +38,17 @@
3638
// RUN: %clang -target amdgcn-amd-amdhsa -march=gfx90a -stdlib -startfiles \
3739
// RUN: -nogpulib -nogpuinc -### %s 2>&1 | FileCheck -check-prefix=STARTUP %s
3840
// STARTUP: ld.lld{{.*}}"-lc" "-lm" "{{.*}}crt1.o"
41+
42+
// Check --flto-partitions
43+
44+
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
45+
// RUN: -L. -flto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s
46+
// LTO_PARTS: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions=42"
47+
48+
// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
49+
// RUN: -L. -flto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s
50+
// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a'
51+
52+
// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
53+
// RUN: -L. -flto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s
54+
// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0'
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// RUN: %clang -### --target=x86_64-linux-gnu \
2+
// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=42 \
3+
// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
4+
// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
5+
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
6+
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
7+
// RUN: 2>&1 | FileCheck %s --check-prefix=FIXED-PARTS
8+
9+
// FIXED-PARTS-NOT: "*.llvm-link"
10+
// FIXED-PARTS-NOT: ".*opt"
11+
// FIXED-PARTS-NOT: ".*llc"
12+
// FIXED-PARTS: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
13+
// FIXED-PARTS-SAME: "-plugin-opt=mcpu=gfx803"
14+
// FIXED-PARTS-SAME: "--lto-partitions=42"
15+
// FIXED-PARTS-SAME: "-o" "{{.*out}}" "{{.*bc}}"
16+
17+
// RUN: not %clang -### --target=x86_64-linux-gnu \
18+
// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=a \
19+
// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
20+
// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
21+
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
22+
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
23+
// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV0
24+
25+
// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a'
26+
27+
// RUN: not %clang -### --target=x86_64-linux-gnu \
28+
// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=0 \
29+
// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
30+
// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
31+
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
32+
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
33+
// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV1
34+
35+
// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0'

clang/test/Driver/hip-toolchain-rdc-static-lib.hip

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
// CHECK-NOT: ".*llc"
5050
// CHECK: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
5151
// CHECK-SAME: "-plugin-opt=mcpu=gfx803"
52+
// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
5253
// CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]]
5354

5455
// generate image for device side path on gfx900
@@ -77,6 +78,7 @@
7778
// CHECK-NOT: ".*llc"
7879
// CHECK: [[LLD]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
7980
// CHECK-SAME: "-plugin-opt=mcpu=gfx900"
81+
// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
8082
// CHECK-SAME: "--whole-archive"
8183
// CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[A_BC2]] [[B_BC2]]
8284
// CHECK-SAME: "--no-whole-archive"

clang/test/Driver/hip-toolchain-rdc.hip

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@
147147
// CHECK-NOT: ".*llc"
148148
// CHECK: {{".*lld.*"}} {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
149149
// CHECK-SAME: "-plugin-opt=mcpu=gfx900"
150+
// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
150151
// CHECK-SAME: "-o" "[[IMG_DEV2:.*.out]]" [[A_BC2]] [[B_BC2]]
151152

152153
// combine images generated into hip fat binary object

0 commit comments

Comments
 (0)