Skip to content

Commit ef8556d

Browse files
authored
(cherry-pick) [clang][AMDGPU] Enable module splitting by default llvm#128509 (llvm#1236)
The default number of partitions is 8. Adds a --flto-partitions=<N> option to override the number of partitions easily (without having to use -Xoffload-linker). Setting it to 1 effectively disables module splitting. Fixes SWDEV-506214
1 parent c32e705 commit ef8556d

File tree

8 files changed

+106
-16
lines changed

8 files changed

+106
-16
lines changed

clang/include/clang/Driver/Options.td

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1458,6 +1458,8 @@ def fhip_emit_relocatable : Flag<["-"], "fhip-emit-relocatable">,
14581458
HelpText<"Compile HIP source to relocatable">;
14591459
def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
14601460
HelpText<"Do not override toolchain to compile HIP source to relocatable">;
1461+
def flto_partitions_EQ : Joined<["--"], "flto-partitions=">, Group<hip_Group>,
1462+
HelpText<"Number of partitions to use for parallel full LTO codegen. Use 1 to disable partitioning.">;
14611463
}
14621464

14631465
// Clang specific/exclusive options for OpenACC.
@@ -3581,43 +3583,43 @@ def fopenmp_target_xteam_reduction_blocksize_EQ : Joined<["-"], "fopenmp-target-
35813583
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>;
35823584
def fopenmp_target_fast : Flag<["-"], "fopenmp-target-fast">, Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>;
35833585
def fno_openmp_target_fast : Flag<["-"], "fno-openmp-target-fast">, Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>;
3584-
def fopenmp_target_ignore_env_vars : Flag<["-"], "fopenmp-target-ignore-env-vars">, Group<f_Group>,
3586+
def fopenmp_target_ignore_env_vars : Flag<["-"], "fopenmp-target-ignore-env-vars">, Group<f_Group>,
35853587
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
35863588
HelpText<"Assert that device related environment variables can be ignored while generating code">,
35873589
MarshallingInfoFlag<LangOpts<"OpenMPTargetIgnoreEnvVars">>;
3588-
def fno_openmp_target_ignore_env_vars : Flag<["-"], "fno-openmp-target-ignore-env-vars">, Group<f_Group>,
3590+
def fno_openmp_target_ignore_env_vars : Flag<["-"], "fno-openmp-target-ignore-env-vars">, Group<f_Group>,
35893591
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
35903592
HelpText<"Assert that device related environment variables cannot be ignored while generating code">,
35913593
MarshallingInfoFlag<LangOpts<"OpenMPTargetIgnoreEnvVars">>;
3592-
def fopenmp_target_big_jump_loop : Flag<["-"], "fopenmp-target-big-jump-loop">, Group<f_Group>,
3594+
def fopenmp_target_big_jump_loop : Flag<["-"], "fopenmp-target-big-jump-loop">, Group<f_Group>,
35933595
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
35943596
HelpText<"Use the big-jump-loop code generation technique if possible">,
35953597
MarshallingInfoFlag<LangOpts<"OpenMPTargetBigJumpLoop">>;
3596-
def fno_openmp_target_big_jump_loop : Flag<["-"], "fno-openmp-target-big-jump-loop">, Group<f_Group>,
3598+
def fno_openmp_target_big_jump_loop : Flag<["-"], "fno-openmp-target-big-jump-loop">, Group<f_Group>,
35973599
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
35983600
HelpText<"Do not use the big-jump-loop code generation technique">,
35993601
MarshallingInfoFlag<LangOpts<"OpenMPTargetBigJumpLoop">>;
3600-
def fopenmp_target_no_loop : Flag<["-"], "fopenmp-target-no-loop">, Group<f_Group>,
3602+
def fopenmp_target_no_loop : Flag<["-"], "fopenmp-target-no-loop">, Group<f_Group>,
36013603
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
36023604
HelpText<"Use the no-loop code generation technique if possible">,
36033605
MarshallingInfoFlag<LangOpts<"OpenMPTargetNoLoop">>;
3604-
def fno_openmp_target_no_loop : Flag<["-"], "fno-openmp-target-no-loop">, Group<f_Group>,
3606+
def fno_openmp_target_no_loop : Flag<["-"], "fno-openmp-target-no-loop">, Group<f_Group>,
36053607
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
36063608
HelpText<"Do not use the no-loop code generation technique">,
36073609
MarshallingInfoFlag<LangOpts<"OpenMPTargetNoLoop">>;
3608-
def fopenmp_target_xteam_reduction : Flag<["-"], "fopenmp-target-xteam-reduction">, Group<f_Group>,
3610+
def fopenmp_target_xteam_reduction : Flag<["-"], "fopenmp-target-xteam-reduction">, Group<f_Group>,
36093611
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
36103612
HelpText<"Use the cross-team code generation technique if possible">,
36113613
MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamReduction">>;
3612-
def fno_openmp_target_xteam_reduction : Flag<["-"], "fno-openmp-target-xteam-reduction">, Group<f_Group>,
3614+
def fno_openmp_target_xteam_reduction : Flag<["-"], "fno-openmp-target-xteam-reduction">, Group<f_Group>,
36133615
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
36143616
HelpText<"Do not use the cross-team reduction code generation technique">,
36153617
MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamReduction">>;
3616-
def fopenmp_target_fast_reduction : Flag<["-"], "fopenmp-target-fast-reduction">, Group<f_Group>,
3618+
def fopenmp_target_fast_reduction : Flag<["-"], "fopenmp-target-fast-reduction">, Group<f_Group>,
36173619
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
36183620
HelpText<"Use the fast reduction code generation technique if possible">,
36193621
MarshallingInfoFlag<LangOpts<"OpenMPTargetFastReduction">>;
3620-
def fno_openmp_target_fast_reduction : Flag<["-"], "fno-openmp-target-fast-reduction">, Group<f_Group>,
3622+
def fno_openmp_target_fast_reduction : Flag<["-"], "fno-openmp-target-fast-reduction">, Group<f_Group>,
36213623
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
36223624
HelpText<"Do not use the fast reduction code generation technique">,
36233625
MarshallingInfoFlag<LangOpts<"OpenMPTargetFastReduction">>;

clang/lib/Driver/ToolChains/AMDGPU.cpp

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -628,10 +628,13 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
628628
Args.AddAllArgs(CmdArgs, options::OPT_L);
629629
getToolChain().AddFilePathLibArgs(Args, CmdArgs);
630630
AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
631-
if (C.getDriver().isUsingLTO())
632-
addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],
633-
C.getDriver().getLTOMode() == LTOK_Thin);
634-
else if (Args.hasArg(options::OPT_mcpu_EQ))
631+
if (C.getDriver().isUsingLTO()) {
632+
const bool ThinLTO = (C.getDriver().getLTOMode() == LTOK_Thin);
633+
addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], ThinLTO);
634+
635+
if (!ThinLTO)
636+
addFullLTOPartitionOption(C.getDriver(), Args, CmdArgs);
637+
} else if (Args.hasArg(options::OPT_mcpu_EQ))
635638
CmdArgs.push_back(Args.MakeArgString(
636639
"-plugin-opt=mcpu=" + Args.getLastArgValue(options::OPT_mcpu_EQ)));
637640
CmdArgs.push_back("-o");
@@ -641,6 +644,33 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
641644
CmdArgs, Inputs, Output));
642645
}
643646

647+
static unsigned getFullLTOPartitions(const Driver &D, const ArgList &Args) {
648+
const Arg *A = Args.getLastArg(options::OPT_flto_partitions_EQ);
649+
// In the absence of an option, use 8 as the default.
650+
if (!A)
651+
return 8;
652+
int Value = 0;
653+
if (StringRef(A->getValue()).getAsInteger(10, Value) || (Value < 1)) {
654+
D.Diag(diag::err_drv_invalid_int_value)
655+
<< A->getAsString(Args) << A->getValue();
656+
return 1;
657+
}
658+
659+
return Value;
660+
}
661+
662+
void amdgpu::addFullLTOPartitionOption(const Driver &D,
663+
const llvm::opt::ArgList &Args,
664+
llvm::opt::ArgStringList &CmdArgs) {
665+
// TODO: Should this be restricted to fgpu-rdc only ? Currently we'll
666+
// also do it for non gpu-rdc LTO
667+
668+
if (unsigned NumParts = getFullLTOPartitions(D, Args); NumParts > 1) {
669+
CmdArgs.push_back(
670+
Args.MakeArgString("--lto-partitions=" + Twine(NumParts)));
671+
}
672+
}
673+
644674
void amdgpu::getAMDGPUTargetFeatures(const Driver &D,
645675
const llvm::Triple &Triple,
646676
const llvm::opt::ArgList &Args,

clang/lib/Driver/ToolChains/AMDGPU.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@ const char *getLldCommandArgs(
8282
const std::optional<std::string> OutputFilePrefix = std::nullopt);
8383
} // end namespace dlr
8484

85+
void addFullLTOPartitionOption(const Driver &D, const llvm::opt::ArgList &Args,
86+
llvm::opt::ArgStringList &CmdArgs);
87+
8588
} // end namespace amdgpu
8689
} // end namespace tools
8790

clang/lib/Driver/ToolChains/HIPAMD.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
160160

161161
addLinkerCompressDebugSectionsOption(TC, Args, LldArgs);
162162

163+
amdgpu::addFullLTOPartitionOption(D, Args, LldArgs);
164+
163165
// Given that host and device linking happen in separate processes, the device
164166
// linker doesn't always have the visibility as to which device symbols are
165167
// needed by a program, especially for the device symbol dependencies that are

clang/test/Driver/amdgpu-toolchain.c

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,27 @@
1919
// AS_LINK_UR: ld.lld{{.*}} "--no-undefined"{{.*}} "--unresolved-symbols=ignore-all"
2020

2121
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
22-
// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefixes=LTO,MCPU %s
22+
// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=LTO %s
23+
// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
24+
// LTO: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx906"{{.*}}"--lto-partitions={{[0-9]+}}"
2325
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
2426
// RUN: -L. -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s
25-
// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
2627
// MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx906"
2728

2829
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
2930
// RUN: -fuse-ld=ld %s 2>&1 | FileCheck -check-prefixes=LD %s
3031
// LD: ld.lld
32+
33+
// Check --flto-partitions
34+
35+
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
36+
// RUN: -L. -flto --flto-partitions=42 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS %s
37+
// LTO_PARTS: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"--lto-partitions=42"
38+
39+
// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
40+
// RUN: -L. -flto --flto-partitions=a %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV0 %s
41+
// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a'
42+
43+
// RUN: not %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a -nogpulib \
44+
// RUN: -L. -flto --flto-partitions=0 %s 2>&1 | FileCheck -check-prefix=LTO_PARTS_INV1 %s
45+
// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0'
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// RUN: %clang -### --target=x86_64-linux-gnu \
2+
// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=42 \
3+
// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
4+
// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
5+
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
6+
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
7+
// RUN: 2>&1 | FileCheck %s --check-prefix=FIXED-PARTS
8+
9+
// FIXED-PARTS-NOT: "*.llvm-link"
10+
// FIXED-PARTS-NOT: ".*opt"
11+
// FIXED-PARTS-NOT: ".*llc"
12+
// FIXED-PARTS: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
13+
// FIXED-PARTS-SAME: "-plugin-opt=mcpu=gfx803"
14+
// FIXED-PARTS-SAME: "--lto-partitions=42"
15+
// FIXED-PARTS-SAME: "-o" "{{.*out}}" "{{.*bc}}"
16+
17+
// RUN: not %clang -### --target=x86_64-linux-gnu \
18+
// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=a \
19+
// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
20+
// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
21+
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
22+
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
23+
// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV0
24+
25+
// LTO_PARTS_INV0: clang: error: invalid integral value 'a' in '--flto-partitions=a'
26+
27+
// RUN: not %clang -### --target=x86_64-linux-gnu \
28+
// RUN: -x hip --cuda-gpu-arch=gfx803 --flto-partitions=0 \
29+
// RUN: --no-offload-new-driver --emit-static-lib -nogpulib \
30+
// RUN: -fuse-ld=lld -B%S/Inputs/lld -fgpu-rdc -nogpuinc \
31+
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
32+
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
33+
// RUN: 2>&1 | FileCheck %s --check-prefix=LTO_PARTS_INV1
34+
35+
// LTO_PARTS_INV1: clang: error: invalid integral value '0' in '--flto-partitions=0'

clang/test/Driver/hip-toolchain-rdc-static-lib.hip

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
// CHECK-NOT: ".*llc"
5050
// CHECK: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
5151
// CHECK-SAME: "-plugin-opt=mcpu=gfx803"
52+
// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
5253
// CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]]
5354

5455
// generate image for device side path on gfx900
@@ -77,6 +78,7 @@
7778
// CHECK-NOT: ".*llc"
7879
// CHECK: [[LLD]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
7980
// CHECK-SAME: "-plugin-opt=mcpu=gfx900"
81+
// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
8082
// CHECK-SAME: "--whole-archive"
8183
// CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[A_BC2]] [[B_BC2]]
8284
// CHECK-SAME: "--no-whole-archive"

clang/test/Driver/hip-toolchain-rdc.hip

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@
147147
// CHECK-NOT: ".*llc"
148148
// CHECK: {{".*lld.*"}} {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
149149
// CHECK-SAME: "-plugin-opt=mcpu=gfx900"
150+
// CHECK-SAME: "--lto-partitions={{[0-9]+}}"
150151
// CHECK-SAME: "-o" "[[IMG_DEV2:.*.out]]" [[A_BC2]] [[B_BC2]]
151152

152153
// combine images generated into hip fat binary object

0 commit comments

Comments
 (0)