Skip to content

Commit 6c25662

Browse files
committed
[AMDGPU] Add AMDGPU-specific module splitting
This enables the --lto-partitions option to work more consistently. This module splitting logic is fully aware of AMDGPU modules and their specificities and takes advantage of them to split modules in a way that avoids compilation issues (such as resource usage being incorrectly represented). This also includes a logging system that's more elaborate than just LLVM_DEBUG, which allows printing logs to uniquely named files, optionally with all value names hidden so they can be safely shared without leaking information about the source. Logs can also be enabled through an environment variable, which avoids the sometimes complicated process of passing a -mllvm option all the way from the clang driver to the offload linker that handles full LTO codegen.
1 parent 6bb5065 commit 6c25662

18 files changed

+1416
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp

Lines changed: 733 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
//===- AMDGPUSplitModule.h -------------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef LLVM_TARGET_AMDGPUSPLITMODULE_H
12+
#define LLVM_TARGET_AMDGPUSPLITMODULE_H
13+
14+
#include "llvm/ADT/STLFunctionalExtras.h"
15+
#include <memory>
16+
17+
namespace llvm {
18+
19+
class Module;
20+
class AMDGPUTargetMachine;
21+
22+
/// Splits the module M into N linkable partitions. The function ModuleCallback
23+
/// is called N times passing each individual partition as the MPart argument.
24+
void splitAMDGPUModule(
25+
const AMDGPUTargetMachine &TM, Module &M, unsigned N,
26+
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback);
27+
28+
} // end namespace llvm
29+
30+
#endif // LLVM_TARGET_AMDGPUSPLITMODULE_H

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "AMDGPUIGroupLP.h"
2121
#include "AMDGPUMacroFusion.h"
2222
#include "AMDGPURegBankSelect.h"
23+
#include "AMDGPUSplitModule.h"
2324
#include "AMDGPUTargetObjectFile.h"
2425
#include "AMDGPUTargetTransformInfo.h"
2526
#include "AMDGPUUnifyDivergentExitNodes.h"
@@ -806,6 +807,13 @@ AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
806807
return AMDGPUAS::FLAT_ADDRESS;
807808
}
808809

810+
bool AMDGPUTargetMachine::splitModule(
811+
Module &M, unsigned NumParts,
812+
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const {
813+
splitAMDGPUModule(*this, M, NumParts, ModuleCallback);
814+
return true;
815+
}
816+
809817
//===----------------------------------------------------------------------===//
810818
// GCN Target Machine (SI+)
811819
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
6767
getPredicatedAddrSpace(const Value *V) const override;
6868

6969
unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override;
70+
71+
bool splitModule(Module &M, unsigned NumParts,
72+
function_ref<void(std::unique_ptr<Module> MPart)>
73+
ModuleCallback) const override;
7074
};
7175

7276
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ add_llvm_target(AMDGPUCodeGen
9797
AMDGPURewriteOutArguments.cpp
9898
AMDGPURewriteUndefForPHI.cpp
9999
AMDGPUSetWavePriority.cpp
100+
AMDGPUSplitModule.cpp
100101
AMDGPUSubtarget.cpp
101102
AMDGPUTargetMachine.cpp
102103
AMDGPUTargetObjectFile.cpp
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
; RUN: llvm-split -o %t %s -j 3 -mtarget amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
2+
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
3+
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
4+
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
5+
6+
; 3 kernels:
7+
; - A does a direct call to HelperA
8+
; - B is storing @HelperA
9+
; - C does a direct call to HelperA
10+
;
11+
; The helper functions will get externalized, which will force A and C into P0 as
12+
; external functions cannot be duplicated.
13+
14+
; CHECK0: define hidden void @HelperA()
15+
; CHECK0: define amdgpu_kernel void @A()
16+
; CHECK0: declare amdgpu_kernel void @B(ptr)
17+
; CHECK0: define amdgpu_kernel void @C()
18+
19+
; CHECK1: declare hidden void @HelperA()
20+
; CHECK1: declare amdgpu_kernel void @A()
21+
; CHECK1: declare amdgpu_kernel void @B(ptr)
22+
; CHECK1: declare amdgpu_kernel void @C()
23+
24+
; CHECK2: declare hidden void @HelperA()
25+
; CHECK2: declare amdgpu_kernel void @A()
26+
; CHECK2: define amdgpu_kernel void @B(ptr %dst)
27+
; CHECK2: declare amdgpu_kernel void @C()
28+
29+
define internal void @HelperA() {
30+
ret void
31+
}
32+
33+
define amdgpu_kernel void @A() {
34+
call void @HelperA()
35+
ret void
36+
}
37+
38+
define amdgpu_kernel void @B(ptr %dst) {
39+
store ptr @HelperA, ptr %dst
40+
ret void
41+
}
42+
43+
define amdgpu_kernel void @C() {
44+
call void @HelperA()
45+
ret void
46+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
; RUN: llvm-split -o %t %s -j 2 -mtarget amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
2+
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
3+
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
4+
5+
; 2 kernels:
6+
; - A is isolated
7+
; - B is storing @HelperA/B's address
8+
;
9+
; The helper functions should get externalized (become hidden w/ external linkage)
10+
11+
; CHECK0: define hidden void @HelperA()
12+
; CHECK0: define hidden void @HelperB()
13+
; CHECK0: define amdgpu_kernel void @A()
14+
; CHECK0: declare amdgpu_kernel void @B(i1, ptr)
15+
16+
; CHECK1: declare hidden void @HelperA()
17+
; CHECK1: declare hidden void @HelperB()
18+
; CHECK1: declare amdgpu_kernel void @A()
19+
; CHECK1: define amdgpu_kernel void @B(i1 %cond, ptr %dst)
20+
21+
define internal void @HelperA() {
22+
ret void
23+
}
24+
25+
define internal void @HelperB() {
26+
ret void
27+
}
28+
29+
define amdgpu_kernel void @A() {
30+
ret void
31+
}
32+
33+
define amdgpu_kernel void @B(i1 %cond, ptr %dst) {
34+
%addr = select i1 %cond, ptr @HelperA, ptr @HelperB
35+
store ptr %addr, ptr %dst
36+
ret void
37+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
; RUN: llvm-split -o %t %s -j 3 -mtarget amdgcn-amd-amdhsa
2+
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
3+
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
4+
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
5+
6+
; 3 kernels, each with its own dependencies, should go into 3
7+
; distinct partitions. The most expensive kernel should be
8+
; seen first and go into the last partition.
9+
10+
; CHECK0-NOT: define
11+
; CHECK0: define amdgpu_kernel void @C
12+
; CHECK0: define internal void @HelperC
13+
; CHECK0-NOT: define
14+
15+
; CHECK1-NOT: define
16+
; CHECK1: define amdgpu_kernel void @A
17+
; CHECK1: define internal void @HelperA
18+
; CHECK1-NOT: define
19+
20+
; CHECK2-NOT: define
21+
; CHECK2: define amdgpu_kernel void @B
22+
; CHECK2: define internal void @HelperB
23+
; CHECK2-NOT: define
24+
25+
26+
define amdgpu_kernel void @A() {
27+
call void @HelperA()
28+
ret void
29+
}
30+
31+
define internal void @HelperA() {
32+
ret void
33+
}
34+
35+
define amdgpu_kernel void @B() {
36+
store i64 42, ptr undef
37+
store i64 43, ptr undef
38+
store i64 44, ptr undef
39+
call void @HelperB()
40+
ret void
41+
}
42+
43+
define internal void @HelperB() {
44+
ret void
45+
}
46+
47+
define amdgpu_kernel void @C() {
48+
call void @HelperC()
49+
ret void
50+
}
51+
52+
define internal void @HelperC() {
53+
ret void
54+
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
; RUN: llvm-split -o %t %s -j 3 -mtarget amdgcn-amd-amdhsa
2+
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
3+
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
4+
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
5+
6+
; 3 kernels, each with its own dependencies, should go into 3
7+
; distinct partitions.
8+
9+
; CHECK0-NOT: define
10+
; CHECK0: define amdgpu_kernel void @C
11+
; CHECK0: define internal void @HelperC
12+
; CHECK0-NOT: define
13+
14+
; CHECK1-NOT: define
15+
; CHECK1: define amdgpu_kernel void @B
16+
; CHECK1: define internal void @HelperB
17+
; CHECK1-NOT: define
18+
19+
; CHECK2-NOT: define
20+
; CHECK2: define amdgpu_kernel void @A
21+
; CHECK2: define internal void @HelperA
22+
; CHECK2-NOT: define
23+
24+
25+
define amdgpu_kernel void @A() {
26+
call void @HelperA()
27+
ret void
28+
}
29+
30+
define internal void @HelperA() {
31+
ret void
32+
}
33+
34+
define amdgpu_kernel void @B() {
35+
call void @HelperB()
36+
ret void
37+
}
38+
39+
define internal void @HelperB() {
40+
ret void
41+
}
42+
43+
define amdgpu_kernel void @C() {
44+
call void @HelperC()
45+
ret void
46+
}
47+
48+
define internal void @HelperC() {
49+
ret void
50+
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
; RUN: llvm-split -o %t %s -j 3 -mtarget amdgcn-amd-amdhsa
2+
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
3+
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
4+
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
5+
6+
; 3 kernels share a common helper; that helper should be
7+
; cloned into every partition.
8+
9+
; CHECK0-NOT: define
10+
; CHECK0: define internal void @Helper
11+
; CHECK0: define amdgpu_kernel void @C
12+
; CHECK0-NOT: define
13+
14+
; CHECK1-NOT: define
15+
; CHECK1: define internal void @Helper
16+
; CHECK1: define amdgpu_kernel void @B
17+
; CHECK1-NOT: define
18+
19+
; CHECK2-NOT: define
20+
; CHECK2: define internal void @Helper
21+
; CHECK2: define amdgpu_kernel void @A
22+
; CHECK2-NOT: define
23+
24+
define internal void @Helper() {
25+
ret void
26+
}
27+
28+
define amdgpu_kernel void @A() {
29+
call void @Helper()
30+
ret void
31+
}
32+
33+
define amdgpu_kernel void @B() {
34+
call void @Helper()
35+
ret void
36+
}
37+
38+
define amdgpu_kernel void @C() {
39+
call void @Helper()
40+
ret void
41+
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
; RUN: llvm-split -o %t %s -j 3 -mtarget amdgcn-amd-amdhsa
2+
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
3+
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
4+
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
5+
6+
; 4 kernels: A and B both call the external function @ExternalHelper, C and D call nothing.
7+
; @ExternalHelper must not be duplicated, so A and B go into the partition that defines it.
8+
9+
; CHECK0-NOT: define
10+
; CHECK0: define void @ExternalHelper
11+
; CHECK0: define amdgpu_kernel void @A
12+
; CHECK0: define amdgpu_kernel void @B
13+
; CHECK0-NOT: define
14+
15+
; CHECK1-NOT: define
16+
; CHECK1: define amdgpu_kernel void @D
17+
; CHECK1-NOT: define
18+
19+
; CHECK2-NOT: define
20+
; CHECK2: define amdgpu_kernel void @C
21+
; CHECK2-NOT: define
22+
23+
define void @ExternalHelper() {
24+
ret void
25+
}
26+
27+
define amdgpu_kernel void @A() {
28+
call void @ExternalHelper()
29+
ret void
30+
}
31+
32+
define amdgpu_kernel void @B() {
33+
call void @ExternalHelper()
34+
ret void
35+
}
36+
37+
define amdgpu_kernel void @C() {
38+
ret void
39+
}
40+
41+
define amdgpu_kernel void @D() {
42+
ret void
43+
}

0 commit comments

Comments
 (0)