Skip to content

Commit 184d258

Browse files
[SYCL] Add new auto device code split mode (intel#2827)
This patch introduces a new device code split mode, `auto`, which is intended to automatically select the best device code split mode and apply it. At the moment, `auto` is equivalent to `per_source` in most cases, and equivalent to `off` when function pointers are present.
1 parent 0b4e215 commit 184d258

File tree

8 files changed

+374
-21
lines changed

8 files changed

+374
-21
lines changed

clang/include/clang/Driver/Options.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2220,11 +2220,11 @@ def fsycl_link_targets_EQ : CommaJoined<["-"], "fsycl-link-targets=">, Flags<[No
22202220
HelpText<"Specify comma-separated list of triples SYCL offloading targets to produce linked device images">;
22212221
def fsycl_device_code_split_EQ : Joined<["-"], "fsycl-device-code-split=">,
22222222
Flags<[CC1Option, CoreOption]>, HelpText<"Perform SYCL device code split: per_kernel (device code module is "
2223-
"created for each SYCL kernel) | per_source (device code module is created for each source (translation unit)) | off (no device code split). "
2224-
"Default is 'off' - all kernels go into a single module`">, Values<"per_source, per_kernel, off">;
2223+
"created for each SYCL kernel) | per_source (device code module is created for each source (translation unit)) | off (no device code split) | auto (use heuristic to select the best way of splitting device code). "
2224+
"Default is 'auto' - use heuristic to distribute device code across modules">, Values<"per_source, per_kernel, off, auto">;
22252225
def fsycl_device_code_split : Flag<["-"], "fsycl-device-code-split">, Alias<fsycl_device_code_split_EQ>,
2226-
AliasArgs<["per_source"]>, Flags<[CC1Option, CoreOption]>,
2227-
HelpText<"Perform SYCL device code split in the per_source mode i.e. create a device code module for each source (translation unit)">;
2226+
AliasArgs<["auto"]>, Flags<[CC1Option, CoreOption]>,
2227+
HelpText<"Perform SYCL device code split in the 'auto' mode, i.e. use heuristic to distribute device code across modules">;
22282228
def fsycl_id_queries_fit_in_int : Flag<["-"], "fsycl-id-queries-fit-in-int">,
22292229
Flags<[CC1Option, CoreOption]>, HelpText<"Assume that SYCL ID queries fit "
22302230
"within MAX_INT.">;

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8143,12 +8143,17 @@ void SYCLPostLink::ConstructJob(Compilation &C, const JobAction &JA,
81438143
addArgs(CmdArgs, TCArgs, {"-split=kernel"});
81448144
else if (StringRef(A->getValue()) == "per_source")
81458145
addArgs(CmdArgs, TCArgs, {"-split=source"});
8146+
else if (StringRef(A->getValue()) == "auto")
8147+
addArgs(CmdArgs, TCArgs, {"-split=auto"});
81468148
else
81478149
// split must be off
81488150
assert(StringRef(A->getValue()) == "off");
8151+
} else {
8152+
// auto is the default split mode
8153+
addArgs(CmdArgs, TCArgs, {"-split=auto"});
81498154
}
81508155
// OPT_fsycl_device_code_split is not checked as it is an alias to
8151-
// -fsycl-device-code-split=per_source
8156+
// -fsycl-device-code-split=auto
81528157

81538158
// Turn on Dead Parameter Elimination Optimization with early optimizations
81548159
if (!getToolChain().getTriple().isNVPTX() &&

clang/test/Driver/sycl-offload-with-split.c

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@
206206
// RUN: | FileCheck %s -check-prefixes=CHK-TOOLS-AOT,CHK-TOOLS-CPU
207207
// CHK-TOOLS-AOT: clang{{.*}} "-fsycl-is-device" {{.*}} "-o" "[[OUTPUT1:.+\.bc]]"
208208
// CHK-TOOLS-AOT: llvm-link{{.*}} "[[OUTPUT1]]" "-o" "[[OUTPUT2:.+\.bc]]"
209-
// CHK-TOOLS-AOT: sycl-post-link{{.*}} "-spec-const=default" "-o" "[[OUTPUT3:.+\.table]]" "[[OUTPUT2]]"
209+
// CHK-TOOLS-AOT: sycl-post-link{{.*}} "-split=auto" {{.*}} "-spec-const=default" "-o" "[[OUTPUT3:.+\.table]]" "[[OUTPUT2]]"
210210
// CHK-TOOLS-AOT: file-table-tform{{.*}} "-o" "[[OUTPUT4:.+\.txt]]" "[[OUTPUT3]]"
211211
// CHK-TOOLS-AOT: llvm-foreach{{.*}} "--in-file-list=[[OUTPUT4]]" "--in-replace=[[OUTPUT4]]" "--out-ext=spv" "--out-file-list=[[OUTPUT5:.+\.txt]]" "--out-replace=[[OUTPUT5]]" "--" "{{.*}}llvm-spirv{{.*}}" "-o" "[[OUTPUT5]]" {{.*}} "[[OUTPUT4]]"
212212
// CHK-TOOLS-FPGA: llvm-foreach{{.*}} "--out-file-list=[[OUTPUT6:.+\.txt]]{{.*}} "--" "{{.*}}aoc{{.*}} "-o" "[[OUTPUT6]]" "[[OUTPUT5]]"
@@ -271,13 +271,33 @@
271271
// CHK-PHASE-MULTI-TARG: 36: clang-offload-wrapper, {35}, object, (device-sycl)
272272
// CHK-PHASE-MULTI-TARG: 37: offload, "host-sycl (x86_64-unknown-linux-gnu)" {9}, "device-sycl (spir64-unknown-unknown-sycldevice)" {18}, "device-sycl (spir64_fpga-unknown-unknown-sycldevice)" {28}, "device-sycl (spir64_gen-unknown-unknown-sycldevice)" {36}, image
273273

274-
// Check -fsycl-one-kernel-per-module option passing.
274+
// Check -fsycl-device-code-split=per_kernel option passing.
275275
// RUN: %clang -### -fsycl -fsycl-device-code-split=per_kernel %s 2>&1 \
276276
// RUN: | FileCheck %s -check-prefixes=CHK-ONE-KERNEL
277277
// RUN: %clang_cl -### -fsycl -fsycl-device-code-split=per_kernel %s 2>&1 \
278278
// RUN: | FileCheck %s -check-prefixes=CHK-ONE-KERNEL
279279
// CHK-ONE-KERNEL: sycl-post-link{{.*}} "-split=kernel"{{.*}} "-o"{{.*}}
280280

281+
// Check -fsycl-device-code-split=per_source option passing.
282+
// RUN: %clang -### -fsycl -fsycl-device-code-split=per_source %s 2>&1 \
283+
// RUN: | FileCheck %s -check-prefixes=CHK-PER-SOURCE
284+
// RUN: %clang_cl -### -fsycl -fsycl-device-code-split=per_source %s 2>&1 \
285+
// RUN: | FileCheck %s -check-prefixes=CHK-PER-SOURCE
286+
// CHK-PER-SOURCE: sycl-post-link{{.*}} "-split=source"{{.*}} "-o"{{.*}}
287+
288+
// Check -fsycl-device-code-split option passing.
289+
// RUN: %clang -### -fsycl -fsycl-device-code-split %s 2>&1 \
290+
// RUN: | FileCheck %s -check-prefixes=CHK-AUTO
291+
// RUN: %clang_cl -### -fsycl -fsycl-device-code-split %s 2>&1 \
292+
// RUN: | FileCheck %s -check-prefixes=CHK-AUTO
293+
// RUN: %clang -### -fsycl -fsycl-device-code-split=auto %s 2>&1 \
294+
// RUN: | FileCheck %s -check-prefixes=CHK-AUTO
295+
// RUN: %clang_cl -### -fsycl -fsycl-device-code-split=auto %s 2>&1 \
296+
// RUN: | FileCheck %s -check-prefixes=CHK-AUTO
297+
// RUN: %clang -### -fsycl %s 2>&1 | FileCheck %s -check-prefixes=CHK-AUTO
298+
// RUN: %clang_cl -### -fsycl %s 2>&1 | FileCheck %s -check-prefixes=CHK-AUTO
299+
// CHK-AUTO: sycl-post-link{{.*}} "-split=auto"{{.*}} "-o"{{.*}}
300+
281301
// Check no device code split mode.
282302
// RUN: %clang -### -fsycl -fsycl-device-code-split -fsycl-device-code-split=off %s 2>&1 \
283303
// RUN: | FileCheck %s -check-prefixes=CHK-NO-SPLIT
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
2+
; By default auto mode is equal to source mode
3+
; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK
4+
; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK
5+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT
6+
; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT
7+
8+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
9+
target triple = "spir64-unknown-linux-sycldevice"
10+
11+
$_Z3barIiET_S0_ = comdat any
12+
13+
; CHECK-TU0-NOT: @{{.*}}GV{{.*}}
14+
; CHECK-TU1: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4
15+
@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4
16+
17+
; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}}
18+
; CHECK-TU0-TXT: {{.*}}TU0_kernel0{{.*}}
19+
; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}}
20+
; CHECK-TU1-TXT-NOT: {{.*}}TU0_kernel0{{.*}}
21+
22+
; CHECK-TU0: call spir_func void @{{.*}}foo{{.*}}()
23+
24+
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 {
25+
entry:
26+
call spir_func void @_Z3foov()
27+
ret void
28+
}
29+
30+
; CHECK-TU0: define dso_local spir_func void @{{.*}}foo{{.*}}()
31+
; CHECK-TU1-NOT: define dso_local spir_func void @{{.*}}foo{{.*}}()
32+
33+
; CHECK-TU0: call spir_func i32 @{{.*}}bar{{.*}}(i32 1)
34+
35+
define dso_local spir_func void @_Z3foov() {
36+
entry:
37+
%a = alloca i32, align 4
38+
%call = call spir_func i32 @_Z3barIiET_S0_(i32 1)
39+
%add = add nsw i32 2, %call
40+
store i32 %add, i32* %a, align 4
41+
ret void
42+
}
43+
44+
; CHECK-TU0: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg)
45+
; CHECK-TU1-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg)
46+
47+
; Function Attrs: nounwind
48+
define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat {
49+
entry:
50+
%arg.addr = alloca i32, align 4
51+
store i32 %arg, i32* %arg.addr, align 4
52+
%0 = load i32, i32* %arg.addr, align 4
53+
ret i32 %0
54+
}
55+
56+
; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}()
57+
; CHECK-TU0-TXT: {{.*}}TU0_kernel1{{.*}}
58+
; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}()
59+
; CHECK-TU1-TXT-NOT: {{.*}}TU0_kernel1{{.*}}
60+
61+
; CHECK-TU0: call spir_func void @{{.*}}foo1{{.*}}()
62+
63+
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 {
64+
entry:
65+
call spir_func void @_Z4foo1v()
66+
ret void
67+
}
68+
69+
; CHECK-TU0: define dso_local spir_func void @{{.*}}foo1{{.*}}()
70+
; CHECK-TU1-NOT: define dso_local spir_func void @{{.*}}foo1{{.*}}()
71+
72+
; Function Attrs: nounwind
73+
define dso_local spir_func void @_Z4foo1v() {
74+
entry:
75+
%a = alloca i32, align 4
76+
store i32 2, i32* %a, align 4
77+
ret void
78+
}
79+
80+
; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}()
81+
; CHECK-TU0-TXT-NOT: {{.*}}TU1_kernel{{.*}}
82+
; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}()
83+
; CHECK-TU1-TXT: {{.*}}TU1_kernel{{.*}}
84+
85+
; CHECK-TU1: call spir_func void @{{.*}}foo2{{.*}}()
86+
87+
define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 {
88+
entry:
89+
call spir_func void @_Z4foo2v()
90+
ret void
91+
}
92+
93+
; CHECK-TU0-NOT: define dso_local spir_func void @{{.*}}foo2{{.*}}()
94+
; CHECK-TU1: define dso_local spir_func void @{{.*}}foo2{{.*}}()
95+
96+
; Function Attrs: nounwind
97+
define dso_local spir_func void @_Z4foo2v() {
98+
entry:
99+
%a = alloca i32, align 4
100+
; CHECK-TU1: %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @{{.*}}GV{{.*}} to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
101+
%0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
102+
%add = add nsw i32 4, %0
103+
store i32 %add, i32* %a, align 4
104+
ret void
105+
}
106+
107+
attributes #0 = { "sycl-module-id"="TU1.cpp" }
108+
attributes #1 = { "sycl-module-id"="TU2.cpp" }
109+
110+
; Metadata is saved in both modules.
111+
; CHECK: !opencl.spir.version = !{!0, !0}
112+
; CHECK: !spirv.Source = !{!1, !1}
113+
114+
!opencl.spir.version = !{!0, !0}
115+
!spirv.Source = !{!1, !1}
116+
117+
; CHECK: !0 = !{i32 1, i32 2}
118+
; CHECK: !1 = !{i32 4, i32 100000}
119+
120+
!0 = !{i32 1, i32 2}
121+
!1 = !{i32 4, i32 100000}
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
2+
; In the presence of an indirectly callable function, auto mode is equal to no split,
3+
; which means that separate LLVM IR file for device is not generated and we only
4+
; need to check generated symbol table
5+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK
6+
7+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
8+
target triple = "spir64-unknown-linux-sycldevice"
9+
10+
$_Z3barIiET_S0_ = comdat any
11+
12+
@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4
13+
14+
; CHECK: {{.*}}TU0_kernel0{{.*}}
15+
16+
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 {
17+
entry:
18+
call spir_func void @_Z3foov()
19+
ret void
20+
}
21+
22+
define dso_local spir_func void @_Z3foov() #2 {
23+
entry:
24+
%a = alloca i32, align 4
25+
%call = call spir_func i32 @_Z3barIiET_S0_(i32 1)
26+
%add = add nsw i32 2, %call
27+
store i32 %add, i32* %a, align 4
28+
ret void
29+
}
30+
31+
; Function Attrs: nounwind
32+
define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat {
33+
entry:
34+
%arg.addr = alloca i32, align 4
35+
store i32 %arg, i32* %arg.addr, align 4
36+
%0 = load i32, i32* %arg.addr, align 4
37+
ret i32 %0
38+
}
39+
40+
; CHECK: {{.*}}TU0_kernel1{{.*}}
41+
42+
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 {
43+
entry:
44+
call spir_func void @_Z4foo1v()
45+
ret void
46+
}
47+
48+
; Function Attrs: nounwind
49+
define dso_local spir_func void @_Z4foo1v() {
50+
entry:
51+
%a = alloca i32, align 4
52+
store i32 2, i32* %a, align 4
53+
ret void
54+
}
55+
; CHECK: {{.*}}TU1_kernel{{.*}}
56+
57+
define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 {
58+
entry:
59+
call spir_func void @_Z4foo2v()
60+
ret void
61+
}
62+
63+
; Function Attrs: nounwind
64+
define dso_local spir_func void @_Z4foo2v() {
65+
entry:
66+
%a = alloca i32, align 4
67+
%0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
68+
%add = add nsw i32 4, %0
69+
store i32 %add, i32* %a, align 4
70+
ret void
71+
}
72+
73+
attributes #0 = { "sycl-module-id"="TU1.cpp" }
74+
attributes #1 = { "sycl-module-id"="TU2.cpp" }
75+
attributes #2 = { "referenced-indirectly" }
76+
77+
!opencl.spir.version = !{!0, !0}
78+
!spirv.Source = !{!1, !1}
79+
80+
!0 = !{i32 1, i32 2}
81+
!1 = !{i32 4, i32 100000}
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
2+
; In the presence of indirect calls, auto mode is equal to no split,
3+
; which means that separate LLVM IR file for device is not generated and we only
4+
; need to check generated symbol table
5+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK
6+
7+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
8+
target triple = "spir64-unknown-linux-sycldevice"
9+
10+
$_Z3barIiET_S0_ = comdat any
11+
12+
@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4
13+
14+
; CHECK: {{.*}}TU0_kernel0{{.*}}
15+
16+
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 {
17+
entry:
18+
call spir_func void @_Z3foov()
19+
ret void
20+
}
21+
22+
define dso_local spir_func void @_Z3foov() {
23+
entry:
24+
%a = alloca i32, align 4
25+
%ptr = bitcast i32* %a to i32 (i32)*
26+
%call = call spir_func i32 %ptr(i32 1)
27+
%add = add nsw i32 2, %call
28+
store i32 %add, i32* %a, align 4
29+
ret void
30+
}
31+
32+
; Function Attrs: nounwind
33+
define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat {
34+
entry:
35+
%arg.addr = alloca i32, align 4
36+
store i32 %arg, i32* %arg.addr, align 4
37+
%0 = load i32, i32* %arg.addr, align 4
38+
ret i32 %0
39+
}
40+
41+
; CHECK: {{.*}}TU0_kernel1{{.*}}
42+
43+
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 {
44+
entry:
45+
call spir_func void @_Z4foo1v()
46+
ret void
47+
}
48+
49+
; Function Attrs: nounwind
50+
define dso_local spir_func void @_Z4foo1v() {
51+
entry:
52+
%a = alloca i32, align 4
53+
store i32 2, i32* %a, align 4
54+
ret void
55+
}
56+
; CHECK: {{.*}}TU1_kernel{{.*}}
57+
58+
define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 {
59+
entry:
60+
call spir_func void @_Z4foo2v()
61+
ret void
62+
}
63+
64+
; Function Attrs: nounwind
65+
define dso_local spir_func void @_Z4foo2v() {
66+
entry:
67+
%a = alloca i32, align 4
68+
%0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
69+
%add = add nsw i32 4, %0
70+
store i32 %add, i32* %a, align 4
71+
ret void
72+
}
73+
74+
attributes #0 = { "sycl-module-id"="TU1.cpp" }
75+
attributes #1 = { "sycl-module-id"="TU2.cpp" }
76+
77+
!opencl.spir.version = !{!0, !0}
78+
!spirv.Source = !{!1, !1}
79+
80+
!0 = !{i32 1, i32 2}
81+
!1 = !{i32 4, i32 100000}

llvm/test/tools/sycl-post-link/help.test

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,5 @@ CHECK: =default - set spec constants to C++ defaults
5252
CHECK: --split=<value> - split input module
5353
CHECK: =source - 1 output module per source (translation unit)
5454
CHECK: =kernel - 1 output module per kernel
55+
CHECK: =auto - Choose split mode automatically
5556
CHECK: --symbols - generate exported symbol files

0 commit comments

Comments
 (0)