Skip to content

Commit 73faebb

Browse files
committed
Improve implementation and tests
Added LIT tests. Update heuristic to disable device code split when indirect calls are present in the input module.
1 parent d29970f commit 73faebb

File tree

6 files changed

+324
-7
lines changed

6 files changed

+324
-7
lines changed

clang/test/Driver/sycl-offload-with-split.c

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@
206206
// RUN: | FileCheck %s -check-prefixes=CHK-TOOLS-AOT,CHK-TOOLS-CPU
207207
// CHK-TOOLS-AOT: clang{{.*}} "-fsycl-is-device" {{.*}} "-o" "[[OUTPUT1:.+\.bc]]"
208208
// CHK-TOOLS-AOT: llvm-link{{.*}} "[[OUTPUT1]]" "-o" "[[OUTPUT2:.+\.bc]]"
209-
// CHK-TOOLS-AOT: sycl-post-link{{.*}} "-spec-const=default" "-o" "[[OUTPUT3:.+\.table]]" "[[OUTPUT2]]"
209+
// CHK-TOOLS-AOT: sycl-post-link{{.*}} "-split=auto" {{.*}} "-spec-const=default" "-o" "[[OUTPUT3:.+\.table]]" "[[OUTPUT2]]"
210210
// CHK-TOOLS-AOT: file-table-tform{{.*}} "-o" "[[OUTPUT4:.+\.txt]]" "[[OUTPUT3]]"
211211
// CHK-TOOLS-AOT: llvm-foreach{{.*}} "--in-file-list=[[OUTPUT4]]" "--in-replace=[[OUTPUT4]]" "--out-ext=spv" "--out-file-list=[[OUTPUT5:.+\.txt]]" "--out-replace=[[OUTPUT5]]" "--" "{{.*}}llvm-spirv{{.*}}" "-o" "[[OUTPUT5]]" {{.*}} "[[OUTPUT4]]"
212212
// CHK-TOOLS-FPGA: llvm-foreach{{.*}} "--out-file-list=[[OUTPUT6:.+\.txt]]{{.*}} "--" "{{.*}}aoc{{.*}} "-o" "[[OUTPUT6]]" "[[OUTPUT5]]"
@@ -271,13 +271,33 @@
271271
// CHK-PHASE-MULTI-TARG: 36: clang-offload-wrapper, {35}, object, (device-sycl)
272272
// CHK-PHASE-MULTI-TARG: 37: offload, "host-sycl (x86_64-unknown-linux-gnu)" {9}, "device-sycl (spir64-unknown-unknown-sycldevice)" {18}, "device-sycl (spir64_fpga-unknown-unknown-sycldevice)" {28}, "device-sycl (spir64_gen-unknown-unknown-sycldevice)" {36}, image
273273

274-
// Check -fsycl-one-kernel-per-module option passing.
274+
// Check -fsycl-device-code-split=per_kernel option passing.
275275
// RUN: %clang -### -fsycl -fsycl-device-code-split=per_kernel %s 2>&1 \
276276
// RUN: | FileCheck %s -check-prefixes=CHK-ONE-KERNEL
277277
// RUN: %clang_cl -### -fsycl -fsycl-device-code-split=per_kernel %s 2>&1 \
278278
// RUN: | FileCheck %s -check-prefixes=CHK-ONE-KERNEL
279279
// CHK-ONE-KERNEL: sycl-post-link{{.*}} "-split=kernel"{{.*}} "-o"{{.*}}
280280

281+
// Check -fsycl-device-code-split=per_source option passing.
282+
// RUN: %clang -### -fsycl -fsycl-device-code-split=per_source %s 2>&1 \
283+
// RUN: | FileCheck %s -check-prefixes=CHK-PER-SOURCE
284+
// RUN: %clang_cl -### -fsycl -fsycl-device-code-split=per_source %s 2>&1 \
285+
// RUN: | FileCheck %s -check-prefixes=CHK-PER-SOURCE
286+
// CHK-PER-SOURCE: sycl-post-link{{.*}} "-split=source"{{.*}} "-o"{{.*}}
287+
288+
// Check -fsycl-device-code-split option passing.
289+
// RUN: %clang -### -fsycl -fsycl-device-code-split %s 2>&1 \
290+
// RUN: | FileCheck %s -check-prefixes=CHK-AUTO
291+
// RUN: %clang_cl -### -fsycl -fsycl-device-code-split %s 2>&1 \
292+
// RUN: | FileCheck %s -check-prefixes=CHK-AUTO
293+
// RUN: %clang -### -fsycl -fsycl-device-code-split=auto %s 2>&1 \
294+
// RUN: | FileCheck %s -check-prefixes=CHK-AUTO
295+
// RUN: %clang_cl -### -fsycl -fsycl-device-code-split=auto %s 2>&1 \
296+
// RUN: | FileCheck %s -check-prefixes=CHK-AUTO
297+
// RUN: %clang -### -fsycl %s 2>&1 | FileCheck %s -check-prefixes=CHK-AUTO
298+
// RUN: %clang_cl -### -fsycl %s 2>&1 | FileCheck %s -check-prefixes=CHK-AUTO
299+
// CHK-AUTO: sycl-post-link{{.*}} "-split=auto"{{.*}} "-o"{{.*}}
300+
281301
// Check no device code split mode.
282302
// RUN: %clang -### -fsycl -fsycl-device-code-split -fsycl-device-code-split=off %s 2>&1 \
283303
// RUN: | FileCheck %s -check-prefixes=CHK-NO-SPLIT
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
2+
; By default auto mode is equal to source mode
3+
; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK
4+
; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK
5+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT
6+
; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT
7+
8+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
9+
target triple = "spir64-unknown-linux-sycldevice"
10+
11+
$_Z3barIiET_S0_ = comdat any
12+
13+
; CHECK-TU0-NOT: @{{.*}}GV{{.*}}
14+
; CHECK-TU1: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4
15+
@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4
16+
17+
; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}}
18+
; CHECK-TU0-TXT: {{.*}}TU0_kernel0{{.*}}
19+
; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}}
20+
; CHECK-TU1-TXT-NOT: {{.*}}TU0_kernel0{{.*}}
21+
22+
; CHECK-TU0: call spir_func void @{{.*}}foo{{.*}}()
23+
24+
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 {
25+
entry:
26+
call spir_func void @_Z3foov()
27+
ret void
28+
}
29+
30+
; CHECK-TU0: define dso_local spir_func void @{{.*}}foo{{.*}}()
31+
; CHECK-TU1-NOT: define dso_local spir_func void @{{.*}}foo{{.*}}()
32+
33+
; CHECK-TU0: call spir_func i32 @{{.*}}bar{{.*}}(i32 1)
34+
35+
define dso_local spir_func void @_Z3foov() {
36+
entry:
37+
%a = alloca i32, align 4
38+
%call = call spir_func i32 @_Z3barIiET_S0_(i32 1)
39+
%add = add nsw i32 2, %call
40+
store i32 %add, i32* %a, align 4
41+
ret void
42+
}
43+
44+
; CHECK-TU0: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg)
45+
; CHECK-TU1-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg)
46+
47+
; Function Attrs: nounwind
48+
define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat {
49+
entry:
50+
%arg.addr = alloca i32, align 4
51+
store i32 %arg, i32* %arg.addr, align 4
52+
%0 = load i32, i32* %arg.addr, align 4
53+
ret i32 %0
54+
}
55+
56+
; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}()
57+
; CHECK-TU0-TXT: {{.*}}TU0_kernel1{{.*}}
58+
; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}()
59+
; CHECK-TU1-TXT-NOT: {{.*}}TU0_kernel1{{.*}}
60+
61+
; CHECK-TU0: call spir_func void @{{.*}}foo1{{.*}}()
62+
63+
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 {
64+
entry:
65+
call spir_func void @_Z4foo1v()
66+
ret void
67+
}
68+
69+
; CHECK-TU0: define dso_local spir_func void @{{.*}}foo1{{.*}}()
70+
; CHECK-TU1-NOT: define dso_local spir_func void @{{.*}}foo1{{.*}}()
71+
72+
; Function Attrs: nounwind
73+
define dso_local spir_func void @_Z4foo1v() {
74+
entry:
75+
%a = alloca i32, align 4
76+
store i32 2, i32* %a, align 4
77+
ret void
78+
}
79+
80+
; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}()
81+
; CHECK-TU0-TXT-NOT: {{.*}}TU1_kernel{{.*}}
82+
; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}()
83+
; CHECK-TU1-TXT: {{.*}}TU1_kernel{{.*}}
84+
85+
; CHECK-TU1: call spir_func void @{{.*}}foo2{{.*}}()
86+
87+
define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 {
88+
entry:
89+
call spir_func void @_Z4foo2v()
90+
ret void
91+
}
92+
93+
; CHECK-TU0-NOT: define dso_local spir_func void @{{.*}}foo2{{.*}}()
94+
; CHECK-TU1: define dso_local spir_func void @{{.*}}foo2{{.*}}()
95+
96+
; Function Attrs: nounwind
97+
define dso_local spir_func void @_Z4foo2v() {
98+
entry:
99+
%a = alloca i32, align 4
100+
; CHECK-TU1: %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @{{.*}}GV{{.*}} to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
101+
%0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
102+
%add = add nsw i32 4, %0
103+
store i32 %add, i32* %a, align 4
104+
ret void
105+
}
106+
107+
attributes #0 = { "sycl-module-id"="TU1.cpp" }
108+
attributes #1 = { "sycl-module-id"="TU2.cpp" }
109+
110+
; Metadata is saved in both modules.
111+
; CHECK: !opencl.spir.version = !{!0, !0}
112+
; CHECK: !spirv.Source = !{!1, !1}
113+
114+
!opencl.spir.version = !{!0, !0}
115+
!spirv.Source = !{!1, !1}
116+
117+
; CHECK: !0 = !{i32 1, i32 2}
118+
; CHECK: !1 = !{i32 4, i32 100000}
119+
120+
!0 = !{i32 1, i32 2}
121+
!1 = !{i32 4, i32 100000}
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
2+
; In the presence of an indirectly callable function, auto mode is equal to no split,
3+
; which means that a separate LLVM IR file for device is not generated and we only
4+
; need to check generated symbol table
5+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK
6+
7+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
8+
target triple = "spir64-unknown-linux-sycldevice"
9+
10+
$_Z3barIiET_S0_ = comdat any
11+
12+
@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4
13+
14+
; CHECK: {{.*}}TU0_kernel0{{.*}}
15+
16+
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 {
17+
entry:
18+
call spir_func void @_Z3foov()
19+
ret void
20+
}
21+
22+
define dso_local spir_func void @_Z3foov() #2 {
23+
entry:
24+
%a = alloca i32, align 4
25+
%call = call spir_func i32 @_Z3barIiET_S0_(i32 1)
26+
%add = add nsw i32 2, %call
27+
store i32 %add, i32* %a, align 4
28+
ret void
29+
}
30+
31+
; Function Attrs: nounwind
32+
define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat {
33+
entry:
34+
%arg.addr = alloca i32, align 4
35+
store i32 %arg, i32* %arg.addr, align 4
36+
%0 = load i32, i32* %arg.addr, align 4
37+
ret i32 %0
38+
}
39+
40+
; CHECK: {{.*}}TU0_kernel1{{.*}}
41+
42+
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 {
43+
entry:
44+
call spir_func void @_Z4foo1v()
45+
ret void
46+
}
47+
48+
; Function Attrs: nounwind
49+
define dso_local spir_func void @_Z4foo1v() {
50+
entry:
51+
%a = alloca i32, align 4
52+
store i32 2, i32* %a, align 4
53+
ret void
54+
}
55+
; CHECK: {{.*}}TU1_kernel{{.*}}
56+
57+
define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 {
58+
entry:
59+
call spir_func void @_Z4foo2v()
60+
ret void
61+
}
62+
63+
; Function Attrs: nounwind
64+
define dso_local spir_func void @_Z4foo2v() {
65+
entry:
66+
%a = alloca i32, align 4
67+
%0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
68+
%add = add nsw i32 4, %0
69+
store i32 %add, i32* %a, align 4
70+
ret void
71+
}
72+
73+
attributes #0 = { "sycl-module-id"="TU1.cpp" }
74+
attributes #1 = { "sycl-module-id"="TU2.cpp" }
75+
attributes #2 = { "referenced-indirectly" }
76+
77+
!opencl.spir.version = !{!0, !0}
78+
!spirv.Source = !{!1, !1}
79+
80+
!0 = !{i32 1, i32 2}
81+
!1 = !{i32 4, i32 100000}
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table
2+
; In the presence of indirect calls, auto mode is equal to no split,
3+
; which means that a separate LLVM IR file for device is not generated and we only
4+
; need to check generated symbol table
5+
; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK
6+
7+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
8+
target triple = "spir64-unknown-linux-sycldevice"
9+
10+
$_Z3barIiET_S0_ = comdat any
11+
12+
@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4
13+
14+
; CHECK: {{.*}}TU0_kernel0{{.*}}
15+
16+
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 {
17+
entry:
18+
call spir_func void @_Z3foov()
19+
ret void
20+
}
21+
22+
define dso_local spir_func void @_Z3foov() {
23+
entry:
24+
%a = alloca i32, align 4
25+
%ptr = bitcast i32* %a to i32 (i32)*
26+
%call = call spir_func i32 %ptr(i32 1)
27+
%add = add nsw i32 2, %call
28+
store i32 %add, i32* %a, align 4
29+
ret void
30+
}
31+
32+
; Function Attrs: nounwind
33+
define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat {
34+
entry:
35+
%arg.addr = alloca i32, align 4
36+
store i32 %arg, i32* %arg.addr, align 4
37+
%0 = load i32, i32* %arg.addr, align 4
38+
ret i32 %0
39+
}
40+
41+
; CHECK: {{.*}}TU0_kernel1{{.*}}
42+
43+
define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 {
44+
entry:
45+
call spir_func void @_Z4foo1v()
46+
ret void
47+
}
48+
49+
; Function Attrs: nounwind
50+
define dso_local spir_func void @_Z4foo1v() {
51+
entry:
52+
%a = alloca i32, align 4
53+
store i32 2, i32* %a, align 4
54+
ret void
55+
}
56+
; CHECK: {{.*}}TU1_kernel{{.*}}
57+
58+
define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 {
59+
entry:
60+
call spir_func void @_Z4foo2v()
61+
ret void
62+
}
63+
64+
; Function Attrs: nounwind
65+
define dso_local spir_func void @_Z4foo2v() {
66+
entry:
67+
%a = alloca i32, align 4
68+
%0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4
69+
%add = add nsw i32 4, %0
70+
store i32 %add, i32* %a, align 4
71+
ret void
72+
}
73+
74+
attributes #0 = { "sycl-module-id"="TU1.cpp" }
75+
attributes #1 = { "sycl-module-id"="TU2.cpp" }
76+
77+
!opencl.spir.version = !{!0, !0}
78+
!spirv.Source = !{!1, !1}
79+
80+
!0 = !{i32 1, i32 2}
81+
!1 = !{i32 4, i32 100000}

llvm/test/tools/sycl-post-link/help.test

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,5 @@ CHECK: =default - set spec constants to C++ defaults
5252
CHECK: --split=<value> - split input module
5353
CHECK: =source - 1 output module per source (translation unit)
5454
CHECK: =kernel - 1 output module per kernel
55+
CHECK: =auto - Choose split mode automatically
5556
CHECK: --symbols - generate exported symbol files

llvm/tools/sycl-post-link/sycl-post-link.cpp

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -295,20 +295,33 @@ static KernelMapEntryScope selectDeviceCodeSplitScopeAutomatically(Module &M) {
295295
// Here we can employ various heuristics to decide which way to split kernels
296296
// is the best in each particular situation.
297297
// At the moment, we assume that per-kernel split is the best way of splitting
298-
// device code and it can be always selected unless there are functions marked
299-
// with [[intel::device_indirectly_callable]] attribute, because it instructs
300-
// us to make this function available to the whole program as it was compiled
301-
// as a single module.
298+
// device code and it can be always selected unless:
299+
// - there are functions marked with [[intel::device_indirectly_callable]]
300+
// attribute, because it instructs us to make this function available to the
301+
// whole program as if it was compiled as a single module.
302+
// - there are indirect calls in the module, which means that we don't know
303+
// how to group functions so both caller and callee of indirect call are in
304+
// the same module.
302305
if (IROutputOnly) {
303306
// We allow enabling auto split mode even in presence of -ir-output-only
304307
// flag, but in this case we are limited by it so we can't do any split at
305308
// all.
306309
return Scope_Global;
307310
}
308311

309-
for (auto &F : M.functions()) {
312+
for (const auto &F : M.functions()) {
310313
if (F.hasFnAttribute("referenced-indirectly"))
311314
return Scope_Global;
315+
if (F.isDeclaration())
316+
continue;
317+
for (const auto &BB: F) {
318+
for (const auto &I : BB) {
319+
if (auto *CI = dyn_cast<CallInst>(&I)) {
320+
if (!CI->getCalledFunction())
321+
return Scope_Global;
322+
}
323+
}
324+
}
312325
}
313326

314327
return Scope_PerModule;

0 commit comments

Comments
 (0)