Skip to content

Commit f7d02db

Browse files
committed
[SYCL] Enable device code split by default
This patch introduces a new device code split mode, `auto`, which is intended to automatically select the best device code split mode and apply it. At the moment, `auto` is equivalent to `per_source` in most cases and to `off` when function pointers are present.
1 parent d44aa3f commit f7d02db

File tree

3 files changed

+43
-12
lines changed

3 files changed

+43
-12
lines changed

clang/include/clang/Driver/Options.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1987,11 +1987,11 @@ def fsycl_link_targets_EQ : CommaJoined<["-"], "fsycl-link-targets=">, Flags<[No
19871987
HelpText<"Specify comma-separated list of triples SYCL offloading targets to produce linked device images">;
19881988
def fsycl_device_code_split_EQ : Joined<["-"], "fsycl-device-code-split=">,
19891989
Flags<[CC1Option, CoreOption]>, HelpText<"Perform SYCL device code split: per_kernel (device code module is "
1990-
"created for each SYCL kernel) | per_source (device code module is created for each source (translation unit)) | off (no device code split). "
1991-
"Default is 'off' - all kernels go into a single module`">, Values<"per_source, per_kernel, off">;
1990+
"created for each SYCL kernel) | per_source (device code module is created for each source (translation unit)) | off (no device code split) | auto (use heuristic to select the best way of splitting device code). "
1991+
"Default is 'auto' - automatically select how to split device code into modules">, Values<"per_source, per_kernel, off, auto">;
19921992
def fsycl_device_code_split : Flag<["-"], "fsycl-device-code-split">, Alias<fsycl_device_code_split_EQ>,
1993-
AliasArgs<["per_source"]>, Flags<[CC1Option, CoreOption]>,
1994-
HelpText<"Perform SYCL device code split in the per_source mode i.e. create a device code module for each source (translation unit)">;
1993+
AliasArgs<["auto"]>, Flags<[CC1Option, CoreOption]>,
1994+
HelpText<"Perform SYCL device code split in the 'auto' mode i.e. use heuristic to distribute device code across modules">;
19951995
def fsycl_id_queries_fit_in_int : Flag<["-"], "fsycl-id-queries-fit-in-int">,
19961996
Flags<[CC1Option, CoreOption]>, HelpText<"Assume that SYCL ID queries fit "
19971997
"within MAX_INT.">;

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8142,12 +8142,17 @@ void SYCLPostLink::ConstructJob(Compilation &C, const JobAction &JA,
81428142
addArgs(CmdArgs, TCArgs, {"-split=kernel"});
81438143
else if (StringRef(A->getValue()) == "per_source")
81448144
addArgs(CmdArgs, TCArgs, {"-split=source"});
8145+
else if (StringRef(A->getValue()) == "auto")
8146+
addArgs(CmdArgs, TCArgs, {"-split=auto"});
81458147
else
81468148
// split must be off
81478149
assert(StringRef(A->getValue()) == "off");
8150+
} else {
8151+
// auto is the default split mode
8152+
addArgs(CmdArgs, TCArgs, {"-split=auto"});
81488153
}
81498154
// OPT_fsycl_device_code_split is not checked as it is an alias to
8150-
// -fsycl-device-code-split=per_source
8155+
// -fsycl-device-code-split=auto
81518156

81528157
// Turn on Dead Parameter Elimination Optimization with early optimizations
81538158
if (!getToolChain().getTriple().isNVPTX() &&

llvm/tools/sycl-post-link/sycl-post-link.cpp

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -90,16 +90,18 @@ static cl::opt<bool> OutputAssembly{"S",
9090

9191
enum IRSplitMode {
9292
SPLIT_PER_TU, // one module per translation unit
93-
SPLIT_PER_KERNEL // one module per kernel
93+
SPLIT_PER_KERNEL, // one module per kernel
94+
SPLIT_AUTO // automatically select split mode
9495
};
9596

9697
static cl::opt<IRSplitMode> SplitMode(
9798
"split", cl::desc("split input module"), cl::Optional,
9899
cl::init(SPLIT_PER_TU),
99-
cl::values(clEnumValN(SPLIT_PER_TU, "source",
100-
"1 output module per source (translation unit)"),
101-
clEnumValN(SPLIT_PER_KERNEL, "kernel",
102-
"1 output module per kernel")),
100+
cl::values(
101+
clEnumValN(SPLIT_PER_TU, "source",
102+
"1 output module per source (translation unit)"),
103+
clEnumValN(SPLIT_PER_KERNEL, "kernel", "1 output module per kernel"),
104+
clEnumValN(SPLIT_AUTO, "auto", "Choose split mode automatically")),
103105
cl::cat(PostLinkCat));
104106

105107
static cl::opt<bool> DoSymGen{"symbols",
@@ -289,6 +291,25 @@ enum KernelMapEntryScope {
289291
Scope_Global // single entry in the map for all kernels
290292
};
291293

294+
static KernelMapEntryScope selectDeviceCodeSplitModeAutomatically(Module &M) {
295+
// Chooses the kernel map scope for the `auto` split mode. Various heuristics
296+
// can be employed here to pick the best way to split kernels in each case.
297+
// At the moment, we assume that per-source split (Scope_PerModule) is the best
298+
// way of splitting device code and it is always selected unless there are
299+
// functions marked with the [[intel::device_indirectly_callable]] attribute,
300+
// because it instructs us to make such a function available to the whole
301+
// program as if it were compiled as a single module (Scope_Global is returned).
302+
bool HasDeviceIndirectlyCallable = false;
303+
for (auto &F : M.functions()) {
304+
if (F.hasFnAttribute("referenced-indirectly"))
305+
HasDeviceIndirectlyCallable = true;
306+
}
307+
308+
if (HasDeviceIndirectlyCallable)
309+
return Scope_Global;
310+
return Scope_PerModule;
311+
}
312+
292313
// This function decides how kernels of the input module M will be distributed
293314
// ("split") into multiple modules based on the command options and IR
294315
// attributes. The decision is recorded in the output map parameter
@@ -656,8 +677,13 @@ int main(int argc, char **argv) {
656677

657678
if (DoSplit || DoSymGen) {
658679
KernelMapEntryScope Scope = Scope_Global;
659-
if (DoSplit)
660-
Scope = SplitMode == SPLIT_PER_KERNEL ? Scope_PerKernel : Scope_PerModule;
680+
if (DoSplit) {
681+
if (SplitMode == SPLIT_AUTO)
682+
Scope = selectDeviceCodeSplitModeAutomatically(*MPtr);
683+
else
684+
Scope =
685+
SplitMode == SPLIT_PER_KERNEL ? Scope_PerKernel : Scope_PerModule;
686+
}
661687
collectKernelModuleMap(*MPtr, GlobalsSet, Scope);
662688
}
663689

0 commit comments

Comments
 (0)