
Commit 37a7b45

[SYCL][CUDA] Add a CUDA compatibility mode (#12757)
This patch enables CUDA mode at the same time as SYCL mode, which lets SYCL code interact with CUDA code more closely:

- A user can call a CUDA device function from a SYCL device function (follow-up of #7352).
- Overload resolution is fixed; the resolution ranking was ambiguous in some cases.
- Error reporting is fixed; some CUDA-specific delayed diagnostics were not reported (they were filtered out).
- `__CUDA_ARCH__` is defined, so functions can assume NVPTX is the target.

To enable the mode, the user passes -fsycl-cuda-compatibility on the command line (see the usage sketch below); it is off by default and only affects the NVPTX backend. The intent is to ease the transition from CUDA to SYCL: with this mode a SYCL application can reuse CUDA functionality, especially fast paths guarded by `__CUDA_ARCH__`.

---------

Signed-off-by: Victor Lomuller <[email protected]>
Co-authored-by: Tom Honermann <[email protected]>
1 parent: b23d69e

25 files changed: +447 / -47 lines
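Illustrative usage (not part of the commit; the file, kernel, and function names are invented, and the exact SYCL API calls are assumptions): a SYCL application reusing an existing CUDA __device__ helper, compiled with the new flag.

// Hypothetical example, assuming a CUDA-enabled SYCL clang build:
//   clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fsycl-cuda-compatibility sample.cpp
#include <sycl/sycl.hpp>

// Pre-existing CUDA device code, reused without modification.
__device__ float cuda_scale(float x) { return 2.0f * x; }

int main() {
  sycl::queue q;
  float *data = sycl::malloc_shared<float>(1, q);
  data[0] = 21.0f;
  // The SYCL device code (the lambda body) calls the CUDA __device__
  // function; the reverse direction (CUDA calling SYCL) is not allowed.
  q.single_task([=] { data[0] = cuda_scale(data[0]); }).wait();
  sycl::free(data, q);
  return 0;
}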

clang/include/clang/Basic/LangOptions.def

Lines changed: 4 additions & 0 deletions

@@ -318,6 +318,10 @@ LANGOPT(
         "SYCL compiler assumes value fits within MAX_INT for member function of "
         "get/operator[], get_id/operator[] and get_global_id/get_global_linear_id "
         "in SYCL class id, iterm and nd_iterm")
+LANGOPT(SYCLCUDACompat, 1, 0,
+        "Enable CUDA definitions and implicit includes when building for the "
+        "NVPTX backend. This mode can help SYCL program to run using the CUDA "
+        "infrastructure on Nvidia's platforms. ")
 ENUM_LANGOPT(SYCLRangeRounding, SYCLRangeRoundingPreference, 2,
              SYCLRangeRoundingPreference::On,
              "Preference for SYCL parallel_for range rounding")

clang/include/clang/Driver/Options.td

Lines changed: 7 additions & 0 deletions

@@ -7031,6 +7031,13 @@ defm sycl_decompose_functor
       NegFlag<SetFalse, [], [ClangOption, CLOption], "Do not">,
       BothFlags<[], [ClangOption, CLOption, CC1Option],
                 " decompose SYCL functor if possible (experimental, CUDA only)">>;
+defm sycl_cuda_compat
+    : BoolFOption<"sycl-cuda-compatibility", LangOpts<"SYCLCUDACompat">, DefaultFalse,
+      PosFlag<SetTrue, [], [ClangOption, CLOption, CC1Option], "Enable CUDA compatibility mode (experimental). "
+              "Enable the use of CUDA device code with SYCL device code. "
+              "Under this mode, a SYCL device function can call a CUDA device function (but not the other way around). "
+              "This implies the definition of CUDA macros and the inclusion of implicit header files.">,
+      NegFlag<SetFalse, [], [ClangOption, CLOption, CC1Option], "Disable CUDA compatibility mode.">>;
 def flink_huge_device_code : Flag<["-"], "flink-huge-device-code">,
   HelpText<"Generate and use a custom linker script for huge device code "
            "sections">;

clang/include/clang/Sema/SemaBase.h

Lines changed: 1 addition & 0 deletions

@@ -110,6 +110,7 @@ class SemaBase {
     CudaAll = CudaDevice | CudaHost,
     /// SYCL specific diagnostic.
     Sycl = 1 << 4,
+    SyclCudaCompat = Sycl | CudaAll,
     /// ESIMD specific diagnostic.
     Esimd = 1 << 5,
     /// A flag representing 'all'. This can be used to avoid the check

clang/include/clang/Sema/SemaCUDA.h

Lines changed: 3 additions & 0 deletions

@@ -157,6 +157,9 @@ class SemaCUDA : public SemaBase {

   // CUDA function call preference. Must be ordered numerically from
   // worst to best.
+  // Note: in SYCL-CUDA compatibility mode: Native, SameSide and HostDevice
+  // doesn't follow the naming, only the ranking system (e.g. 1st, 2nd or 3rd
+  // choice). See table near IdentifyPreference.
   enum CUDAFunctionPreference {
     CFP_Never,     // Invalid caller/callee combination.
     CFP_WrongSide, // Calls from host-device to host or device

clang/lib/Basic/LangOptions.cpp

Lines changed: 1 addition & 1 deletion

@@ -183,7 +183,7 @@ void LangOptions::setLangDefaults(LangOptions &Opts, Language Lang,
   }

   Opts.HIP = Lang == Language::HIP;
-  Opts.CUDA = Lang == Language::CUDA || Opts.HIP;
+  Opts.CUDA = Lang == Language::CUDA || Opts.HIP || Opts.SYCLCUDACompat;
   if (Opts.HIP) {
     // HIP toolchain does not support 'Fast' FPOpFusion in backends since it
     // fuses multiplication/addition instructions without contract flag from

clang/lib/Basic/Targets/NVPTX.cpp

Lines changed: 5 additions & 3 deletions

@@ -294,11 +294,13 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
     llvm_unreachable("unhandled OffloadArch");
   }();

-  if (Opts.SYCLIsDevice) {
+  if (Opts.SYCLIsDevice)
     Builder.defineMacro("__SYCL_CUDA_ARCH__", CUDAArchCode);
-  } else {
+  // Don't define __CUDA_ARCH__ if in SYCL device mode unless we are in
+  // SYCL-CUDA compatibility mode.
+  // For all other cases, define the macro.
+  if (!Opts.SYCLIsDevice || Opts.SYCLCUDACompat)
     Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
-  }
   if (GPU == OffloadArch::SM_90a)
     Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
   if (GPU == OffloadArch::SM_100a)
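With the change above, __CUDA_ARCH__ is defined during SYCL device compilation for NVPTX only when the compatibility mode is on, so existing CUDA fast paths guarded by that macro become reachable from SYCL code. A minimal sketch (the function is illustrative, not taken from the patch):

#include <cmath>

// Typical CUDA-style guard; __expf is the CUDA fast-math device intrinsic.
__device__ inline float fast_exp(float x) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  return __expf(x);   // NVPTX fast path, now visible to SYCL device code
#else
  return std::exp(x); // portable fallback used when the macro is absent
#endif
}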

clang/lib/CodeGen/CodeGenFunction.cpp

Lines changed: 16 additions & 11 deletions

@@ -1858,16 +1858,6 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
   if (Body && isa_and_nonnull<CoroutineBodyStmt>(Body))
     llvm::append_range(FnArgs, FD->parameters());

-  // Generate a dummy __host__ function for compiling CUDA sources in SYCL.
-  if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
-      getLangOpts().SYCLIsHost && !FD->hasAttr<CUDAHostAttr>() &&
-      FD->hasAttr<CUDADeviceAttr>()) {
-    if (FD->getReturnType()->isVoidType())
-      Builder.CreateRetVoid();
-    else
-      Builder.CreateRet(llvm::UndefValue::get(Fn->getReturnType()));
-    return;
-  }
   // When compiling a CUDA file in SYCL device mode,
   // set weak ODR linkage for possibly duplicated functions.
   if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
@@ -1884,7 +1874,22 @@

   // Generate the body of the function.
   PGO.assignRegionCounters(GD, CurFn);
-  if (isa<CXXDestructorDecl>(FD))
+  if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
+      getLangOpts().SYCLIsHost && !FD->hasAttr<CUDAHostAttr>() &&
+      FD->hasAttr<CUDADeviceAttr>()) {
+    // SYCL host compilation with CUDA compatibility enabled requires
+    // the creation of a host stub function for functions declared with
+    // the __device__ specifier but without the __host__ specifier.
+    // This is caused by the fact that SYCL doesn't use specifier like CUDA and
+    // so may have what can appear to be call from host to device. As we can't
+    // prevent the emission of such call, we need to produce a symbol for
+    // function with the __device__.
+    if (FD->getReturnType()->isVoidType())
+      Builder.CreateRetVoid();
+    else
+      Builder.CreateRet(llvm::UndefValue::get(Fn->getReturnType()));
+    Builder.ClearInsertionPoint();
+  } else if (isa<CXXDestructorDecl>(FD))
     EmitDestructorBody(Args);
   else if (isa<CXXConstructorDecl>(FD))
     EmitConstructorBody(Args);
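The case this host-stub emission handles can be pictured as follows (an illustrative sketch, not from the patch): SYCL code carries no __host__/__device__ annotations, so during SYCL host compilation the kernel body below is still emitted and references the __device__-only symbol, which therefore needs a stub definition rather than a real host implementation.

#include <sycl/sycl.hpp>

// __device__-only: no __host__ counterpart is declared.
__device__ int device_only(int x) { return x + 1; }

void submit(sycl::queue &q, int *out) {
  // The lambda is plain SYCL code (implicitly __host__ in CUDA terms).
  // On the SYCL host pass the call is still emitted, so the compiler
  // materializes a dummy host body for device_only (ret void / undef)
  // instead of leaving an undefined symbol; the real body is emitted
  // only during device compilation.
  q.single_task([=] { *out = device_only(41); });
}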

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 32 additions & 1 deletion

@@ -75,6 +75,11 @@ using namespace clang::driver::tools;
 using namespace clang;
 using namespace llvm::opt;

+static bool isSYCLCudaCompatEnabled(const ArgList &Args) {
+  return Args.hasFlag(options::OPT_fsycl_cuda_compat,
+                      options::OPT_fno_sycl_cuda_compat, false);
+}
+
 static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) {
   if (Arg *A = Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC,
                                options::OPT_fminimize_whitespace,
@@ -1176,7 +1181,8 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,

   if (JA.isOffloading(Action::OFK_SYCL)) {
     getToolChain().addSYCLIncludeArgs(Args, CmdArgs);
-    if (Inputs[0].getType() == types::TY_CUDA) {
+    if (Inputs[0].getType() == types::TY_CUDA ||
+        isSYCLCudaCompatEnabled(Args)) {
       // Include __clang_cuda_runtime_wrapper.h in .cu SYCL compilation.
       getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
     }
@@ -5463,6 +5469,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   bool IsFPGASYCLOffloadDevice =
       IsSYCLDevice && Triple.getSubArch() == llvm::Triple::SPIRSubArch_fpga;
   const bool IsSYCLNativeCPU = isSYCLNativeCPU(TC);
+  const bool IsSYCLCUDACompat = isSYCLCudaCompatEnabled(Args);

   // Perform the SYCL host compilation using an external compiler if the user
   // requested.
@@ -5832,6 +5839,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-fno-sycl-esimd-build-host-code");
   }

+  if (IsSYCLCUDACompat) {
+    Args.addOptInFlag(CmdArgs, options::OPT_fsycl_cuda_compat,
+                      options::OPT_fno_sycl_cuda_compat);
+    // FIXME: clang's CUDA headers require this ...
+    // remove when clang/lib/Headers/__clang_cuda_builtin_vars.h no longer
+    // requires it.
+    CmdArgs.push_back("-fdeclspec");
+    // Note: assumes CUDA 9.0 or more (required by SYCL for CUDA)
+    CmdArgs.push_back("-fcuda-allow-variadic-functions");
+  }
+
   // Set options for both host and device
   if (SYCLStdArg) {
     SYCLStdArg->render(Args, CmdArgs);
@@ -5898,6 +5916,19 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   bool HasFPGA = false;
   for (auto TI = SYCLTCRange.first, TE = SYCLTCRange.second; TI != TE; ++TI) {
     llvm::Triple SYCLTriple = TI->second->getTriple();
+    if (SYCLTriple.isNVPTX() && IsSYCLCUDACompat && !IsSYCLDevice) {
+      CmdArgs.push_back("-aux-triple");
+      CmdArgs.push_back(Args.MakeArgString(SYCLTriple.normalize()));
+      // We need to figure out which CUDA version we're compiling for, as that
+      // determines how we load and launch GPU kernels.
+      auto *CTC = static_cast<const toolchains::CudaToolChain *>(TI->second);
+      assert(CTC && "Expected valid CUDA Toolchain.");
+      if (CTC->CudaInstallation.version() != CudaVersion::UNKNOWN)
+        CmdArgs.push_back(Args.MakeArgString(
+            Twine("-target-sdk-version=") +
+            CudaVersionToString(CTC->CudaInstallation.version())));
+      break;
+    }
     if (SYCLTriple.getSubArch() == llvm::Triple::SPIRSubArch_fpga) {
       HasFPGA = true;
       if (!IsSYCLDevice) {

clang/lib/Frontend/CompilerInvocation.cpp

Lines changed: 3 additions & 0 deletions

@@ -4198,6 +4198,9 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
   Opts.IncludeDefaultHeader = Args.hasArg(OPT_finclude_default_header);
   Opts.DeclareOpenCLBuiltins = Args.hasArg(OPT_fdeclare_opencl_builtins);

+  Opts.SYCLCUDACompat =
+      Args.hasArg(OPT_fsycl_cuda_compat, OPT_fno_sycl_cuda_compat, false);
+
   LangOptions::setLangDefaults(Opts, IK.getLanguage(), T, Includes, LangStd);

   // The key paths of codegen options defined in Options.td start with

clang/lib/Frontend/InitPreprocessor.cpp

Lines changed: 7 additions & 2 deletions

@@ -1511,10 +1511,15 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   }

   // CUDA device path compilaton
-  if (LangOpts.CUDAIsDevice && !LangOpts.HIP && !LangOpts.isSYCL()) {
+  // Enabled if CUDA device compilation mode is on unless HIP is
+  // active or SYCL is active without CUDA compatibility enabled.
+  bool EnableCUDADevicePath = LangOpts.CUDAIsDevice && !LangOpts.HIP &&
+                              (!LangOpts.isSYCL() || LangOpts.SYCLCUDACompat);
+  if (EnableCUDADevicePath) {
     // The CUDA_ARCH value is set for the GPU target specified in the NVPTX
     // backend's target defines.
-    // Note: SYCL targeting nvptx-cuda relies on __SYCL_CUDA_ARCH__ instead.
+    // Note: SYCL targeting nvptx-cuda without SYCL-CUDA compatibility relies on
+    // __SYCL_CUDA_ARCH__ only instead.
     Builder.defineMacro("__CUDA_ARCH__");
   }

clang/lib/Sema/Sema.cpp

Lines changed: 11 additions & 1 deletion

@@ -2093,9 +2093,19 @@ Sema::targetDiag(SourceLocation Loc, unsigned DiagID, const FunctionDecl *FD) {
     return LangOpts.OpenMPIsTargetDevice
                ? OpenMP().diagIfOpenMPDeviceCode(Loc, DiagID, FD)
                : OpenMP().diagIfOpenMPHostCode(Loc, DiagID, FD);
-  if (getLangOpts().CUDA)
+
+  // If SYCLCUDACompat is active, use the SYCL logic instead of CUDA when
+  // compiling the device side but the CUDA logic when compiling the host side.
+  // When compiling the device side, we need this as CUDA looks for the presence
+  // of __device__, __host__ etc. attributes to emit or defer diagnostics. These
+  // aren't always there as SYCL doesn't use such attribute.
+  if (getLangOpts().CUDA && !getLangOpts().SYCLCUDACompat)
     return getLangOpts().CUDAIsDevice ? CUDA().DiagIfDeviceCode(Loc, DiagID)
                                       : CUDA().DiagIfHostCode(Loc, DiagID);
+  // On the host side, __device__ acts as a guard like __SYCL_DEVICE_ONLY__
+  // macro, so use the CUDA logic here.
+  if (getLangOpts().SYCLIsHost && getLangOpts().SYCLCUDACompat)
+    return CUDA().DiagIfHostCode(Loc, DiagID);

   if (getLangOpts().SYCLIsDevice)
     return SYCL().DiagIfDeviceCode(Loc, DiagID);

clang/lib/Sema/SemaCUDA.cpp

Lines changed: 54 additions & 22 deletions

@@ -222,11 +222,20 @@ SemaCUDA::CUDAVariableTarget SemaCUDA::IdentifyTarget(const VarDecl *Var) {
 // | hd | hd | HD | HD | (b) |
 //
 // In combined SYCL - CUDA mode
-// Sh - SYCL is host
-// Sd - SYCL is device
+// Sh - SYCL is host (SYCLIsDevice == false and SYCLIsHost == true)
+// Sd - SYCL is device (SYCLIsDevice == true and SYCLIsHost == false)
 //
 // Priority order: N, SS, HD, WS, --
 //
+// Note: we deviate from the actual meaning for
+// N, SS, HD, WS, --.
+// Wrong side (WS) and -- (Never) are still used to raise error (delayed and
+// immediate respectively). Native (N), SameSide (SS) and HostDevice (HD) are
+// used to rank preference as 1st, 2nd or 3rd choice (N > SS > HD) to determine
+// the best viable function.
+//
+// Extra (x) specifies an alternative handling location from the one in H.
+//
 // |    |    |   host   |  cuda-dev  |  sycl-dev |     |
 // | F  | T  | Ph - Sh  |  Pd - Sh   |  Ph - Sd  |  H  |
 // |----+----+----------+------------+-----------+-----+
@@ -238,14 +247,14 @@ SemaCUDA::CUDAVariableTarget SemaCUDA::IdentifyTarget(const VarDecl *Var) {
 // | g  | g  |    --    |     --     |    --     | (a) |
 // | g  | h  |    --    |     --     |    --     | (e) |
 // | g  | hd |    HD    |     HD     |    HD     | (c) |
-// | h  | d  |  HD(y)   |   WS(v)    |   N(x)    | ( ) |
+// | h  | d  |  HD(y1)  |   WS(z)    |  N (x1)   | ( ) |
 // | h  | g  |    N     |     N      |    N      | (c) |
-// | h  | h  |    N     |     N      |   SS(p)   | ( ) |
-// | h  | hd |    HD    |     HD     |    HD     | ( ) |
-// | hd | d  |  HD(y)   |     SS     |   N(x)    | ( ) |
-// | hd | g  |    SS    |     --     |   --(z)   |(d/a)|
-// | hd | h  |    SS    |     WS     |    SS     | (d) |
-// | hd | hd |    HD    |     HD     |    HD     | (b) |
+// | h  | h  |    N     |     N      |  SS(x2)   | (c) |
+// | h  | hd |  SS(y5)  |     HD     |    HD     | (b) |
+// | hd | d  |  HD(y3)  |     SS     |  N (x1)   | (d) |
+// | hd | g  |  N (y2)  |     --     |  --(x3)   |(d/a)|
+// | hd | h  |  N (y2)  |     WS     |  HD(x4)   | (d) |
+// | hd | hd |  SS(y4)  |     HD     |  SS(x5)   | (b) |

 SemaCUDA::CUDAFunctionPreference
 SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
@@ -266,7 +275,7 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
   // Pd - Sh -> CUDA device compilation for SYCL+CUDA
   if (getLangOpts().SYCLIsHost && getLangOpts().CUDA &&
       getLangOpts().CUDAIsDevice) {
-    // (v) allows a __host__ function to call a __device__ one. This is allowed
+    // (z) allows a __host__ function to call a __device__ one. This is allowed
     // for sycl-device compilation, since a regular function (implicitly
     // __host__) called by a SYCL kernel could end up calling a __device__ one.
     // In any case, __host__ functions are not emitted by the cuda-dev
@@ -280,36 +289,59 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
   if (getLangOpts().SYCLIsDevice && getLangOpts().CUDA &&
       !getLangOpts().CUDAIsDevice) {
     // (x), and (p) prefer __device__ function in SYCL-device compilation.
-    // (x) allows to pick a __device__ function.
+    // (x1) allows to pick a __device__ function.
     if ((CallerTarget == CUDAFunctionTarget::Host ||
          CallerTarget == CUDAFunctionTarget::HostDevice) &&
         CalleeTarget == CUDAFunctionTarget::Device)
       return CFP_Native;
-    // (p) lowers the preference of __host__ functions for favoring __device__
+    // (x2) lowers the preference of __host__ functions for favoring __device__
     // ones.
     if (CallerTarget == CUDAFunctionTarget::Host &&
         CalleeTarget == CUDAFunctionTarget::Host)
       return CFP_SameSide;

-    // (z)
+    // (x3)
     if (CallerTarget == CUDAFunctionTarget::HostDevice &&
         CalleeTarget == CUDAFunctionTarget::Global)
       return CFP_Never;
+    // (x4)
+    if (CallerTarget == CUDAFunctionTarget::HostDevice &&
+        CalleeTarget == CUDAFunctionTarget::Host)
+      return CFP_HostDevice;
+    // (x5)
+    if (CallerTarget == CUDAFunctionTarget::HostDevice &&
+        CalleeTarget == CUDAFunctionTarget::HostDevice)
+      return CFP_SameSide;
   }

-  // Ph - Sh -> host compilation for SYCL+CUDA
+  // (y) Ph - Sh -> host compilation for SYCL+CUDA
   if (getLangOpts().SYCLIsHost && getLangOpts().CUDA &&
       !getLangOpts().CUDAIsDevice) {
-    // (y) allows __host__ and __host__ __device__ functions to call a
-    // __device__ one. This could happen, if a __device__ function is defined
-    // without having a corresponding __host__. In this case, a dummy __host__
-    // function is generated. This dummy function is required since the lambda
-    // that forms the SYCL kernel (having host device attr.) needs to be
-    // compiled also for the host. (CallerTarget == CUDAFunctionTarget::Host) is added in case a
-    // regular function (implicitly __host__) is called by a SYCL kernel lambda.
-    if ((CallerTarget == CUDAFunctionTarget::Host || CallerTarget == CUDAFunctionTarget::HostDevice) &&
+    // In host mode, allows __host__ and __host__ __device__ functions
+    // to call a __device__ one, but we shouldn't emit the call as __device__
+    // functions are replaced with a trap. __host__ -> __device__ is normally
+    // CFP_Never, but we need to make it a defer diagnostic.
+    // (y1) h -> d
+    if (CallerTarget == CUDAFunctionTarget::Host &&
         CalleeTarget == CUDAFunctionTarget::Device)
       return CFP_HostDevice;
+    // (y2) hd -> h or hd ->g
+    if (CallerTarget == CUDAFunctionTarget::HostDevice &&
+        (CalleeTarget == CUDAFunctionTarget::Host ||
+         CalleeTarget == CUDAFunctionTarget::Global))
+      return CFP_Native;
+    // (y3) hd -> d
+    if (CallerTarget == CUDAFunctionTarget::HostDevice &&
+        CalleeTarget == CUDAFunctionTarget::Device)
+      return CFP_HostDevice;
+    // (y4) hd -> hd
+    if (CallerTarget == CUDAFunctionTarget::HostDevice &&
+        CalleeTarget == CUDAFunctionTarget::HostDevice)
+      return CFP_SameSide;
+    // (y5) h -> hd
+    if (CallerTarget == CUDAFunctionTarget::Host &&
+        CalleeTarget == CUDAFunctionTarget::HostDevice)
+      return CFP_SameSide;
   }

   // If one of the targets is invalid, the check always fails, no matter what
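In overload-resolution terms, the sycl-dev column means that a plain (implicitly __host__) SYCL device function now prefers a __device__ overload (N, 1st choice) over a __host__ one (SS, 2nd choice). A hedged illustration using clang's CUDA target-based overloading; the names are invented:

// Target-based overload pair: same signature, different CUDA targets.
__device__ int pick(int x) { return 2 * x; } // h -> d ranks as N (1st choice)
int pick(int x) { return x; }                // implicitly __host__; h -> h ranks as SS (2nd)

// An undecorated function is implicitly __host__. Compiled as SYCL device
// code with CUDA compatibility enabled, the call below resolves to the
// __device__ overload because N outranks SS in the table above.
int caller(int x) { return pick(x); }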

clang/lib/Sema/SemaDecl.cpp

Lines changed: 9 additions & 3 deletions

@@ -20450,9 +20450,13 @@ Sema::DeviceDiagnosticReason Sema::getEmissionReason(const FunctionDecl *FD) {
   if (FD->hasAttr<SYCLSimdAttr>())
     return Sema::DeviceDiagnosticReason::Esimd;
   if (FD->hasAttr<SYCLDeviceAttr>() || FD->hasAttr<SYCLKernelAttr>())
-    return Sema::DeviceDiagnosticReason::Sycl;
+    return getLangOpts().SYCLCUDACompat
+               ? Sema::DeviceDiagnosticReason::SyclCudaCompat
+               : Sema::DeviceDiagnosticReason::Sycl;
   // FIXME: Refine the logic for CUDA and OpenMP.
-  if (getLangOpts().CUDA)
+  // In SYCL-CUDA compat mode, don't return CudaDevice or CudaHost but return
+  // All just like in normal SYCL.
+  if (getLangOpts().CUDA && !getLangOpts().SYCLCUDACompat)
     return getLangOpts().CUDAIsDevice ? Sema::DeviceDiagnosticReason::CudaDevice
                                       : Sema::DeviceDiagnosticReason::CudaHost;
   if (getLangOpts().OpenMP)
@@ -20534,7 +20538,9 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(const FunctionDecl *FD,
       (T == CUDAFunctionTarget::Device || T == CUDAFunctionTarget::Global))
     return FunctionEmissionStatus::CUDADiscarded;

-  if (IsEmittedForExternalSymbol())
+  // Defer to SYCLIsDevice if in cuda compat mode
+  if ((LangOpts.CUDAIsDevice || !LangOpts.SYCLCUDACompat) &&
+      IsEmittedForExternalSymbol())
     return FunctionEmissionStatus::Emitted;
 }

clang/lib/Sema/SemaSYCL.cpp

Lines changed: 6 additions & 0 deletions

@@ -397,6 +397,12 @@ bool SemaSYCL::isDeclAllowedInSYCLDeviceCode(const Decl *D) {
        FD->getBuiltinID() == Builtin::BI__builtin_printf))
     return true;

+  // Allow to use `::printf` only for CUDA.
+  if (getLangOpts().SYCLCUDACompat) {
+    if (FD->getBuiltinID() == Builtin::BIprintf)
+      return true;
+  }
+
   const DeclContext *DC = FD->getDeclContext();
   if (II && II->isStr("__spirv_ocl_printf") &&
       !FD->isDefined() &&
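The SemaSYCL change above allows the plain C `::printf` builtin in SYCL device code when the compatibility mode is active (it maps to CUDA's device-side printf on NVPTX). A small sketch, illustrative rather than taken from the patch:

#include <cstdio>
#include <sycl/sycl.hpp>

void hello(sycl::queue &q) {
  q.single_task([=] {
    // Accepted for the NVPTX target under the CUDA compatibility mode;
    // without it, ::printf is rejected in SYCL device code.
    printf("hello from the device\n");
  });
}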
