
Commit 37a7b45

[SYCL][CUDA] Add a CUDA compatibility mode (#12757)
This patch enables CUDA mode at the same time as SYCL mode, which lets SYCL code interact with CUDA code more closely:

- A user can call a CUDA device function from a SYCL device function (follow-up of #7352).
- Overload resolution is fixed; the resolution ranking was ambiguous in some cases.
- Error reporting is fixed; some CUDA-specific delayed diagnostics were not reported (they were filtered out).
- `__CUDA_ARCH__` is defined, so functions can assume NVPTX is the target.

To enable the mode, the user passes -fsycl-cuda-compatibility on the command line (see the usage sketch below); it is off by default and only affects the NVPTX backend. The intent is to ease the transition from CUDA to SYCL: with this mode a SYCL application can reuse CUDA functionality, especially fast paths guarded by `__CUDA_ARCH__`.

---------

Signed-off-by: Victor Lomuller <[email protected]>
Co-authored-by: Tom Honermann <[email protected]>
1 parent: b23d69e

25 files changed: +447 / -47 lines
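Illustrative usage (not part of the commit; the file, kernel, and function names are invented, and the exact SYCL API calls are assumptions): a SYCL application reusing an existing CUDA __device__ helper, compiled with the new flag.

// Hypothetical example, assuming a CUDA-enabled SYCL clang build:
//   clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fsycl-cuda-compatibility sample.cpp
#include <sycl/sycl.hpp>

// Pre-existing CUDA device code, reused without modification.
__device__ float cuda_scale(float x) { return 2.0f * x; }

int main() {
  sycl::queue q;
  float *data = sycl::malloc_shared<float>(1, q);
  data[0] = 21.0f;
  // The SYCL device code (the lambda body) calls the CUDA __device__
  // function; the reverse direction (CUDA calling SYCL) is not allowed.
  q.single_task([=] { data[0] = cuda_scale(data[0]); }).wait();
  sycl::free(data, q);
  return 0;
}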

clang/include/clang/Basic/LangOptions.def

Lines changed: 4 additions & 0 deletions

@@ -318,6 +318,10 @@ LANGOPT(
         "SYCL compiler assumes value fits within MAX_INT for member function of "
         "get/operator[], get_id/operator[] and get_global_id/get_global_linear_id "
         "in SYCL class id, iterm and nd_iterm")
+LANGOPT(SYCLCUDACompat, 1, 0,
+        "Enable CUDA definitions and implicit includes when building for the "
+        "NVPTX backend. This mode can help SYCL program to run using the CUDA "
+        "infrastructure on Nvidia's platforms. ")
 ENUM_LANGOPT(SYCLRangeRounding, SYCLRangeRoundingPreference, 2,
              SYCLRangeRoundingPreference::On,
              "Preference for SYCL parallel_for range rounding")

clang/include/clang/Driver/Options.td

Lines changed: 7 additions & 0 deletions

@@ -7031,6 +7031,13 @@ defm sycl_decompose_functor
       NegFlag<SetFalse, [], [ClangOption, CLOption], "Do not">,
       BothFlags<[], [ClangOption, CLOption, CC1Option],
                 " decompose SYCL functor if possible (experimental, CUDA only)">>;
+defm sycl_cuda_compat
+    : BoolFOption<"sycl-cuda-compatibility", LangOpts<"SYCLCUDACompat">, DefaultFalse,
+      PosFlag<SetTrue, [], [ClangOption, CLOption, CC1Option], "Enable CUDA compatibility mode (experimental). "
+              "Enable the use of CUDA device code with SYCL device code. "
+              "Under this mode, a SYCL device function can call a CUDA device function (but not the other way around). "
+              "This implies the definition of CUDA macros and the inclusion of implicit header files.">,
+      NegFlag<SetFalse, [], [ClangOption, CLOption, CC1Option], "Disable CUDA compatibility mode.">>;
 def flink_huge_device_code : Flag<["-"], "flink-huge-device-code">,
   HelpText<"Generate and use a custom linker script for huge device code "
            "sections">;

clang/include/clang/Sema/SemaBase.h

Lines changed: 1 addition & 0 deletions

@@ -110,6 +110,7 @@ class SemaBase {
     CudaAll = CudaDevice | CudaHost,
     /// SYCL specific diagnostic.
     Sycl = 1 << 4,
+    SyclCudaCompat = Sycl | CudaAll,
     /// ESIMD specific diagnostic.
     Esimd = 1 << 5,
     /// A flag representing 'all'. This can be used to avoid the check

clang/include/clang/Sema/SemaCUDA.h

Lines changed: 3 additions & 0 deletions

@@ -157,6 +157,9 @@ class SemaCUDA : public SemaBase {

   // CUDA function call preference. Must be ordered numerically from
   // worst to best.
+  // Note: in SYCL-CUDA compatibility mode: Native, SameSide and HostDevice
+  // doesn't follow the naming, only the ranking system (e.g. 1st, 2nd or 3rd
+  // choice). See table near IdentifyPreference.
   enum CUDAFunctionPreference {
     CFP_Never,     // Invalid caller/callee combination.
     CFP_WrongSide, // Calls from host-device to host or device

clang/lib/Basic/LangOptions.cpp

Lines changed: 1 addition & 1 deletion

@@ -183,7 +183,7 @@ void LangOptions::setLangDefaults(LangOptions &Opts, Language Lang,
   }

   Opts.HIP = Lang == Language::HIP;
-  Opts.CUDA = Lang == Language::CUDA || Opts.HIP;
+  Opts.CUDA = Lang == Language::CUDA || Opts.HIP || Opts.SYCLCUDACompat;
   if (Opts.HIP) {
     // HIP toolchain does not support 'Fast' FPOpFusion in backends since it
     // fuses multiplication/addition instructions without contract flag from

clang/lib/Basic/Targets/NVPTX.cpp

Lines changed: 5 additions & 3 deletions

@@ -294,11 +294,13 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
     llvm_unreachable("unhandled OffloadArch");
   }();

-  if (Opts.SYCLIsDevice) {
+  if (Opts.SYCLIsDevice)
     Builder.defineMacro("__SYCL_CUDA_ARCH__", CUDAArchCode);
-  } else {
+  // Don't define __CUDA_ARCH__ if in SYCL device mode unless we are in
+  // SYCL-CUDA compatibility mode.
+  // For all other cases, define the macro.
+  if (!Opts.SYCLIsDevice || Opts.SYCLCUDACompat)
     Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
-  }
   if (GPU == OffloadArch::SM_90a)
     Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
   if (GPU == OffloadArch::SM_100a)
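With the change above, __CUDA_ARCH__ is defined during SYCL device compilation for NVPTX only when the compatibility mode is on, so existing CUDA fast paths guarded by that macro become reachable from SYCL code. A minimal sketch (the function is illustrative, not taken from the patch):

#include <cmath>

// Typical CUDA-style guard; __expf is the CUDA fast-math device intrinsic.
__device__ inline float fast_exp(float x) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  return __expf(x);   // NVPTX fast path, now visible to SYCL device code
#else
  return std::exp(x); // portable fallback used when the macro is absent
#endif
}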

clang/lib/CodeGen/CodeGenFunction.cpp

Lines changed: 16 additions & 11 deletions

@@ -1858,16 +1858,6 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
   if (Body && isa_and_nonnull<CoroutineBodyStmt>(Body))
     llvm::append_range(FnArgs, FD->parameters());

-  // Generate a dummy __host__ function for compiling CUDA sources in SYCL.
-  if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
-      getLangOpts().SYCLIsHost && !FD->hasAttr<CUDAHostAttr>() &&
-      FD->hasAttr<CUDADeviceAttr>()) {
-    if (FD->getReturnType()->isVoidType())
-      Builder.CreateRetVoid();
-    else
-      Builder.CreateRet(llvm::UndefValue::get(Fn->getReturnType()));
-    return;
-  }
   // When compiling a CUDA file in SYCL device mode,
   // set weak ODR linkage for possibly duplicated functions.
   if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
@@ -1884,7 +1874,22 @@

   // Generate the body of the function.
   PGO.assignRegionCounters(GD, CurFn);
-  if (isa<CXXDestructorDecl>(FD))
+  if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
+      getLangOpts().SYCLIsHost && !FD->hasAttr<CUDAHostAttr>() &&
+      FD->hasAttr<CUDADeviceAttr>()) {
+    // SYCL host compilation with CUDA compatibility enabled requires
+    // the creation of a host stub function for functions declared with
+    // the __device__ specifier but without the __host__ specifier.
+    // This is caused by the fact that SYCL doesn't use specifier like CUDA and
+    // so may have what can appear to be call from host to device. As we can't
+    // prevent the emission of such call, we need to produce a symbol for
+    // function with the __device__.
+    if (FD->getReturnType()->isVoidType())
+      Builder.CreateRetVoid();
+    else
+      Builder.CreateRet(llvm::UndefValue::get(Fn->getReturnType()));
+    Builder.ClearInsertionPoint();
+  } else if (isa<CXXDestructorDecl>(FD))
     EmitDestructorBody(Args);
   else if (isa<CXXConstructorDecl>(FD))
     EmitConstructorBody(Args);
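The case this host-stub emission handles can be pictured as follows (an illustrative sketch, not from the patch): SYCL code carries no __host__/__device__ annotations, so during SYCL host compilation the kernel body below is still emitted and references the __device__-only symbol, which therefore needs a stub definition rather than a real host implementation.

#include <sycl/sycl.hpp>

// __device__-only: no __host__ counterpart is declared.
__device__ int device_only(int x) { return x + 1; }

void submit(sycl::queue &q, int *out) {
  // The lambda is plain SYCL code (implicitly __host__ in CUDA terms).
  // On the SYCL host pass the call is still emitted, so the compiler
  // materializes a dummy host body for device_only (ret void / undef)
  // instead of leaving an undefined symbol; the real body is emitted
  // only during device compilation.
  q.single_task([=] { *out = device_only(41); });
}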

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 32 additions & 1 deletion

@@ -75,6 +75,11 @@ using namespace clang::driver::tools;
 using namespace clang;
 using namespace llvm::opt;

+static bool isSYCLCudaCompatEnabled(const ArgList &Args) {
+  return Args.hasFlag(options::OPT_fsycl_cuda_compat,
+                      options::OPT_fno_sycl_cuda_compat, false);
+}
+
 static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) {
   if (Arg *A = Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC,
                                options::OPT_fminimize_whitespace,
@@ -1176,7 +1181,8 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,

   if (JA.isOffloading(Action::OFK_SYCL)) {
     getToolChain().addSYCLIncludeArgs(Args, CmdArgs);
-    if (Inputs[0].getType() == types::TY_CUDA) {
+    if (Inputs[0].getType() == types::TY_CUDA ||
+        isSYCLCudaCompatEnabled(Args)) {
       // Include __clang_cuda_runtime_wrapper.h in .cu SYCL compilation.
       getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
     }
@@ -5463,6 +5469,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   bool IsFPGASYCLOffloadDevice =
       IsSYCLDevice && Triple.getSubArch() == llvm::Triple::SPIRSubArch_fpga;
   const bool IsSYCLNativeCPU = isSYCLNativeCPU(TC);
+  const bool IsSYCLCUDACompat = isSYCLCudaCompatEnabled(Args);

   // Perform the SYCL host compilation using an external compiler if the user
   // requested.
@@ -5832,6 +5839,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-fno-sycl-esimd-build-host-code");
   }

+  if (IsSYCLCUDACompat) {
+    Args.addOptInFlag(CmdArgs, options::OPT_fsycl_cuda_compat,
+                      options::OPT_fno_sycl_cuda_compat);
+    // FIXME: clang's CUDA headers require this ...
+    // remove when clang/lib/Headers/__clang_cuda_builtin_vars.h no longer
+    // requires it.
+    CmdArgs.push_back("-fdeclspec");
+    // Note: assumes CUDA 9.0 or more (required by SYCL for CUDA)
+    CmdArgs.push_back("-fcuda-allow-variadic-functions");
+  }
+
   // Set options for both host and device
   if (SYCLStdArg) {
     SYCLStdArg->render(Args, CmdArgs);
@@ -5898,6 +5916,19 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   bool HasFPGA = false;
   for (auto TI = SYCLTCRange.first, TE = SYCLTCRange.second; TI != TE; ++TI) {
     llvm::Triple SYCLTriple = TI->second->getTriple();
+    if (SYCLTriple.isNVPTX() && IsSYCLCUDACompat && !IsSYCLDevice) {
+      CmdArgs.push_back("-aux-triple");
+      CmdArgs.push_back(Args.MakeArgString(SYCLTriple.normalize()));
+      // We need to figure out which CUDA version we're compiling for, as that
+      // determines how we load and launch GPU kernels.
+      auto *CTC = static_cast<const toolchains::CudaToolChain *>(TI->second);
+      assert(CTC && "Expected valid CUDA Toolchain.");
+      if (CTC->CudaInstallation.version() != CudaVersion::UNKNOWN)
+        CmdArgs.push_back(Args.MakeArgString(
+            Twine("-target-sdk-version=") +
+            CudaVersionToString(CTC->CudaInstallation.version())));
+      break;
+    }
     if (SYCLTriple.getSubArch() == llvm::Triple::SPIRSubArch_fpga) {
       HasFPGA = true;
       if (!IsSYCLDevice) {

clang/lib/Frontend/CompilerInvocation.cpp

Lines changed: 3 additions & 0 deletions

@@ -4198,6 +4198,9 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
   Opts.IncludeDefaultHeader = Args.hasArg(OPT_finclude_default_header);
   Opts.DeclareOpenCLBuiltins = Args.hasArg(OPT_fdeclare_opencl_builtins);

+  Opts.SYCLCUDACompat =
+      Args.hasArg(OPT_fsycl_cuda_compat, OPT_fno_sycl_cuda_compat, false);
+
   LangOptions::setLangDefaults(Opts, IK.getLanguage(), T, Includes, LangStd);

   // The key paths of codegen options defined in Options.td start with

clang/lib/Frontend/InitPreprocessor.cpp

Lines changed: 7 additions & 2 deletions

@@ -1511,10 +1511,15 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   }

   // CUDA device path compilaton
-  if (LangOpts.CUDAIsDevice && !LangOpts.HIP && !LangOpts.isSYCL()) {
+  // Enabled if CUDA device compilation mode is on unless HIP is
+  // active or SYCL is active without CUDA compatibility enabled.
+  bool EnableCUDADevicePath = LangOpts.CUDAIsDevice && !LangOpts.HIP &&
+                              (!LangOpts.isSYCL() || LangOpts.SYCLCUDACompat);
+  if (EnableCUDADevicePath) {
     // The CUDA_ARCH value is set for the GPU target specified in the NVPTX
     // backend's target defines.
-    // Note: SYCL targeting nvptx-cuda relies on __SYCL_CUDA_ARCH__ instead.
+    // Note: SYCL targeting nvptx-cuda without SYCL-CUDA compatibility relies on
+    // __SYCL_CUDA_ARCH__ only instead.
     Builder.defineMacro("__CUDA_ARCH__");
   }

clang/lib/Sema/Sema.cpp

Lines changed: 11 additions & 1 deletion

@@ -2093,9 +2093,19 @@ Sema::targetDiag(SourceLocation Loc, unsigned DiagID, const FunctionDecl *FD) {
     return LangOpts.OpenMPIsTargetDevice
                ? OpenMP().diagIfOpenMPDeviceCode(Loc, DiagID, FD)
                : OpenMP().diagIfOpenMPHostCode(Loc, DiagID, FD);
-  if (getLangOpts().CUDA)
+
+  // If SYCLCUDACompat is active, use the SYCL logic instead of CUDA when
+  // compiling the device side but the CUDA logic when compiling the host side.
+  // When compiling the device side, we need this as CUDA looks for the presence
+  // of __device__, __host__ etc. attributes to emit or defer diagnostics. These
+  // aren't always there as SYCL doesn't use such attribute.
+  if (getLangOpts().CUDA && !getLangOpts().SYCLCUDACompat)
     return getLangOpts().CUDAIsDevice ? CUDA().DiagIfDeviceCode(Loc, DiagID)
                                       : CUDA().DiagIfHostCode(Loc, DiagID);
+  // On the host side, __device__ acts as a guard like __SYCL_DEVICE_ONLY__
+  // macro, so use the CUDA logic here.
+  if (getLangOpts().SYCLIsHost && getLangOpts().SYCLCUDACompat)
+    return CUDA().DiagIfHostCode(Loc, DiagID);

   if (getLangOpts().SYCLIsDevice)
     return SYCL().DiagIfDeviceCode(Loc, DiagID);

clang/lib/Sema/SemaCUDA.cpp

Lines changed: 54 additions & 22 deletions

@@ -222,11 +222,20 @@ SemaCUDA::CUDAVariableTarget SemaCUDA::IdentifyTarget(const VarDecl *Var) {
 // | hd | hd | HD | HD | (b) |
 //
 // In combined SYCL - CUDA mode
-// Sh - SYCL is host
-// Sd - SYCL is device
+// Sh - SYCL is host (SYCLIsDevice == false and SYCLIsHost == true)
+// Sd - SYCL is device (SYCLIsDevice == true and SYCLIsHost == false)
 //
 // Priority order: N, SS, HD, WS, --
 //
+// Note: we deviate from the actual meaning for
+// N, SS, HD, WS, --.
+// Wrong side (WS) and -- (Never) are still used to raise error (delayed and
+// immediate respectively). Native (N), SameSide (SS) and HostDevice (HD) are
+// used to rank preference as 1st, 2nd or 3rd choice (N > SS > HD) to determine
+// the best viable function.
+//
+// Extra (x) specifies an alternative handling location from the one in H.
+//
 // |    |    |   host   |  cuda-dev  |  sycl-dev |     |
 // | F  | T  | Ph - Sh  |  Pd - Sh   |  Ph - Sd  |  H  |
 // |----+----+----------+------------+-----------+-----+
@@ -238,14 +247,14 @@ SemaCUDA::CUDAVariableTarget SemaCUDA::IdentifyTarget(const VarDecl *Var) {
 // | g  | g  |    --    |     --     |    --     | (a) |
 // | g  | h  |    --    |     --     |    --     | (e) |
 // | g  | hd |    HD    |     HD     |    HD     | (c) |
-// | h  | d  |  HD(y)   |   WS(v)    |   N(x)    | ( ) |
+// | h  | d  |  HD(y1)  |   WS(z)    |  N (x1)   | ( ) |
 // | h  | g  |    N     |     N      |    N      | (c) |
-// | h  | h  |    N     |     N      |   SS(p)   | ( ) |
-// | h  | hd |    HD    |     HD     |    HD     | ( ) |
-// | hd | d  |  HD(y)   |     SS     |   N(x)    | ( ) |
-// | hd | g  |    SS    |     --     |   --(z)   |(d/a)|
-// | hd | h  |    SS    |     WS     |    SS     | (d) |
-// | hd | hd |    HD    |     HD     |    HD     | (b) |
+// | h  | h  |    N     |     N      |  SS(x2)   | (c) |
+// | h  | hd |  SS(y5)  |     HD     |    HD     | (b) |
+// | hd | d  |  HD(y3)  |     SS     |  N (x1)   | (d) |
+// | hd | g  |  N (y2)  |     --     |  --(x3)   |(d/a)|
+// | hd | h  |  N (y2)  |     WS     |  HD(x4)   | (d) |
+// | hd | hd |  SS(y4)  |     HD     |  SS(x5)   | (b) |

 SemaCUDA::CUDAFunctionPreference
 SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
@@ -266,7 +275,7 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
   // Pd - Sh -> CUDA device compilation for SYCL+CUDA
   if (getLangOpts().SYCLIsHost && getLangOpts().CUDA &&
       getLangOpts().CUDAIsDevice) {
-    // (v) allows a __host__ function to call a __device__ one. This is allowed
+    // (z) allows a __host__ function to call a __device__ one. This is allowed
     // for sycl-device compilation, since a regular function (implicitly
     // __host__) called by a SYCL kernel could end up calling a __device__ one.
     // In any case, __host__ functions are not emitted by the cuda-dev
@@ -280,36 +289,59 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
   if (getLangOpts().SYCLIsDevice && getLangOpts().CUDA &&
       !getLangOpts().CUDAIsDevice) {
     // (x), and (p) prefer __device__ function in SYCL-device compilation.
-    // (x) allows to pick a __device__ function.
+    // (x1) allows to pick a __device__ function.
     if ((CallerTarget == CUDAFunctionTarget::Host ||
          CallerTarget == CUDAFunctionTarget::HostDevice) &&
         CalleeTarget == CUDAFunctionTarget::Device)
       return CFP_Native;
-    // (p) lowers the preference of __host__ functions for favoring __device__
+    // (x2) lowers the preference of __host__ functions for favoring __device__
     // ones.
     if (CallerTarget == CUDAFunctionTarget::Host &&
         CalleeTarget == CUDAFunctionTarget::Host)
       return CFP_SameSide;

-    // (z)
+    // (x3)
     if (CallerTarget == CUDAFunctionTarget::HostDevice &&
         CalleeTarget == CUDAFunctionTarget::Global)
       return CFP_Never;
+    // (x4)
+    if (CallerTarget == CUDAFunctionTarget::HostDevice &&
+        CalleeTarget == CUDAFunctionTarget::Host)
+      return CFP_HostDevice;
+    // (x5)
+    if (CallerTarget == CUDAFunctionTarget::HostDevice &&
+        CalleeTarget == CUDAFunctionTarget::HostDevice)
+      return CFP_SameSide;
   }

-  // Ph - Sh -> host compilation for SYCL+CUDA
+  // (y) Ph - Sh -> host compilation for SYCL+CUDA
   if (getLangOpts().SYCLIsHost && getLangOpts().CUDA &&
       !getLangOpts().CUDAIsDevice) {
-    // (y) allows __host__ and __host__ __device__ functions to call a
-    // __device__ one. This could happen, if a __device__ function is defined
-    // without having a corresponding __host__. In this case, a dummy __host__
-    // function is generated. This dummy function is required since the lambda
-    // that forms the SYCL kernel (having host device attr.) needs to be
-    // compiled also for the host. (CallerTarget == CUDAFunctionTarget::Host) is added in case a
-    // regular function (implicitly __host__) is called by a SYCL kernel lambda.
-    if ((CallerTarget == CUDAFunctionTarget::Host || CallerTarget == CUDAFunctionTarget::HostDevice) &&
+    // In host mode, allows __host__ and __host__ __device__ functions
+    // to call a __device__ one, but we shouldn't emit the call as __device__
+    // functions are replaced with a trap. __host__ -> __device__ is normally
+    // CFP_Never, but we need to make it a defer diagnostic.
+    // (y1) h -> d
+    if (CallerTarget == CUDAFunctionTarget::Host &&
         CalleeTarget == CUDAFunctionTarget::Device)
       return CFP_HostDevice;
+    // (y2) hd -> h or hd ->g
+    if (CallerTarget == CUDAFunctionTarget::HostDevice &&
+        (CalleeTarget == CUDAFunctionTarget::Host ||
+         CalleeTarget == CUDAFunctionTarget::Global))
+      return CFP_Native;
+    // (y3) hd -> d
+    if (CallerTarget == CUDAFunctionTarget::HostDevice &&
+        CalleeTarget == CUDAFunctionTarget::Device)
+      return CFP_HostDevice;
+    // (y4) hd -> hd
+    if (CallerTarget == CUDAFunctionTarget::HostDevice &&
+        CalleeTarget == CUDAFunctionTarget::HostDevice)
+      return CFP_SameSide;
+    // (y5) h -> hd
+    if (CallerTarget == CUDAFunctionTarget::Host &&
+        CalleeTarget == CUDAFunctionTarget::HostDevice)
+      return CFP_SameSide;
   }

   // If one of the targets is invalid, the check always fails, no matter what
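In overload-resolution terms, the sycl-dev column means that a plain (implicitly __host__) SYCL device function now prefers a __device__ overload (N, 1st choice) over a __host__ one (SS, 2nd choice). A hedged illustration using clang's CUDA target-based overloading; the names are invented:

// Target-based overload pair: same signature, different CUDA targets.
__device__ int pick(int x) { return 2 * x; } // h -> d ranks as N (1st choice)
int pick(int x) { return x; }                // implicitly __host__; h -> h ranks as SS (2nd)

// An undecorated function is implicitly __host__. Compiled as SYCL device
// code with CUDA compatibility enabled, the call below resolves to the
// __device__ overload because N outranks SS in the table above.
int caller(int x) { return pick(x); }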

clang/lib/Sema/SemaDecl.cpp

Lines changed: 9 additions & 3 deletions

@@ -20450,9 +20450,13 @@ Sema::DeviceDiagnosticReason Sema::getEmissionReason(const FunctionDecl *FD) {
   if (FD->hasAttr<SYCLSimdAttr>())
     return Sema::DeviceDiagnosticReason::Esimd;
   if (FD->hasAttr<SYCLDeviceAttr>() || FD->hasAttr<SYCLKernelAttr>())
-    return Sema::DeviceDiagnosticReason::Sycl;
+    return getLangOpts().SYCLCUDACompat
+               ? Sema::DeviceDiagnosticReason::SyclCudaCompat
+               : Sema::DeviceDiagnosticReason::Sycl;
   // FIXME: Refine the logic for CUDA and OpenMP.
-  if (getLangOpts().CUDA)
+  // In SYCL-CUDA compat mode, don't return CudaDevice or CudaHost but return
+  // All just like in normal SYCL.
+  if (getLangOpts().CUDA && !getLangOpts().SYCLCUDACompat)
     return getLangOpts().CUDAIsDevice ? Sema::DeviceDiagnosticReason::CudaDevice
                                       : Sema::DeviceDiagnosticReason::CudaHost;
   if (getLangOpts().OpenMP)
@@ -20534,7 +20538,9 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(const FunctionDecl *FD,
       (T == CUDAFunctionTarget::Device || T == CUDAFunctionTarget::Global))
     return FunctionEmissionStatus::CUDADiscarded;

-  if (IsEmittedForExternalSymbol())
+  // Defer to SYCLIsDevice if in cuda compat mode
+  if ((LangOpts.CUDAIsDevice || !LangOpts.SYCLCUDACompat) &&
+      IsEmittedForExternalSymbol())
     return FunctionEmissionStatus::Emitted;
 }

clang/lib/Sema/SemaSYCL.cpp

Lines changed: 6 additions & 0 deletions

@@ -397,6 +397,12 @@ bool SemaSYCL::isDeclAllowedInSYCLDeviceCode(const Decl *D) {
        FD->getBuiltinID() == Builtin::BI__builtin_printf))
     return true;

+  // Allow to use `::printf` only for CUDA.
+  if (getLangOpts().SYCLCUDACompat) {
+    if (FD->getBuiltinID() == Builtin::BIprintf)
+      return true;
+  }
+
   const DeclContext *DC = FD->getDeclContext();
   if (II && II->isStr("__spirv_ocl_printf") &&
       !FD->isDefined() &&
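The SemaSYCL change above allows the plain C `::printf` builtin in SYCL device code when the compatibility mode is active (it maps to CUDA's device-side printf on NVPTX). A small sketch, illustrative rather than taken from the patch:

#include <cstdio>
#include <sycl/sycl.hpp>

void hello(sycl::queue &q) {
  q.single_task([=] {
    // Accepted for the NVPTX target under the CUDA compatibility mode;
    // without it, ::printf is rejected in SYCL device code.
    printf("hello from the device\n");
  });
}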
