Skip to content

Commit 31608c2

Browse files
[SYCL][CUDA] Support CUDA and SYCL in the same TU (#7352)
This patch allows to compile a `.cu` files containing CUDA and SYCL codes in the same translation unit. Test in intel/llvm-test-suite#1377 --------- Co-authored-by: Steffen Larsen <[email protected]>
1 parent cbdb06a commit 31608c2

19 files changed

+541
-174
lines changed

clang/lib/CodeGen/CodeGenFunction.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1673,13 +1673,18 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
16731673
if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
16741674
getLangOpts().SYCLIsHost && !FD->hasAttr<CUDAHostAttr>() &&
16751675
FD->hasAttr<CUDADeviceAttr>()) {
1676-
Fn->setLinkage(llvm::Function::WeakODRLinkage);
16771676
if (FD->getReturnType()->isVoidType())
16781677
Builder.CreateRetVoid();
16791678
else
16801679
Builder.CreateRet(llvm::UndefValue::get(Fn->getReturnType()));
16811680
return;
16821681
}
1682+
// When compiling a CUDA file in SYCL device mode,
1683+
// set weak ODR linkage for possibly duplicated functions.
1684+
if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
1685+
getLangOpts().SYCLIsDevice &&
1686+
(FD->hasAttr<CUDADeviceAttr>() || FD->hasAttr<CUDAHostAttr>()))
1687+
Fn->setLinkage(llvm::Function::WeakODRLinkage);
16831688

16841689
// Generate the body of the function.
16851690
PGO.assignRegionCounters(GD, CurFn);

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 88 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,15 @@ static CGCXXABI *createCXXABI(CodeGenModule &CGM) {
100100
llvm_unreachable("invalid C++ ABI kind");
101101
}
102102

103+
static bool SYCLCUDAIsHost(const clang::LangOptions &LangOpts) {
104+
// Return true for the host compilation of SYCL CUDA sources.
105+
return LangOpts.SYCLIsHost && LangOpts.CUDA && !LangOpts.CUDAIsDevice;
106+
}
107+
static bool SYCLCUDAIsSYCLDevice(const clang::LangOptions &LangOpts) {
108+
// Return true for the SYCL device compilation of SYCL CUDA sources.
109+
return LangOpts.SYCLIsDevice && LangOpts.CUDA && !LangOpts.CUDAIsDevice;
110+
}
111+
103112
CodeGenModule::CodeGenModule(ASTContext &C,
104113
IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS,
105114
const HeaderSearchOptions &HSO,
@@ -2913,13 +2922,23 @@ void CodeGenModule::EmitDeferred() {
29132922
for (GlobalDecl &D : CurDeclsToEmit) {
29142923
// Emit a dummy __host__ function if a legit one is not already present in
29152924
// case of SYCL compilation of CUDA sources.
2916-
if (LangOpts.CUDA && !LangOpts.CUDAIsDevice && LangOpts.SYCLIsHost) {
2925+
if (SYCLCUDAIsHost(LangOpts)) {
29172926
GlobalDecl OtherD;
29182927
if (lookupRepresentativeDecl(getMangledName(D), OtherD) &&
29192928
(D.getCanonicalDecl().getDecl() !=
2920-
OtherD.getCanonicalDecl().getDecl())) {
2929+
OtherD.getCanonicalDecl().getDecl()) &&
2930+
D.getCanonicalDecl().getDecl()->hasAttr<CUDADeviceAttr>())
2931+
continue;
2932+
}
2933+
// Emit a dummy __host__ function if a legit one is not already present in
2934+
// case of SYCL compilation of CUDA sources.
2935+
if (SYCLCUDAIsSYCLDevice(LangOpts)) {
2936+
GlobalDecl OtherD;
2937+
if (lookupRepresentativeDecl(getMangledName(D), OtherD) &&
2938+
(D.getCanonicalDecl().getDecl() !=
2939+
OtherD.getCanonicalDecl().getDecl()) &&
2940+
D.getCanonicalDecl().getDecl()->hasAttr<CUDAHostAttr>())
29212941
continue;
2922-
}
29232942
}
29242943
const ValueDecl *VD = cast<ValueDecl>(D.getDecl());
29252944
// If emitting for SYCL device, emit the deferred alias
@@ -3571,16 +3590,10 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
35713590
// their device-side incarnations.
35723591

35733592
// So device-only functions are the only things we skip, except for SYCL.
3574-
if (isa<FunctionDecl>(Global) && !Global->hasAttr<CUDAHostAttr>() &&
3575-
Global->hasAttr<CUDADeviceAttr>()) {
3576-
// In SYCL, every (CUDA) __device__ function needs to have a __host__
3577-
// counterpart that will be emitted in case of it is not already
3578-
// present.
3579-
if (LangOpts.SYCLIsHost && MustBeEmitted(Global) &&
3580-
MayBeEmittedEagerly(Global))
3581-
addDeferredDeclToEmit(GD);
3593+
if (!LangOpts.isSYCL() && isa<FunctionDecl>(Global) &&
3594+
!Global->hasAttr<CUDAHostAttr>() && Global->hasAttr<CUDADeviceAttr>())
35823595
return;
3583-
}
3596+
35843597
assert((isa<FunctionDecl>(Global) || isa<VarDecl>(Global)) &&
35853598
"Expected Variable or Function");
35863599
}
@@ -3605,8 +3618,13 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
36053618
if (const auto *FD = dyn_cast<FunctionDecl>(Global)) {
36063619
// Forward declarations are emitted lazily on first use.
36073620
if (!FD->doesThisDeclarationHaveABody()) {
3608-
if (!FD->doesDeclarationForceExternallyVisibleDefinition())
3609-
return;
3621+
if (!FD->doesDeclarationForceExternallyVisibleDefinition()) {
3622+
// Force the declaration in SYCL compilation of CUDA sources.
3623+
if (!((SYCLCUDAIsHost(LangOpts) && Global->hasAttr<CUDAHostAttr>()) ||
3624+
(SYCLCUDAIsSYCLDevice(LangOpts) &&
3625+
Global->hasAttr<CUDADeviceAttr>())))
3626+
return;
3627+
}
36103628

36113629
StringRef MangledName = getMangledName(GD);
36123630

@@ -3665,6 +3683,20 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
36653683
// function. If the global must always be emitted, do it eagerly if possible
36663684
// to benefit from cache locality.
36673685
if (MustBeEmitted(Global) && MayBeEmittedEagerly(Global)) {
3686+
// Avoid emitting the same __host__ __device__ functions,
3687+
// in SYCL-CUDA-host compilation, and
3688+
if (SYCLCUDAIsHost(LangOpts) && isa<FunctionDecl>(Global) &&
3689+
!Global->hasAttr<CUDAHostAttr>() && Global->hasAttr<CUDADeviceAttr>()) {
3690+
addDeferredDeclToEmit(GD);
3691+
return;
3692+
}
3693+
// in SYCL-CUDA-device compilation.
3694+
if (SYCLCUDAIsSYCLDevice(LangOpts) && isa<FunctionDecl>(Global) &&
3695+
Global->hasAttr<CUDAHostAttr>() && !Global->hasAttr<CUDADeviceAttr>()) {
3696+
addDeferredDeclToEmit(GD);
3697+
return;
3698+
}
3699+
36683700
// Emit the definition if it can't be deferred.
36693701
EmitGlobalDefinition(GD);
36703702
return;
@@ -3688,6 +3720,39 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
36883720
addDeferredDeclToEmit(GD);
36893721
EmittedDeferredDecls[MangledName] = GD;
36903722
} else {
3723+
3724+
// For SYCL compilation of CUDA sources,
3725+
if (LangOpts.isSYCL() && LangOpts.CUDA && !LangOpts.CUDAIsDevice) {
3726+
// in case of SYCL-CUDA-host,
3727+
if (LangOpts.SYCLIsHost) {
3728+
if (Global->hasAttr<CUDAHostAttr>()) {
3729+
// remove already present __device__ function.
3730+
auto DDI = DeferredDecls.find(MangledName);
3731+
if (DDI != DeferredDecls.end())
3732+
DeferredDecls.erase(DDI);
3733+
} else if (Global->hasAttr<CUDADeviceAttr>()) {
3734+
// do not insert a __device__ function if a __host__ one is present.
3735+
auto DDI = DeferredDecls.find(MangledName);
3736+
if (DDI != DeferredDecls.end())
3737+
return;
3738+
}
3739+
}
3740+
// in case of SYCL-CUDA-device,
3741+
if (LangOpts.SYCLIsDevice) {
3742+
if (Global->hasAttr<CUDADeviceAttr>()) {
3743+
// remove already present __host__ function.
3744+
auto DDI = DeferredDecls.find(MangledName);
3745+
if (DDI != DeferredDecls.end())
3746+
DeferredDecls.erase(DDI);
3747+
} else if (Global->hasAttr<CUDAHostAttr>()) {
3748+
// do not insert a __host__ function if a __device__ one is present.
3749+
auto DDI = DeferredDecls.find(MangledName);
3750+
if (DDI != DeferredDecls.end())
3751+
return;
3752+
}
3753+
}
3754+
}
3755+
36913756
// Otherwise, remember that we saw a deferred decl with this name. The
36923757
// first use of the mangled name will cause it to move into
36933758
// DeferredDeclsToEmit.
@@ -4399,8 +4464,16 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction(
43994464
// This is the first use or definition of a mangled name. If there is a
44004465
// deferred decl with this name, remember that we need to emit it at the end
44014466
// of the file.
4467+
// In SYCL compilation of CUDA sources, avoid the emission if the
4468+
// __device__/__host__ attributes do not match.
44024469
auto DDI = DeferredDecls.find(MangledName);
4403-
if (DDI != DeferredDecls.end()) {
4470+
if (DDI != DeferredDecls.end() &&
4471+
(!(getLangOpts().isSYCL() && getLangOpts().CUDA &&
4472+
!getLangOpts().CUDAIsDevice) ||
4473+
((DDI->second).getDecl()->hasAttr<CUDAHostAttr>() ==
4474+
D->hasAttr<CUDAHostAttr>() &&
4475+
(DDI->second).getDecl()->hasAttr<CUDADeviceAttr>() ==
4476+
D->hasAttr<CUDADeviceAttr>()))) {
44044477
// Move the potentially referenced deferred decl to the
44054478
// DeferredDeclsToEmit list, and remove it from DeferredDecls (since we
44064479
// don't need it anymore).

clang/lib/Driver/Driver.cpp

Lines changed: 41 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4954,10 +4954,6 @@ class OffloadingActionBuilder final {
49544954
if (auto *IA = dyn_cast<InputAction>(HostAction)) {
49554955
SYCLDeviceActions.clear();
49564956

4957-
// Skip CUDA Actions
4958-
if (IA->getType() == types::TY_CUDA)
4959-
return ABRT_Inactive;
4960-
49614957
// Options that are considered LinkerInput are not valid input actions
49624958
// to the device tool chain.
49634959
if (IA->getInputArg().getOption().hasFlag(options::LinkerInput))
@@ -5050,19 +5046,33 @@ class OffloadingActionBuilder final {
50505046

50515047
OffloadAction::DeviceDependences Dep;
50525048
Dep.add(*A, *TargetInfo.TC, TargetInfo.BoundArch, Action::OFK_SYCL);
5053-
AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
5049+
if (ExternalCudaAction) {
5050+
assert(
5051+
SYCLTargetInfoList.size() == 1 &&
5052+
"Number of SYCL actions and toolchains/boundarch pairs do not "
5053+
"match.");
5054+
5055+
// Link with external CUDA action.
5056+
ActionList LinkObjects;
5057+
LinkObjects.push_back(
5058+
C.MakeAction<OffloadAction>(Dep, A->getType()));
5059+
LinkObjects.push_back(ExternalCudaAction);
5060+
Action *DeviceLinkAction =
5061+
C.MakeAction<LinkJobAction>(LinkObjects, types::TY_LLVM_BC);
5062+
5063+
OffloadAction::DeviceDependences DDep;
5064+
DDep.add(*DeviceLinkAction, *TargetInfo.TC, TargetInfo.BoundArch,
5065+
Action::OFK_SYCL);
5066+
AL.push_back(C.MakeAction<OffloadAction>(DDep, A->getType()));
5067+
5068+
ExternalCudaAction = nullptr;
5069+
} else {
5070+
AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
5071+
}
50545072
}
50555073
// We no longer need the action stored in this builder.
50565074
SYCLDeviceActions.clear();
50575075
}
5058-
5059-
if (ExternalCudaAction) {
5060-
assert(SYCLTargetInfoList.size() == 1 &&
5061-
"Number of SYCL actions and toolchains/boundarch pairs do not "
5062-
"match.");
5063-
AL.push_back(ExternalCudaAction);
5064-
ExternalCudaAction = nullptr;
5065-
}
50665076
}
50675077

50685078
// Return whether to use native bfloat16 library.
@@ -6298,6 +6308,14 @@ class OffloadingActionBuilder final {
62986308
if (DDeps.getActions().empty())
62996309
return HostAction;
63006310

6311+
// Add host-cuda-sycl offload kind for the SYCL compilation of .cu files
6312+
if (OffloadKind == (Action::OFK_Cuda | Action::OFK_SYCL)) {
6313+
OffloadAction::HostDependence HDep(
6314+
*HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
6315+
/*BoundArch=*/nullptr, Action::OFK_SYCL | Action::OFK_Cuda);
6316+
return C.MakeAction<OffloadAction>(HDep, DDeps);
6317+
}
6318+
63016319
// We have dependences we need to bundle together. We use an offload action
63026320
// for that.
63036321
OffloadAction::HostDependence HDep(
@@ -6859,17 +6877,15 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
68596877
}
68606878
for (auto &I : Inputs) {
68616879
std::string SrcFileName(I.second->getAsString(Args));
6862-
if (I.first != types::TY_CUDA) {
6863-
if ((I.first == types::TY_PP_C || I.first == types::TY_PP_CXX ||
6864-
types::isSrcFile(I.first))) {
6865-
// Unique ID is generated for source files and preprocessed files.
6866-
SmallString<128> ResultID;
6867-
llvm::sys::fs::createUniquePath("%%%%%%%%%%%%%%%%", ResultID, false);
6868-
addSYCLUniqueID(Args.MakeArgString(ResultID.str()), SrcFileName);
6869-
}
6870-
if (!types::isSrcFile(I.first))
6871-
continue;
6880+
if ((I.first == types::TY_PP_C || I.first == types::TY_PP_CXX ||
6881+
types::isSrcFile(I.first))) {
6882+
// Unique ID is generated for source files and preprocessed files.
6883+
SmallString<128> ResultID;
6884+
llvm::sys::fs::createUniquePath("%%%%%%%%%%%%%%%%", ResultID, false);
6885+
addSYCLUniqueID(Args.MakeArgString(ResultID.str()), SrcFileName);
68726886
}
6887+
if (!types::isSrcFile(I.first))
6888+
continue;
68736889

68746890
std::string TmpFileNameHeader;
68756891
std::string TmpFileNameFooter;
@@ -6971,11 +6987,9 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
69716987
// When performing -fsycl based compilations and generating dependency
69726988
// information, perform a specific dependency generation compilation which
69736989
// is not based on the source + footer compilation.
6974-
// TODO: Support SYCL offloading with CUDA files
69756990
if (Phase == phases::Preprocess && Args.hasArg(options::OPT_fsycl) &&
69766991
Args.hasArg(options::OPT_M_Group) &&
6977-
!Args.hasArg(options::OPT_fno_sycl_use_footer) &&
6978-
I.first != types::TY_CUDA) {
6992+
!Args.hasArg(options::OPT_fno_sycl_use_footer)) {
69796993
Action *PreprocessAction =
69806994
C.MakeAction<PreprocessJobAction>(Current, types::TY_Dependencies);
69816995
PreprocessAction->propagateHostOffloadInfo(Action::OFK_SYCL,
@@ -7512,16 +7526,14 @@ Action *Driver::ConstructPhaseAction(
75127526
"Cannot preprocess this input type!");
75137527
}
75147528
types::ID HostPPType = types::getPreprocessedType(Input->getType());
7515-
// TODO: Support SYCL offloading with CUDA files
75167529
if (Args.hasArg(options::OPT_fsycl) && HostPPType != types::TY_INVALID &&
75177530
!Args.hasArg(options::OPT_fno_sycl_use_footer) &&
75187531
TargetDeviceOffloadKind == Action::OFK_None &&
7519-
Input->getType() != types::TY_CUDA &&
75207532
Input->getType() != types::TY_CUDA_DEVICE) {
75217533
// Performing a host compilation with -fsycl. Append the integration
75227534
// footer to the source file.
75237535
auto *AppendFooter =
7524-
C.MakeAction<AppendFooterJobAction>(Input, types::TY_CXX);
7536+
C.MakeAction<AppendFooterJobAction>(Input, Input->getType());
75257537
// FIXME: There are 2 issues with dependency generation in regards to
75267538
// the integration footer that need to be addressed.
75277539
// 1) Input file referenced on the RHS of a dependency is based on the

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1280,8 +1280,13 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
12801280
if (JA.isOffloading(Action::OFK_HIP))
12811281
getToolChain().AddHIPIncludeArgs(Args, CmdArgs);
12821282

1283-
if (JA.isOffloading(Action::OFK_SYCL))
1283+
if (JA.isOffloading(Action::OFK_SYCL)) {
12841284
toolchains::SYCLToolChain::AddSYCLIncludeArgs(D, Args, CmdArgs);
1285+
if (Inputs[0].getType() == types::TY_CUDA) {
1286+
// Include __clang_cuda_runtime_wrapper.h in .cu SYCL compilation.
1287+
getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
1288+
}
1289+
}
12851290

12861291
// If we are offloading to a target via OpenMP we need to include the
12871292
// openmp_wrappers folder which contains alternative system headers.

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -755,6 +755,11 @@ void CudaToolChain::addClangTargetOptions(
755755
if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
756756
options::OPT_fno_cuda_approx_transcendentals, false))
757757
CC1Args.push_back("-fcuda-approx-transcendentals");
758+
759+
if (DriverArgs.hasArg(options::OPT_fsycl)) {
760+
// Add these flags for .cu SYCL compilation.
761+
CC1Args.append({"-std=c++17", "-fsycl-is-host"});
762+
}
758763
}
759764

760765
if (DeviceOffloadingKind == Action::OFK_SYCL) {

clang/lib/Sema/Sema.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1968,13 +1968,14 @@ Sema::targetDiag(SourceLocation Loc, unsigned DiagID, FunctionDecl *FD) {
19681968
if (LangOpts.OpenMP)
19691969
return LangOpts.OpenMPIsDevice ? diagIfOpenMPDeviceCode(Loc, DiagID, FD)
19701970
: diagIfOpenMPHostCode(Loc, DiagID, FD);
1971-
if (getLangOpts().CUDA)
1972-
return getLangOpts().CUDAIsDevice ? CUDADiagIfDeviceCode(Loc, DiagID)
1973-
: CUDADiagIfHostCode(Loc, DiagID);
19741971

19751972
if (getLangOpts().SYCLIsDevice)
19761973
return SYCLDiagIfDeviceCode(Loc, DiagID);
19771974

1975+
if (getLangOpts().CUDA)
1976+
return getLangOpts().CUDAIsDevice ? CUDADiagIfDeviceCode(Loc, DiagID)
1977+
: CUDADiagIfHostCode(Loc, DiagID);
1978+
19781979
return SemaDiagnosticBuilder(SemaDiagnosticBuilder::K_Immediate, Loc, DiagID,
19791980
FD, *this, DeviceDiagnosticReason::All);
19801981
}

0 commit comments

Comments
 (0)