Commit 4d680f5

[HIP][Clang][Sema] Add Sema support for hipstdpar
This patch adds the Sema changes needed for enabling HIP parallel algorithm offload on AMDGPU targets. The change impacts the CUDA / HIP language-specific checks, and only manifests when compiling in `hipstdpar` mode. In that case, we essentially do three things:

1. Allow device-side callers to call host-side callees. Since the user-visible HLL would be standard C++, with no annotation / restriction mechanisms, we cannot unambiguously establish that such a call is an error, so we conservatively allow all such calls, deferring actual cleanup to a subsequent pass over the IR.
2. Allow host-formed lambdas to capture by reference.
3. Allow device functions to use host global variables.

Reviewed by: yaxunl

Differential Revision: https://reviews.llvm.org/D155833
1 parent 5ec9faf commit 4d680f5
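For context, the user-visible input in `hipstdpar` mode is plain standard C++. The sketch below is illustrative only (it is not part of this patch, and all names are made up); it shows the kind of code an offloaded algorithm has to digest, combining all three relaxations: an unannotated callee, a by-reference capture, and a host global reached from device code.

#include <algorithm>
#include <execution>
#include <vector>

int scale = 2;                                // host global variable

int times_scale(int x) { return x * scale; }  // unannotated, implicitly host

void scale_all(std::vector<int> &v) {
  int bias = 1;
  // Under hipstdpar this parallel algorithm may be offloaded to the GPU: the
  // lambda captures `bias` by reference, calls the implicitly-host function
  // `times_scale`, and transitively reads the host global `scale` -- exactly
  // the three patterns this patch stops Sema from rejecting outright.
  std::transform(std::execution::par_unseq, v.begin(), v.end(), v.begin(),
                 [&](int x) { return times_scale(x) + bias; });
}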

File tree: 4 files changed, +106 -3 lines

clang/lib/Sema/SemaCUDA.cpp

Lines changed: 10 additions & 1 deletion
@@ -249,6 +249,15 @@ Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
       (CallerTarget == CFT_Global && CalleeTarget == CFT_Device))
     return CFP_Native;
 
+  // HipStdPar mode is special, in that assessing whether a device side call to
+  // a host target is deferred to a subsequent pass, and cannot unambiguously be
+  // adjudicated in the AST, hence we optimistically allow them to pass here.
+  if (getLangOpts().HIPStdPar &&
+      (CallerTarget == CFT_Global || CallerTarget == CFT_Device ||
+       CallerTarget == CFT_HostDevice) &&
+      CalleeTarget == CFT_Host)
+    return CFP_HostDevice;
+
   // (d) HostDevice behavior depends on compilation mode.
   if (CallerTarget == CFT_HostDevice) {
     // It's OK to call a compilation-mode matching function from an HD one.
@@ -895,7 +904,7 @@ void Sema::CUDACheckLambdaCapture(CXXMethodDecl *Callee,
   if (!ShouldCheck || !Capture.isReferenceCapture())
     return;
   auto DiagKind = SemaDiagnosticBuilder::K_Deferred;
-  if (Capture.isVariableCapture()) {
+  if (Capture.isVariableCapture() && !getLangOpts().HIPStdPar) {
     SemaDiagnosticBuilder(DiagKind, Capture.getLocation(),
                           diag::err_capture_bad_target, Callee, *this)
         << Capture.getVariable();
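A hypothetical reduction (assumed for illustration, not taken from the patch) of what these two hunks permit when compiling with `--hipstdpar -fcuda-is-device`:

#define __device__ __attribute__((device))

void host_helper();                 // CFT_Host callee

__device__ void device_caller() {
  // Device-to-host call: previously CFP_Never (an error); now reported as
  // CFP_HostDevice, deferring any rejection to a later pass over the IR.
  host_helper();
}

void host_side(int &out) {
  // Host-formed lambda capturing `out` by reference; with the relaxed
  // CUDACheckLambdaCapture, err_capture_bad_target is no longer emitted for
  // variable captures in hipstdpar mode.
  auto set = [&out](int v) { out = v; };
  set(42);
}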

clang/lib/Sema/SemaExpr.cpp

Lines changed: 1 addition & 1 deletion
@@ -19157,7 +19157,7 @@ MarkVarDeclODRUsed(ValueDecl *V, SourceLocation Loc, Sema &SemaRef,
     // Diagnose ODR-use of host global variables in device functions.
     // Reference of device global variables in host functions is allowed
     // through shadow variables therefore it is not diagnosed.
-    if (SemaRef.LangOpts.CUDAIsDevice) {
+    if (SemaRef.LangOpts.CUDAIsDevice && !SemaRef.LangOpts.HIPStdPar) {
       SemaRef.targetDiag(Loc, diag::err_ref_bad_target)
           << /*host*/ 2 << /*variable*/ 1 << Var << UserTarget;
       SemaRef.targetDiag(Var->getLocation(),
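The case this hunk relaxes, as a minimal sketch (assumed, not from the patch):

#define __device__ __attribute__((device))

int host_global = 42;   // host variable, no device-side shadow

__device__ int read_host_global() {
  // ODR-use of a host global from a device function: during device
  // compilation this previously produced err_ref_bad_target; under
  // --hipstdpar the diagnostic is skipped and resolution of the reference
  // is left to later passes.
  return host_global;
}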

clang/lib/Sema/SemaStmtAsm.cpp

Lines changed: 2 additions & 1 deletion
@@ -271,7 +271,8 @@ StmtResult Sema::ActOnGCCAsmStmt(SourceLocation AsmLoc, bool IsSimple,
       OutputName = Names[i]->getName();
 
     TargetInfo::ConstraintInfo Info(Literal->getString(), OutputName);
-    if (!Context.getTargetInfo().validateOutputConstraint(Info)) {
+    if (!Context.getTargetInfo().validateOutputConstraint(Info) &&
+        !(LangOpts.HIPStdPar && LangOpts.CUDAIsDevice)) {
       targetDiag(Literal->getBeginLoc(),
                  diag::err_asm_invalid_output_constraint)
           << Info.getConstraintStr();
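An illustrative sketch (assumed, not from the patch) of asm this hunk lets through: host-only code can reach Sema during hipstdpar device compilation, and its constraints need not validate for the device target, so the error is suppressed there and left to the IR cleanup pass.

unsigned read_tsc_lo() {
  unsigned lo;
  // The "=a" output constraint is x86-specific and fails validation when
  // the target is amdgcn; with this change, err_asm_invalid_output_constraint
  // is not emitted in hipstdpar device compilation.
  __asm__ volatile("rdtsc" : "=a"(lo) : : "edx");
  return lo;
}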
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+// RUN: %clang_cc1 -x hip %s --hipstdpar -triple amdgcn-amd-amdhsa --std=c++17 \
+// RUN:   -fcuda-is-device -emit-llvm -o /dev/null -verify
+
+// Note: These would happen implicitly, within the implementation of the
+// accelerator specific algorithm library, and not from user code.
+
+// Calls from the accelerator side to implicitly host (i.e. unannotated)
+// functions are fine.
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+extern "C" void host_fn() {}
+
+struct Dummy {};
+
+struct S {
+  S() {}
+  ~S() { host_fn(); }
+
+  int x;
+};
+
+struct T {
+  __device__ void hd() { host_fn(); }
+
+  __device__ void hd3();
+
+  void h() {}
+
+  void operator+();
+  void operator-(const T&) {}
+
+  operator Dummy() { return Dummy(); }
+};
+
+__device__ void T::hd3() { host_fn(); }
+
+template <typename T> __device__ void hd2() { host_fn(); }
+
+__global__ void kernel() { hd2<int>(); }
+
+__device__ void hd() { host_fn(); }
+
+template <typename T> __device__ void hd3() { host_fn(); }
+__device__ void device_fn() { hd3<int>(); }
+
+__device__ void local_var() {
+  S s;
+}
+
+__device__ void explicit_destructor(S *s) {
+  s->~S();
+}
+
+__device__ void hd_member_fn() {
+  T t;
+
+  t.hd();
+}
+
+__device__ void h_member_fn() {
+  T t;
+  t.h();
+}
+
+__device__ void unaryOp() {
+  T t;
+  (void) +t;
+}
+
+__device__ void binaryOp() {
+  T t;
+  (void) (t - t);
+}
+
+__device__ void implicitConversion() {
+  T t;
+  Dummy d = t;
+}
+
+template <typename T>
+struct TmplStruct {
+  template <typename U> __device__ void fn() {}
+};
+
+template <>
+template <>
+__device__ void TmplStruct<int>::fn<int>() { host_fn(); }
+
+__device__ void double_specialization() { TmplStruct<int>().fn<int>(); }
