Skip to content

[clang][CUDA] Add 'noconvergent' function and statement attribute #100637

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions clang/include/clang/Basic/Attr.td
Original file line number Diff line number Diff line change
Expand Up @@ -2050,6 +2050,17 @@ def Convergent : InheritableAttr {
let SimpleHandler = 1;
}

def NoConvergent : InheritableAttr {
let Spellings = [Clang<"noconvergent">, Declspec<"noconvergent">];
let Subjects = SubjectList<[Function, Stmt], WarnDiag,
"functions and statements">;
let LangOpts = [CUDA];
let Documentation = [NoConvergentDocs];
let SimpleHandler = 1;
}

def : MutualExclusions<[Convergent, NoConvergent]>;

def NoInline : DeclOrStmtAttr {
let Spellings = [CustomKeyword<"__noinline__">, GCC<"noinline">,
CXX11<"clang", "noinline">, C23<"clang", "noinline">,
Expand Down
28 changes: 28 additions & 0 deletions clang/include/clang/Basic/AttrDocs.td
Original file line number Diff line number Diff line change
Expand Up @@ -1382,6 +1382,34 @@ Sample usage:
}];
}

def NoConvergentDocs : Documentation {
let Category = DocCatFunction;
let Content = [{
This attribute prevents a function from being treated as convergent, which
means that optimizations can only move calls to that function to
control-equivalent blocks. If a statement is marked as ``noconvergent`` and
contains calls, it also prevents those calls from being treated as convergent.
In other words, those calls are not restricted to only being moved to
control-equivalent blocks.

In languages following SPMD/SIMT programming model, e.g., CUDA/HIP, function
declarations and calls are treated as convergent by default for correctness.
This ``noconvergent`` attribute is helpful for developers to prevent them from
being treated as convergent when it's safe.

.. code-block:: c

__device__ float bar(float);
__device__ float foo(float) __attribute__((noconvergent)) {}

__device__ int example(void) {
float x;
[[clang::noconvergent]] x = bar(x);
}

}];
}

def NoSplitStackDocs : Documentation {
let Category = DocCatFunction;
let Content = [{
Expand Down
8 changes: 8 additions & 0 deletions clang/lib/CodeGen/CGCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2522,6 +2522,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
}
}
}
// Remove 'convergent' if requested.
if (TargetDecl->hasAttr<NoConvergentAttr>())
FuncAttrs.removeAttribute(llvm::Attribute::Convergent);
}

// Add "sample-profile-suffix-elision-policy" attribute for internal linkage
Expand Down Expand Up @@ -5636,6 +5639,11 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
Attrs =
Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::AlwaysInline);

// Remove call-site convergent attribute if requested.
if (InNoConvergentAttributedStmt)
Attrs =
Attrs.removeFnAttribute(getLLVMContext(), llvm::Attribute::Convergent);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it easy to avoid adding this in the first place?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it easy to avoid adding this in the first place?

getTrivialDefaultFunctionAttributes needs an extra argument to avoid that. However, that helper is called in several places, and that extra info is not always available. Forcing a default value in those places seems unreasonable or incorrect.


// Apply some call-site-specific attributes.
// TODO: work this into building the attribute set.

Expand Down
33 changes: 21 additions & 12 deletions clang/lib/CodeGen/CGStmt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
bool nomerge = false;
bool noinline = false;
bool alwaysinline = false;
bool noconvergent = false;
const CallExpr *musttail = nullptr;

for (const auto *A : S.getAttrs()) {
Expand All @@ -738,6 +739,9 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
case attr::AlwaysInline:
alwaysinline = true;
break;
case attr::NoConvergent:
noconvergent = true;
break;
case attr::MustTail: {
const Stmt *Sub = S.getSubStmt();
const ReturnStmt *R = cast<ReturnStmt>(Sub);
Expand All @@ -756,6 +760,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
SaveAndRestore save_nomerge(InNoMergeAttributedStmt, nomerge);
SaveAndRestore save_noinline(InNoInlineAttributedStmt, noinline);
SaveAndRestore save_alwaysinline(InAlwaysInlineAttributedStmt, alwaysinline);
SaveAndRestore save_noconvergent(InNoConvergentAttributedStmt, noconvergent);
SaveAndRestore save_musttail(MustTailCall, musttail);
EmitStmt(S.getSubStmt(), S.getAttrs());
}
Expand Down Expand Up @@ -2465,7 +2470,8 @@ static llvm::MDNode *getAsmSrcLocInfo(const StringLiteral *Str,

static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect,
bool HasUnwindClobber, bool ReadOnly,
bool ReadNone, bool NoMerge, const AsmStmt &S,
bool ReadNone, bool NoMerge, bool NoConvergent,
const AsmStmt &S,
const std::vector<llvm::Type *> &ResultRegTypes,
const std::vector<llvm::Type *> &ArgElemTypes,
CodeGenFunction &CGF,
Expand Down Expand Up @@ -2506,11 +2512,11 @@ static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect,
llvm::ConstantAsMetadata::get(Loc)));
}

if (CGF.getLangOpts().assumeFunctionsAreConvergent())
if (!NoConvergent && CGF.getLangOpts().assumeFunctionsAreConvergent())
// Conservatively, mark all inline asm blocks in CUDA or OpenCL as
// convergent (meaning, they may call an intrinsically convergent op, such
// as bar.sync, and so can't have certain optimizations applied around
// them).
// them) unless it's explicitly marked 'noconvergent'.
Result.addFnAttr(llvm::Attribute::Convergent);
// Extract all of the register value results from the asm.
if (ResultRegTypes.size() == 1) {
Expand Down Expand Up @@ -3040,9 +3046,10 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
if (IsGCCAsmGoto) {
CBR = Builder.CreateCallBr(IA, Fallthrough, Transfer, Args);
EmitBlock(Fallthrough);
UpdateAsmCallInst(*CBR, HasSideEffect, false, ReadOnly, ReadNone,
InNoMergeAttributedStmt, S, ResultRegTypes, ArgElemTypes,
*this, RegResults);
UpdateAsmCallInst(*CBR, HasSideEffect, /*HasUnwindClobber=*/false, ReadOnly,
ReadNone, InNoMergeAttributedStmt,
InNoConvergentAttributedStmt, S, ResultRegTypes,
ArgElemTypes, *this, RegResults);
// Because we are emitting code top to bottom, we don't have enough
// information at this point to know precisely whether we have a critical
// edge. If we have outputs, split all indirect destinations.
Expand Down Expand Up @@ -3070,15 +3077,17 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
}
} else if (HasUnwindClobber) {
llvm::CallBase *Result = EmitCallOrInvoke(IA, Args, "");
UpdateAsmCallInst(*Result, HasSideEffect, true, ReadOnly, ReadNone,
InNoMergeAttributedStmt, S, ResultRegTypes, ArgElemTypes,
*this, RegResults);
UpdateAsmCallInst(*Result, HasSideEffect, /*HasUnwindClobber=*/true,
ReadOnly, ReadNone, InNoMergeAttributedStmt,
InNoConvergentAttributedStmt, S, ResultRegTypes,
ArgElemTypes, *this, RegResults);
} else {
llvm::CallInst *Result =
Builder.CreateCall(IA, Args, getBundlesForFunclet(IA));
UpdateAsmCallInst(*Result, HasSideEffect, false, ReadOnly, ReadNone,
InNoMergeAttributedStmt, S, ResultRegTypes, ArgElemTypes,
*this, RegResults);
UpdateAsmCallInst(*Result, HasSideEffect, /*HasUnwindClobber=*/false,
ReadOnly, ReadNone, InNoMergeAttributedStmt,
InNoConvergentAttributedStmt, S, ResultRegTypes,
ArgElemTypes, *this, RegResults);
}

EmitAsmStores(*this, S, RegResults, ResultRegTypes, ResultTruncRegTypes,
Expand Down
3 changes: 3 additions & 0 deletions clang/lib/CodeGen/CodeGenFunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,9 @@ class CodeGenFunction : public CodeGenTypeCache {
/// True if the current statement has always_inline attribute.
bool InAlwaysInlineAttributedStmt = false;

/// True if the current statement has noconvergent attribute.
bool InNoConvergentAttributedStmt = false;

// The CallExpr within the current statement that the musttail attribute
// applies to. nullptr if there is no 'musttail' on the current statement.
const CallExpr *MustTailCall = nullptr;
Expand Down
15 changes: 15 additions & 0 deletions clang/lib/Sema/SemaStmtAttr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,19 @@ static Attr *handleNoMergeAttr(Sema &S, Stmt *St, const ParsedAttr &A,
return ::new (S.Context) NoMergeAttr(S.Context, A);
}

static Attr *handleNoConvergentAttr(Sema &S, Stmt *St, const ParsedAttr &A,
SourceRange Range) {
CallExprFinder CEF(S, St);

if (!CEF.foundCallExpr() && !CEF.foundAsmStmt()) {
S.Diag(St->getBeginLoc(), diag::warn_attribute_ignored_no_calls_in_stmt)
<< A;
return nullptr;
}

return ::new (S.Context) NoConvergentAttr(S.Context, A);
}

template <typename OtherAttr, int DiagIdx>
static bool CheckStmtInlineAttr(Sema &SemaRef, const Stmt *OrigSt,
const Stmt *CurSt,
Expand Down Expand Up @@ -671,6 +684,8 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A,
return handleCodeAlignAttr(S, St, A);
case ParsedAttr::AT_MSConstexpr:
return handleMSConstexprAttr(S, St, A, Range);
case ParsedAttr::AT_NoConvergent:
return handleNoConvergentAttr(S, St, A, Range);
default:
// N.B., ClangAttrEmitter.cpp emits a diagnostic helper that ensures a
// declaration attribute is not written on a statement, but this code is
Expand Down
104 changes: 79 additions & 25 deletions clang/test/CodeGenCUDA/convergent.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
// REQUIRES: x86-registered-target
// REQUIRES: nvptx-registered-target

Expand All @@ -10,36 +11,89 @@

#include "Inputs/cuda.h"

// DEVICE: Function Attrs:
// DEVICE-SAME: convergent
// DEVICE-NEXT: define{{.*}} void @_Z3foov
// DEVICE-LABEL: define dso_local void @_Z3foov(
// DEVICE-SAME: ) #[[ATTR0:[0-9]+]] {
// DEVICE-NEXT: [[ENTRY:.*:]]
// DEVICE-NEXT: ret void
//
__device__ void foo() {}
// DEVICE-LABEL: define dso_local void @_Z3baxv(
// DEVICE-SAME: ) #[[ATTR1:[0-9]+]] {
// DEVICE-NEXT: [[ENTRY:.*:]]
// DEVICE-NEXT: ret void
//
[[clang::noconvergent]] __device__ void bax() {}

// HOST: Function Attrs:
// HOST-NOT: convergent
// HOST-NEXT: define{{.*}} void @_Z3barv
// DEVICE: Function Attrs:
// DEVICE-SAME: convergent
// DEVICE-NEXT: define{{.*}} void @_Z3barv
__host__ __device__ void baz();

__host__ __device__ float aliasf0(int) asm("something");
__host__ __device__ [[clang::noconvergent]] float aliasf1(int) asm("somethingelse");

// DEVICE-LABEL: define dso_local void @_Z3barv(
// DEVICE-SAME: ) #[[ATTR0]] {
// DEVICE-NEXT: [[ENTRY:.*:]]
// DEVICE-NEXT: [[X:%.*]] = alloca i32, align 4
// DEVICE-NEXT: call void @_Z3bazv() #[[ATTR4:[0-9]+]]
// DEVICE-NEXT: [[TMP0:%.*]] = call i32 asm "trap", "=l"() #[[ATTR5:[0-9]+]], !srcloc [[META3:![0-9]+]]
// DEVICE-NEXT: store i32 [[TMP0]], ptr [[X]], align 4
// DEVICE-NEXT: call void asm sideeffect "trap", ""() #[[ATTR4]], !srcloc [[META4:![0-9]+]]
// DEVICE-NEXT: call void asm sideeffect "nop", ""() #[[ATTR6:[0-9]+]], !srcloc [[META5:![0-9]+]]
// DEVICE-NEXT: [[TMP1:%.*]] = load i32, ptr [[X]], align 4
// DEVICE-NEXT: [[CALL:%.*]] = call contract noundef float @something(i32 noundef [[TMP1]]) #[[ATTR4]]
// DEVICE-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4
// DEVICE-NEXT: [[CALL1:%.*]] = call contract noundef float @somethingelse(i32 noundef [[TMP2]]) #[[ATTR6]]
// DEVICE-NEXT: ret void
//
// HOST-LABEL: define dso_local void @_Z3barv(
// HOST-SAME: ) #[[ATTR0:[0-9]+]] {
// HOST-NEXT: [[ENTRY:.*:]]
// HOST-NEXT: [[X:%.*]] = alloca i32, align 4
// HOST-NEXT: call void @_Z3bazv()
// HOST-NEXT: [[TMP0:%.*]] = call i32 asm "trap", "=l,~{dirflag},~{fpsr},~{flags}"() #[[ATTR2:[0-9]+]], !srcloc [[META2:![0-9]+]]
// HOST-NEXT: store i32 [[TMP0]], ptr [[X]], align 4
// HOST-NEXT: call void asm sideeffect "trap", "~{dirflag},~{fpsr},~{flags}"() #[[ATTR3:[0-9]+]], !srcloc [[META3:![0-9]+]]
// HOST-NEXT: call void asm sideeffect "nop", "~{dirflag},~{fpsr},~{flags}"() #[[ATTR3]], !srcloc [[META4:![0-9]+]]
// HOST-NEXT: [[TMP1:%.*]] = load i32, ptr [[X]], align 4
// HOST-NEXT: [[CALL:%.*]] = call contract noundef float @something(i32 noundef [[TMP1]])
// HOST-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4
// HOST-NEXT: [[CALL1:%.*]] = call contract noundef float @somethingelse(i32 noundef [[TMP2]])
// HOST-NEXT: ret void
//
__host__ __device__ void bar() {
// DEVICE: call void @_Z3bazv() [[CALL_ATTR:#[0-9]+]]
baz();
// DEVICE: call i32 asm "trap;", "=l"() [[ASM_ATTR:#[0-9]+]]
int x;
asm ("trap;" : "=l"(x));
// DEVICE: call void asm sideeffect "trap;", ""() [[ASM_ATTR:#[0-9]+]]
asm volatile ("trap;");
asm ("trap" : "=l"(x));
asm volatile ("trap");
[[clang::noconvergent]] { asm volatile ("nop"); }
aliasf0(x);
aliasf1(x);
}

// DEVICE: declare void @_Z3bazv() [[BAZ_ATTR:#[0-9]+]]
// DEVICE: attributes [[BAZ_ATTR]] = {
// DEVICE-SAME: convergent
// DEVICE-SAME: }
// DEVICE-DAG: attributes [[CALL_ATTR]] = { convergent
// DEVICE-DAG: attributes [[ASM_ATTR]] = { convergent

// HOST: declare void @_Z3bazv() [[BAZ_ATTR:#[0-9]+]]
// HOST: attributes [[BAZ_ATTR]] = {
// HOST-NOT: convergent
// HOST-SAME: }

//.
// DEVICE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
// DEVICE: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
// DEVICE: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
// DEVICE: attributes #[[ATTR3:[0-9]+]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
// DEVICE: attributes #[[ATTR4]] = { convergent nounwind }
// DEVICE: attributes #[[ATTR5]] = { convergent nounwind memory(none) }
// DEVICE: attributes #[[ATTR6]] = { nounwind }
//.
// HOST: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
// HOST: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
// HOST: attributes #[[ATTR2]] = { nounwind memory(none) }
// HOST: attributes #[[ATTR3]] = { nounwind }
//.
// DEVICE: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
// DEVICE: [[META1:![0-9]+]] = !{i32 4, !"nvvm-reflect-ftz", i32 0}
// DEVICE: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
// DEVICE: [[META3]] = !{i64 3120}
// DEVICE: [[META4]] = !{i64 3155}
// DEVICE: [[META5]] = !{i64 3206}
//.
// HOST: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
// HOST: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
// HOST: [[META2]] = !{i64 3120}
// HOST: [[META3]] = !{i64 3155}
// HOST: [[META4]] = !{i64 3206}
//.
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@
// CHECK-NEXT: Naked (SubjectMatchRule_function)
// CHECK-NEXT: NoBuiltin (SubjectMatchRule_function)
// CHECK-NEXT: NoCommon (SubjectMatchRule_variable)
// CHECK-NEXT: NoConvergent (SubjectMatchRule_function)
// CHECK-NEXT: NoDebug (SubjectMatchRule_type_alias, SubjectMatchRule_hasType_functionType, SubjectMatchRule_objc_method, SubjectMatchRule_variable_not_is_parameter)
// CHECK-NEXT: NoDestroy (SubjectMatchRule_variable)
// CHECK-NEXT: NoDuplicate (SubjectMatchRule_function)
Expand Down
34 changes: 34 additions & 0 deletions clang/test/SemaCUDA/attr-noconvergent.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s

#include "Inputs/cuda.h"

__device__ float f0(float) __attribute__((noconvergent));
__device__ __attribute__((noconvergent)) float f1(float);
[[clang::noconvergent]] __device__ float f2(float);

__device__ [[clang::noconvergent(1)]] float f3(float);
// expected-error@-1 {{'noconvergent' attribute takes no arguments}}

__device__ [[clang::noconvergent]] float g0;
// expected-warning@-1 {{'noconvergent' attribute only applies to functions and statements}}

__device__ __attribute__((convergent)) __attribute__((noconvergent)) float f4(float);
// expected-error@-1 {{'noconvergent' and 'convergent' attributes are not compatible}}
// expected-note@-2 {{conflicting attribute is here}}

__device__ [[clang::noconvergent]] float f5(float);
__device__ [[clang::convergent]] float f5(float);
// expected-error@-1 {{'convergent' and 'noconvergent' attributes are not compatible}}
// expected-note@-3 {{conflicting attribute is here}}

__device__ float f5(float x) {
[[clang::noconvergent]] float y;
// expected-warning@-1 {{'noconvergent' attribute only applies to functions and statements}}

float z;

[[clang::noconvergent]] z = 1;
// expected-warning@-1 {{'noconvergent' attribute is ignored because there exists no call expression inside the statement}}

[[clang::noconvergent]] z = f0(x);
}