llvm · darkbuck · Jul 31, 2024 · Jul 25, 2024 · Jul 25, 2024 · Jul 25, 2024
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
@@ -2050,6 +2050,17 @@ def Convergent : InheritableAttr {
   let SimpleHandler = 1;
 }
 
+def NoConvergent : InheritableAttr {
+  let Spellings = [Clang<"noconvergent">, Declspec<"noconvergent">];
+  let Subjects = SubjectList<[Function, Stmt], WarnDiag,
+                             "functions and statements">;
+  let LangOpts = [CUDA];
+  let Documentation = [NoConvergentDocs];
+  let SimpleHandler = 1;
+}
+
+def : MutualExclusions<[Convergent, NoConvergent]>;
+
 def NoInline : DeclOrStmtAttr {
   let Spellings = [CustomKeyword<"__noinline__">, GCC<"noinline">,
                    CXX11<"clang", "noinline">, C23<"clang", "noinline">,

diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
@@ -1382,6 +1382,34 @@ Sample usage:
   }];
 }
 
+def NoConvergentDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+This attribute prevents a function from being treated as convergent, which
+means that optimizations can only move calls to that function to
+control-equivalent blocks. If a statement is marked as ``noconvergent`` and
+contains calls, it also prevents those calls from being treated as convergent.
+In other words, those calls are not restricted to only being moved to
+control-equivalent blocks.
+
+In languages following SPMD/SIMT programming model, e.g., CUDA/HIP, function
+declarations and calls are treated as convergent by default for correctness.
+This ``noconvergent`` attribute is helpful for developers to prevent them from
+being treated as convergent when it's safe.
+
+.. code-block:: c
+
+  __device__ float bar(float);
+  __device__ float foo(float) __attribute__((noconvergent)) {}
+
+  __device__ int example(void) {
+    float x;
+    [[clang::noconvergent]] x = bar(x);
+  }
+
+  }];
+}
+
 def NoSplitStackDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
@@ -2522,6 +2522,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
         }
       }
     }
+    // Remove 'convergent' if requested.
+    if (TargetDecl->hasAttr<NoConvergentAttr>())
+      FuncAttrs.removeAttribute(llvm::Attribute::Convergent);
   }
 
   // Add "sample-profile-suffix-elision-policy" attribute for internal linkage
@@ -5636,6 +5639,11 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
     Attrs =
         Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::AlwaysInline);
 
+  // Remove call-site convergent attribute if requested.
+  if (InNoConvergentAttributedStmt)
+    Attrs =
+        Attrs.removeFnAttribute(getLLVMContext(), llvm::Attribute::Convergent);
+
   // Apply some call-site-specific attributes.
   // TODO: work this into building the attribute set.
 

diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
@@ -723,6 +723,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
   bool nomerge = false;
   bool noinline = false;
   bool alwaysinline = false;
+  bool noconvergent = false;
   const CallExpr *musttail = nullptr;
 
   for (const auto *A : S.getAttrs()) {
@@ -738,6 +739,9 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
     case attr::AlwaysInline:
       alwaysinline = true;
       break;
+    case attr::NoConvergent:
+      noconvergent = true;
+      break;
     case attr::MustTail: {
       const Stmt *Sub = S.getSubStmt();
       const ReturnStmt *R = cast<ReturnStmt>(Sub);
@@ -756,6 +760,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
   SaveAndRestore save_nomerge(InNoMergeAttributedStmt, nomerge);
   SaveAndRestore save_noinline(InNoInlineAttributedStmt, noinline);
   SaveAndRestore save_alwaysinline(InAlwaysInlineAttributedStmt, alwaysinline);
+  SaveAndRestore save_noconvergent(InNoConvergentAttributedStmt, noconvergent);
   SaveAndRestore save_musttail(MustTailCall, musttail);
   EmitStmt(S.getSubStmt(), S.getAttrs());
 }
@@ -2465,7 +2470,8 @@ static llvm::MDNode *getAsmSrcLocInfo(const StringLiteral *Str,
 
 static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect,
                               bool HasUnwindClobber, bool ReadOnly,
-                              bool ReadNone, bool NoMerge, const AsmStmt &S,
+                              bool ReadNone, bool NoMerge, bool NoConvergent,
+                              const AsmStmt &S,
                               const std::vector<llvm::Type *> &ResultRegTypes,
                               const std::vector<llvm::Type *> &ArgElemTypes,
                               CodeGenFunction &CGF,
@@ -2506,11 +2512,11 @@ static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect,
                                          llvm::ConstantAsMetadata::get(Loc)));
   }
 
-  if (CGF.getLangOpts().assumeFunctionsAreConvergent())
+  if (!NoConvergent && CGF.getLangOpts().assumeFunctionsAreConvergent())
     // Conservatively, mark all inline asm blocks in CUDA or OpenCL as
     // convergent (meaning, they may call an intrinsically convergent op, such
     // as bar.sync, and so can't have certain optimizations applied around
-    // them).
+    // them) unless it's explicitly marked 'noconvergent'.
     Result.addFnAttr(llvm::Attribute::Convergent);
   // Extract all of the register value results from the asm.
   if (ResultRegTypes.size() == 1) {
@@ -3040,9 +3046,10 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
   if (IsGCCAsmGoto) {
     CBR = Builder.CreateCallBr(IA, Fallthrough, Transfer, Args);
     EmitBlock(Fallthrough);
-    UpdateAsmCallInst(*CBR, HasSideEffect, false, ReadOnly, ReadNone,
-                      InNoMergeAttributedStmt, S, ResultRegTypes, ArgElemTypes,
-                      *this, RegResults);
+    UpdateAsmCallInst(*CBR, HasSideEffect, /*HasUnwindClobber=*/false, ReadOnly,
+                      ReadNone, InNoMergeAttributedStmt,
+                      InNoConvergentAttributedStmt, S, ResultRegTypes,
+                      ArgElemTypes, *this, RegResults);
     // Because we are emitting code top to bottom, we don't have enough
     // information at this point to know precisely whether we have a critical
     // edge. If we have outputs, split all indirect destinations.
@@ -3070,15 +3077,17 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
     }
   } else if (HasUnwindClobber) {
     llvm::CallBase *Result = EmitCallOrInvoke(IA, Args, "");
-    UpdateAsmCallInst(*Result, HasSideEffect, true, ReadOnly, ReadNone,
-                      InNoMergeAttributedStmt, S, ResultRegTypes, ArgElemTypes,
-                      *this, RegResults);
+    UpdateAsmCallInst(*Result, HasSideEffect, /*HasUnwindClobber=*/true,
+                      ReadOnly, ReadNone, InNoMergeAttributedStmt,
+                      InNoConvergentAttributedStmt, S, ResultRegTypes,
+                      ArgElemTypes, *this, RegResults);
   } else {
     llvm::CallInst *Result =
         Builder.CreateCall(IA, Args, getBundlesForFunclet(IA));
-    UpdateAsmCallInst(*Result, HasSideEffect, false, ReadOnly, ReadNone,
-                      InNoMergeAttributedStmt, S, ResultRegTypes, ArgElemTypes,
-                      *this, RegResults);
+    UpdateAsmCallInst(*Result, HasSideEffect, /*HasUnwindClobber=*/false,
+                      ReadOnly, ReadNone, InNoMergeAttributedStmt,
+                      InNoConvergentAttributedStmt, S, ResultRegTypes,
+                      ArgElemTypes, *this, RegResults);
   }
 
   EmitAsmStores(*this, S, RegResults, ResultRegTypes, ResultTruncRegTypes,

diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
@@ -612,6 +612,9 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// True if the current statement has always_inline attribute.
   bool InAlwaysInlineAttributedStmt = false;
 
+  /// True if the current statement has noconvergent attribute.
+  bool InNoConvergentAttributedStmt = false;
+
   // The CallExpr within the current statement that the musttail attribute
   // applies to.  nullptr if there is no 'musttail' on the current statement.
   const CallExpr *MustTailCall = nullptr;

diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -229,6 +229,19 @@ static Attr *handleNoMergeAttr(Sema &S, Stmt *St, const ParsedAttr &A,
   return ::new (S.Context) NoMergeAttr(S.Context, A);
 }
 
+static Attr *handleNoConvergentAttr(Sema &S, Stmt *St, const ParsedAttr &A,
+                                    SourceRange Range) {
+  CallExprFinder CEF(S, St);
+
+  if (!CEF.foundCallExpr() && !CEF.foundAsmStmt()) {
+    S.Diag(St->getBeginLoc(), diag::warn_attribute_ignored_no_calls_in_stmt)
+        << A;
+    return nullptr;
+  }
+
+  return ::new (S.Context) NoConvergentAttr(S.Context, A);
+}
+
 template <typename OtherAttr, int DiagIdx>
 static bool CheckStmtInlineAttr(Sema &SemaRef, const Stmt *OrigSt,
                                 const Stmt *CurSt,
@@ -671,6 +684,8 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A,
     return handleCodeAlignAttr(S, St, A);
   case ParsedAttr::AT_MSConstexpr:
     return handleMSConstexprAttr(S, St, A, Range);
+  case ParsedAttr::AT_NoConvergent:
+    return handleNoConvergentAttr(S, St, A, Range);
   default:
     // N.B., ClangAttrEmitter.cpp emits a diagnostic helper that ensures a
     // declaration attribute is not written on a statement, but this code is

diff --git a/clang/test/CodeGenCUDA/convergent.cu b/clang/test/CodeGenCUDA/convergent.cu
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
@@ -10,36 +11,89 @@
 
 #include "Inputs/cuda.h"
 
-// DEVICE: Function Attrs:
-// DEVICE-SAME: convergent
-// DEVICE-NEXT: define{{.*}} void @_Z3foov
+// DEVICE-LABEL: define dso_local void @_Z3foov(
+// DEVICE-SAME: ) #[[ATTR0:[0-9]+]] {
+// DEVICE-NEXT:  [[ENTRY:.*:]]
+// DEVICE-NEXT:    ret void
+//
 __device__ void foo() {}
+// DEVICE-LABEL: define dso_local void @_Z3baxv(
+// DEVICE-SAME: ) #[[ATTR1:[0-9]+]] {
+// DEVICE-NEXT:  [[ENTRY:.*:]]
+// DEVICE-NEXT:    ret void
+//
+[[clang::noconvergent]] __device__ void bax() {}
 
-// HOST: Function Attrs:
-// HOST-NOT: convergent
-// HOST-NEXT: define{{.*}} void @_Z3barv
-// DEVICE: Function Attrs:
-// DEVICE-SAME: convergent
-// DEVICE-NEXT: define{{.*}} void @_Z3barv
 __host__ __device__ void baz();
+
+__host__ __device__ float aliasf0(int) asm("something");
+__host__ __device__ [[clang::noconvergent]] float aliasf1(int) asm("somethingelse");
+
+// DEVICE-LABEL: define dso_local void @_Z3barv(
+// DEVICE-SAME: ) #[[ATTR0]] {
+// DEVICE-NEXT:  [[ENTRY:.*:]]
+// DEVICE-NEXT:    [[X:%.*]] = alloca i32, align 4
+// DEVICE-NEXT:    call void @_Z3bazv() #[[ATTR4:[0-9]+]]
+// DEVICE-NEXT:    [[TMP0:%.*]] = call i32 asm "trap", "=l"() #[[ATTR5:[0-9]+]], !srcloc [[META3:![0-9]+]]
+// DEVICE-NEXT:    store i32 [[TMP0]], ptr [[X]], align 4
+// DEVICE-NEXT:    call void asm sideeffect "trap", ""() #[[ATTR4]], !srcloc [[META4:![0-9]+]]
+// DEVICE-NEXT:    call void asm sideeffect "nop", ""() #[[ATTR6:[0-9]+]], !srcloc [[META5:![0-9]+]]
+// DEVICE-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+// DEVICE-NEXT:    [[CALL:%.*]] = call contract noundef float @something(i32 noundef [[TMP1]]) #[[ATTR4]]
+// DEVICE-NEXT:    [[TMP2:%.*]] = load i32, ptr [[X]], align 4
+// DEVICE-NEXT:    [[CALL1:%.*]] = call contract noundef float @somethingelse(i32 noundef [[TMP2]]) #[[ATTR6]]
+// DEVICE-NEXT:    ret void
+//
+// HOST-LABEL: define dso_local void @_Z3barv(
+// HOST-SAME: ) #[[ATTR0:[0-9]+]] {
+// HOST-NEXT:  [[ENTRY:.*:]]
+// HOST-NEXT:    [[X:%.*]] = alloca i32, align 4
+// HOST-NEXT:    call void @_Z3bazv()
+// HOST-NEXT:    [[TMP0:%.*]] = call i32 asm "trap", "=l,~{dirflag},~{fpsr},~{flags}"() #[[ATTR2:[0-9]+]], !srcloc [[META2:![0-9]+]]
+// HOST-NEXT:    store i32 [[TMP0]], ptr [[X]], align 4
+// HOST-NEXT:    call void asm sideeffect "trap", "~{dirflag},~{fpsr},~{flags}"() #[[ATTR3:[0-9]+]], !srcloc [[META3:![0-9]+]]
+// HOST-NEXT:    call void asm sideeffect "nop", "~{dirflag},~{fpsr},~{flags}"() #[[ATTR3]], !srcloc [[META4:![0-9]+]]
+// HOST-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+// HOST-NEXT:    [[CALL:%.*]] = call contract noundef float @something(i32 noundef [[TMP1]])
+// HOST-NEXT:    [[TMP2:%.*]] = load i32, ptr [[X]], align 4
+// HOST-NEXT:    [[CALL1:%.*]] = call contract noundef float @somethingelse(i32 noundef [[TMP2]])
+// HOST-NEXT:    ret void
+//
 __host__ __device__ void bar() {
-  // DEVICE: call void @_Z3bazv() [[CALL_ATTR:#[0-9]+]]
   baz();
-  // DEVICE: call i32 asm "trap;", "=l"() [[ASM_ATTR:#[0-9]+]]
   int x;
-  asm ("trap;" : "=l"(x));
-  // DEVICE: call void asm sideeffect "trap;", ""() [[ASM_ATTR:#[0-9]+]]
-  asm volatile ("trap;");
+  asm ("trap" : "=l"(x));
+  asm volatile ("trap");
+  [[clang::noconvergent]] { asm volatile ("nop"); }
+  aliasf0(x);
+  aliasf1(x);
 }
 
-// DEVICE: declare void @_Z3bazv() [[BAZ_ATTR:#[0-9]+]]
-// DEVICE: attributes [[BAZ_ATTR]] = {
-// DEVICE-SAME: convergent
-// DEVICE-SAME: }
-// DEVICE-DAG: attributes [[CALL_ATTR]] = { convergent
-// DEVICE-DAG: attributes [[ASM_ATTR]] = { convergent
-
-// HOST: declare void @_Z3bazv() [[BAZ_ATTR:#[0-9]+]]
-// HOST: attributes [[BAZ_ATTR]] = {
-// HOST-NOT: convergent
-// HOST-SAME: }
+
+//.
+// DEVICE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR3:[0-9]+]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR4]] = { convergent nounwind }
+// DEVICE: attributes #[[ATTR5]] = { convergent nounwind memory(none) }
+// DEVICE: attributes #[[ATTR6]] = { nounwind }
+//.
+// HOST: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// HOST: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// HOST: attributes #[[ATTR2]] = { nounwind memory(none) }
+// HOST: attributes #[[ATTR3]] = { nounwind }
+//.
+// DEVICE: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// DEVICE: [[META1:![0-9]+]] = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+// DEVICE: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// DEVICE: [[META3]] = !{i64 3120}
+// DEVICE: [[META4]] = !{i64 3155}
+// DEVICE: [[META5]] = !{i64 3206}
+//.
+// HOST: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// HOST: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// HOST: [[META2]] = !{i64 3120}
+// HOST: [[META3]] = !{i64 3155}
+// HOST: [[META4]] = !{i64 3206}
+//.
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -109,6 +109,7 @@
 // CHECK-NEXT: Naked (SubjectMatchRule_function)
 // CHECK-NEXT: NoBuiltin (SubjectMatchRule_function)
 // CHECK-NEXT: NoCommon (SubjectMatchRule_variable)
+// CHECK-NEXT: NoConvergent (SubjectMatchRule_function)
 // CHECK-NEXT: NoDebug (SubjectMatchRule_type_alias, SubjectMatchRule_hasType_functionType, SubjectMatchRule_objc_method, SubjectMatchRule_variable_not_is_parameter)
 // CHECK-NEXT: NoDestroy (SubjectMatchRule_variable)
 // CHECK-NEXT: NoDuplicate (SubjectMatchRule_function)

diff --git a/clang/test/SemaCUDA/attr-noconvergent.cu b/clang/test/SemaCUDA/attr-noconvergent.cu
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s
+
+#include "Inputs/cuda.h"
+
+__device__ float f0(float) __attribute__((noconvergent));
+__device__ __attribute__((noconvergent)) float f1(float);
+[[clang::noconvergent]] __device__ float f2(float);
+
+__device__ [[clang::noconvergent(1)]] float f3(float);
+// expected-error@-1 {{'noconvergent' attribute takes no arguments}}
+
+__device__ [[clang::noconvergent]] float g0;
+// expected-warning@-1 {{'noconvergent' attribute only applies to functions and statements}}
+
+__device__ __attribute__((convergent)) __attribute__((noconvergent)) float f4(float);
+// expected-error@-1 {{'noconvergent' and 'convergent' attributes are not compatible}}
+// expected-note@-2 {{conflicting attribute is here}}
+
+__device__ [[clang::noconvergent]] float f5(float);
+__device__ [[clang::convergent]] float f5(float);
+// expected-error@-1 {{'convergent' and 'noconvergent' attributes are not compatible}}
+// expected-note@-3 {{conflicting attribute is here}}
+
+__device__ float f5(float x) {
+  [[clang::noconvergent]] float y;
+// expected-warning@-1 {{'noconvergent' attribute only applies to functions and statements}}
+
+  float z;
+
+  [[clang::noconvergent]] z = 1;
+// expected-warning@-1 {{'noconvergent' attribute is ignored because there exists no call expression inside the statement}}
+
+  [[clang::noconvergent]] z = f0(x);
+}