Skip to content

Commit c1ac6d2

Browse files
authored
[AMDGPU] Add amdgpu-as MMRA for fences (llvm#78572)
Using MMRAs, allow `__builtin_amdgcn_fence` to emit fences that only target one or more address spaces, instead of fencing all address spaces at once. This is done through an `amdgpu-as` MMRA. Currently focused on OpenCL fences, but can very easily support more AS names and codegen on more than just fences.
1 parent f1d13bb commit c1ac6d2

File tree

9 files changed

+3236
-20
lines changed

9 files changed

+3236
-20
lines changed

clang/docs/LanguageExtensions.rst

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4403,6 +4403,7 @@ immediately after the name being declared.
44034403
For example, this applies the GNU ``unused`` attribute to ``a`` and ``f``, and
44044404
also applies the GNU ``noreturn`` attribute to ``f``.
44054405
4406+
Examples:
44064407
.. code-block:: c++
44074408
44084409
[[gnu::unused]] int a, f [[gnu::noreturn]] ();
@@ -4412,6 +4413,42 @@ Target-Specific Extensions
44124413
44134414
Clang supports some language features conditionally on some targets.
44144415
4416+
AMDGPU Language Extensions
4417+
--------------------------
4418+
4419+
__builtin_amdgcn_fence
4420+
^^^^^^^^^^^^^^^^^^^^^^
4421+
4422+
``__builtin_amdgcn_fence`` emits a fence. It takes the following arguments:
4423+
4424+
* ``unsigned`` atomic ordering, e.g. ``__ATOMIC_ACQUIRE``
4425+
* ``const char *`` synchronization scope, e.g. ``workgroup``
4426+
* Zero or more ``const char *`` address space names.
4427+
4428+
Each address space argument must be one of the following string literals:
4429+
4430+
* ``"local"``
4431+
* ``"global"``
4432+
4433+
If one or more address space names are provided, the code generator will attempt
4434+
to emit potentially faster instructions that order access to at least those
4435+
address spaces.
4436+
Emitting such instructions may not always be possible and the compiler is free
4437+
to fence more aggressively.
4438+
4439+
If no address space names are provided, all address spaces are fenced.
4440+
4441+
.. code-block:: c++
4442+
4443+
// Fence all address spaces.
4444+
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
4445+
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent");
4446+
4447+
// Fence only requested address spaces.
4448+
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local");
4449+
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup", "local", "global");
4450+
4451+
44154452
ARM/AArch64 Language Extensions
44164453
-------------------------------
44174454
@@ -5602,4 +5639,4 @@ Compiling different TUs depending on these flags (including use of
56025639
``std::hardware_constructive_interference`` or
56035640
``std::hardware_destructive_interference``) with different compilers, macro
56045641
definitions, or architecture flags will lead to ODR violations and should be
5605-
avoided.
5642+
avoided.

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n")
6868
BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n")
6969
BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n")
7070
BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n")
71-
BUILTIN(__builtin_amdgcn_fence, "vUicC*", "n")
71+
BUILTIN(__builtin_amdgcn_fence, "vUicC*.", "n")
7272
BUILTIN(__builtin_amdgcn_groupstaticsize, "Ui", "n")
7373
BUILTIN(__builtin_amdgcn_wavefrontsize, "Ui", "nc")
7474

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
#include "llvm/IR/IntrinsicsX86.h"
5858
#include "llvm/IR/MDBuilder.h"
5959
#include "llvm/IR/MatrixBuilder.h"
60+
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
6061
#include "llvm/Support/ConvertUTF.h"
6162
#include "llvm/Support/MathExtras.h"
6263
#include "llvm/Support/ScopedPrinter.h"
@@ -18327,6 +18328,29 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
1832718328
return nullptr;
1832818329
}
1832918330

18331+
// Attaches "amdgpu-as" MMRA metadata to \p Inst, one tag per address space
// name passed as a trailing argument (index 2 and up) of the builtin call
// \p E. Arguments that do not fold to a constant string are reported through
// CGM.Error and contribute no tag.
void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
18332+
const CallExpr *E) {
18333+
// Prefix shared by every tag produced here; the address space name is the
// suffix.
constexpr const char *Tag = "amdgpu-as";
18334+
18335+
LLVMContext &Ctx = Inst->getContext();
18336+
SmallVector<MMRAMetadata::TagT, 3> MMRAs;
18337+
// Arguments 0 and 1 of the fence builtin are the ordering and the
// synchronization scope, so address space names start at index 2.
for (unsigned K = 2; K < E->getNumArgs(); ++K) {
18338+
llvm::Value *V = EmitScalarExpr(E->getArg(K));
18339+
StringRef AS;
18340+
if (llvm::getConstantStringInfo(V, AS)) {
18341+
MMRAs.push_back({Tag, AS});
18342+
// TODO: Delete the resulting unused constant?
18343+
continue;
18344+
}
18345+
// Not a constant string: diagnose and keep going so the remaining
// arguments are still checked. The location used is that of the whole call
// expression, not of the offending argument.
CGM.Error(E->getExprLoc(),
18346+
"expected an address space name as a string literal");
18347+
}
18348+
18349+
// Sort and drop duplicates so that repeated or reordered names yield the
// same tag list.
llvm::sort(MMRAs);
18350+
MMRAs.erase(llvm::unique(MMRAs), MMRAs.end());
18351+
Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
18352+
}
18353+
1833018354
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1833118355
const CallExpr *E) {
1833218356
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -18997,7 +19021,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1899719021
case AMDGPU::BI__builtin_amdgcn_fence: {
1899819022
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)),
1899919023
EmitScalarExpr(E->getArg(1)), AO, SSID);
19000-
return Builder.CreateFence(AO, SSID);
19024+
FenceInst *Fence = Builder.CreateFence(AO, SSID);
19025+
if (E->getNumArgs() > 2)
19026+
AddAMDGPUFenceAddressSpaceMMRA(Fence, E);
19027+
return Fence;
1900119028
}
1900219029
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1900319030
case AMDGPU::BI__builtin_amdgcn_atomic_inc64:

clang/lib/CodeGen/CodeGenFunction.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4635,6 +4635,9 @@ class CodeGenFunction : public CodeGenTypeCache {
46354635
llvm::Value *EmitHexagonBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
46364636
llvm::Value *EmitRISCVBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
46374637
ReturnValueSlot ReturnValue);
4638+
4639+
/// Adds "amdgpu-as" MMRA metadata to \p Inst, built from the address space
/// name arguments (index 2 and up) of the fence builtin call \p E.
void AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
4640+
const CallExpr *E);
46384641
void ProcessOrderScopeAMDGCN(llvm::Value *Order, llvm::Value *Scope,
46394642
llvm::AtomicOrdering &AO,
46404643
llvm::SyncScope::ID &SSID);
Lines changed: 96 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,111 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
12
// REQUIRES: amdgpu-registered-target
23
// RUN: %clang_cc1 %s -emit-llvm -O0 -o - \
3-
// RUN: -triple=amdgcn-amd-amdhsa | opt -S | FileCheck %s
4+
// RUN: -triple=amdgcn-amd-amdhsa | FileCheck %s
45

6+
// CHECK-LABEL: define dso_local void @_Z25test_memory_fence_successv(
7+
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
8+
// CHECK-NEXT: entry:
9+
// CHECK-NEXT: fence syncscope("workgroup") seq_cst
10+
// CHECK-NEXT: fence syncscope("agent") acquire
11+
// CHECK-NEXT: fence seq_cst
12+
// CHECK-NEXT: fence syncscope("agent") acq_rel
13+
// CHECK-NEXT: fence syncscope("workgroup") release
14+
// CHECK-NEXT: ret void
15+
//
516
void test_memory_fence_success() {
6-
// CHECK-LABEL: test_memory_fence_success
717

8-
// CHECK: fence syncscope("workgroup") seq_cst
918
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
1019

11-
// CHECK: fence syncscope("agent") acquire
1220
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent");
1321

14-
// CHECK: fence seq_cst
1522
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
1623

17-
// CHECK: fence syncscope("agent") acq_rel
1824
__builtin_amdgcn_fence(4, "agent");
1925

20-
// CHECK: fence syncscope("workgroup") release
2126
__builtin_amdgcn_fence(3, "workgroup");
2227
}
28+
29+
// CHECK-LABEL: define dso_local void @_Z10test_localv(
30+
// CHECK-SAME: ) #[[ATTR0]] {
31+
// CHECK-NEXT: entry:
32+
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3:![0-9]+]]
33+
// CHECK-NEXT: fence syncscope("agent") acquire, !mmra [[META3]]
34+
// CHECK-NEXT: fence seq_cst, !mmra [[META3]]
35+
// CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]]
36+
// CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META3]]
37+
// CHECK-NEXT: ret void
38+
//
39+
void test_local() {
// Same ordering/scope combinations as test_memory_fence_success, each with
// the single address space name "local"; the CHECK lines expect all five
// fences to reference the same !mmra node.
40+
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local");
41+
42+
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent", "local");
43+
44+
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "", "local");
45+
46+
__builtin_amdgcn_fence(4, "agent", "local");
47+
48+
__builtin_amdgcn_fence(3, "workgroup", "local");
49+
}
50+
51+
52+
// CHECK-LABEL: define dso_local void @_Z11test_globalv(
53+
// CHECK-SAME: ) #[[ATTR0]] {
54+
// CHECK-NEXT: entry:
55+
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META4:![0-9]+]]
56+
// CHECK-NEXT: fence syncscope("agent") acquire, !mmra [[META4]]
57+
// CHECK-NEXT: fence seq_cst, !mmra [[META4]]
58+
// CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META4]]
59+
// CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META4]]
60+
// CHECK-NEXT: ret void
61+
//
62+
void test_global() {
// Same ordering/scope combinations as test_memory_fence_success, each with
// the single address space name "global"; the CHECK lines expect all five
// fences to reference the same !mmra node.
63+
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "global");
64+
65+
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent", "global");
66+
67+
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "", "global");
68+
69+
__builtin_amdgcn_fence(4, "agent", "global");
70+
71+
__builtin_amdgcn_fence(3, "workgroup", "global");
72+
}
73+
74+
// CHECK-LABEL: define dso_local void @_Z10test_imagev(
75+
// CHECK-SAME: ) #[[ATTR0]] {
76+
// CHECK-NEXT: entry:
77+
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3]]
78+
// CHECK-NEXT: fence syncscope("agent") acquire, !mmra [[META3]]
79+
// CHECK-NEXT: fence seq_cst, !mmra [[META3]]
80+
// CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]]
81+
// CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META3]]
82+
// CHECK-NEXT: ret void
83+
//
84+
void test_image() {
// NOTE(review): despite the function name, every call below passes "local",
// and the CHECK lines expect the same !mmra node as test_local. Confirm
// whether an image address space name was intended here.
85+
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local");
86+
87+
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent", "local");
88+
89+
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "", "local");
90+
91+
__builtin_amdgcn_fence(4, "agent", "local");
92+
93+
__builtin_amdgcn_fence(3, "workgroup", "local");
94+
}
95+
96+
// CHECK-LABEL: define dso_local void @_Z10test_mixedv(
97+
// CHECK-SAME: ) #[[ATTR0]] {
98+
// CHECK-NEXT: entry:
99+
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5:![0-9]+]]
100+
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5]]
101+
// CHECK-NEXT: ret void
102+
//
103+
void test_mixed() {
// Both calls name the same set of address spaces ("local" and "global"). The
// second one repeats names in a different order; the CHECK lines expect both
// fences to reference the same combined !mmra node.
104+
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local", "global");
105+
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local", "local", "global", "local", "local");
106+
}
107+
//.
108+
// CHECK: [[META3]] = !{!"amdgpu-as", !"local"}
109+
// CHECK: [[META4]] = !{!"amdgpu-as", !"global"}
110+
// CHECK: [[META5]] = !{[[META4]], [[META3]]}
111+
//.

clang/test/SemaOpenCL/builtins-amdgcn-error.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,8 +155,8 @@ void test_ds_fmaxf(local float *out, float src, int a) {
155155
void test_fence() {
156156
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST + 1, "workgroup"); // expected-warning {{memory order argument to atomic operation is invalid}}
157157
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE - 1, "workgroup"); // expected-warning {{memory order argument to atomic operation is invalid}}
158-
__builtin_amdgcn_fence(4); // expected-error {{too few arguments to function call, expected 2}}
159-
__builtin_amdgcn_fence(4, 4, 4); // expected-error {{too many arguments to function call, expected 2}}
158+
__builtin_amdgcn_fence(4); // expected-error {{too few arguments to function call, expected at least 2, have 1}}
159+
__builtin_amdgcn_fence(4, 4, 4); // expected-error {{incompatible integer to pointer conversion passing 'int' to parameter of type 'const char *'}}
160160
__builtin_amdgcn_fence(3.14, ""); // expected-warning {{implicit conversion from 'double' to 'unsigned int' changes value from 3.14 to 3}}
161161
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, 5); // expected-error {{incompatible integer to pointer conversion passing 'int' to parameter of type 'const char *'}}
162162
const char ptr[] = "workgroup";

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 56 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@
1818
#include "GCNSubtarget.h"
1919
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
2020
#include "llvm/ADT/BitmaskEnum.h"
21+
#include "llvm/ADT/StringExtras.h"
2122
#include "llvm/CodeGen/MachineBasicBlock.h"
2223
#include "llvm/CodeGen/MachineFunctionPass.h"
2324
#include "llvm/IR/DiagnosticInfo.h"
25+
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
2426
#include "llvm/Support/AtomicOrdering.h"
2527
#include "llvm/TargetParser/TargetParser.h"
2628

@@ -678,6 +680,49 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
678680
bool runOnMachineFunction(MachineFunction &MF) override;
679681
};
680682

683+
/// Address space names accepted as the suffix of an "amdgpu-as" MMRA tag,
/// mapped to the SIAtomicAddrSpace value each one selects.
static const StringMap<SIAtomicAddrSpace> ASNames = {{
684+
{"global", SIAtomicAddrSpace::GLOBAL},
685+
{"local", SIAtomicAddrSpace::LDS},
686+
}};
687+
688+
/// Reports that the address space name \p AS found on \p MI is not one of the
/// names listed in ASNames. The message lists every name in ASNames and is
/// emitted with DS_Warning severity through the LLVMContext of the function
/// that contains \p MI.
void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
689+
const MachineFunction *MF = MI.getMF();
690+
const Function &Fn = MF->getFunction();
691+
// Build the message in a stack buffer.
SmallString<128> Str;
692+
raw_svector_ostream OS(Str);
693+
OS << "unknown address space '" << AS << "'; expected one of ";
694+
// LS supplies the separator between successive quoted names.
ListSeparator LS;
695+
for (const auto &[Name, Val] : ASNames)
696+
OS << LS << '\'' << Name << '\'';
697+
DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning);
698+
Fn.getContext().diagnose(BadTag);
699+
}
700+
701+
/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA.
702+
/// If this tag isn't present, or if it has no meaningful values, returns \p
703+
/// Default. Otherwise returns all the address spaces concerned by the MMRA.
/// Suffixes that are not listed in ASNames are reported through
/// diagnoseUnknownMMRAASName and do not contribute to the result.
704+
static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI,
705+
SIAtomicAddrSpace Default) {
706+
static constexpr StringLiteral FenceASPrefix = "amdgpu-as";
707+
708+
auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
709+
// Early out with the caller-provided address spaces when the MMRA wrapper
// tests false (presumably no MMRA attached to MI - confirm against
// MMRAMetadata).
if (!MMRA)
710+
return Default;
711+
712+
// Union of the address spaces named by the recognized "amdgpu-as" tags.
SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
713+
for (const auto &[Prefix, Suffix] : MMRA) {
714+
// Tags with any other prefix are not handled here.
if (Prefix != FenceASPrefix)
715+
continue;
716+
717+
if (auto It = ASNames.find(Suffix); It != ASNames.end())
718+
Result |= It->second;
719+
else
720+
diagnoseUnknownMMRAASName(MI, Suffix);
721+
}
722+
723+
// If no recognized name was found, fall back to Default rather than
// returning an empty set.
return (Result != SIAtomicAddrSpace::NONE) ? Result : Default;
724+
}
725+
681726
} // end namespace anonymous
682727

683728
void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -2535,12 +2580,17 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
25352580
AtomicPseudoMIs.push_back(MI);
25362581
bool Changed = false;
25372582

2583+
// Refine fenced address space based on MMRAs.
2584+
//
2585+
// TODO: Should we support this MMRA on other atomic operations?
2586+
auto OrderingAddrSpace =
2587+
getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace());
2588+
25382589
if (MOI.isAtomic()) {
25392590
if (MOI.getOrdering() == AtomicOrdering::Acquire)
2540-
Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2541-
SIMemOp::LOAD | SIMemOp::STORE,
2542-
MOI.getIsCrossAddressSpaceOrdering(),
2543-
Position::BEFORE);
2591+
Changed |= CC->insertWait(
2592+
MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2593+
MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE);
25442594

25452595
if (MOI.getOrdering() == AtomicOrdering::Release ||
25462596
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
@@ -2552,8 +2602,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
25522602
/// generate a fence. Could add support in this file for
25532603
/// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
25542604
/// adding S_WAITCNT before a S_BARRIER.
2555-
Changed |= CC->insertRelease(MI, MOI.getScope(),
2556-
MOI.getOrderingAddrSpace(),
2605+
Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
25572606
MOI.getIsCrossAddressSpaceOrdering(),
25582607
Position::BEFORE);
25592608

@@ -2565,8 +2614,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
25652614
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
25662615
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
25672616
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2568-
Changed |= CC->insertAcquire(MI, MOI.getScope(),
2569-
MOI.getOrderingAddrSpace(),
2617+
Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
25702618
Position::BEFORE);
25712619

25722620
return Changed;

0 commit comments

Comments
 (0)