Skip to content

Commit 36bcf92

Browse files
committed
[RFC][AMDGPU] Add OpenCL-specific fence address space masks
Using MMRAs (Memory Model Relaxation Annotations), implement `__builtin_amdgcn_fence_opencl` to allow device libs to emit fences that only target one or more address spaces, instead of fencing all address spaces at once.
1 parent 43d76ba commit 36bcf92

File tree

9 files changed

+3973
-9
lines changed

9 files changed

+3973
-9
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n")
6868
BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n")
6969
BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n")
7070
BUILTIN(__builtin_amdgcn_fence, "vUicC*", "n")
71+
BUILTIN(__builtin_amdgcn_fence_opencl, "vUiUicC*", "n")
7172
BUILTIN(__builtin_amdgcn_groupstaticsize, "Ui", "n")
7273
BUILTIN(__builtin_amdgcn_wavefrontsize, "Ui", "nc")
7374

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
#include "llvm/IR/IntrinsicsX86.h"
5757
#include "llvm/IR/MDBuilder.h"
5858
#include "llvm/IR/MatrixBuilder.h"
59+
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
5960
#include "llvm/Support/ConvertUTF.h"
6061
#include "llvm/Support/MathExtras.h"
6162
#include "llvm/Support/ScopedPrinter.h"
@@ -18319,6 +18320,26 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
1831918320
return nullptr;
1832018321
}
1832118322

18323+
void CodeGenFunction::AddAMDGCNAddressSpaceMMRA(llvm::Instruction *Inst,
18324+
llvm::Value *ASMask) {
18325+
constexpr const char *Tag = "opencl-fence-mem";
18326+
18327+
uint64_t Mask = cast<llvm::ConstantInt>(ASMask)->getZExtValue();
18328+
if (Mask == 0)
18329+
return;
18330+
18331+
// 3 bits can be set: local, global, image in that order.
18332+
LLVMContext &Ctx = Inst->getContext();
18333+
SmallVector<MMRAMetadata::TagT, 3> MMRAs;
18334+
if (Mask & (1 << 0))
18335+
MMRAs.push_back({Tag, "local"});
18336+
if (Mask & (1 << 1))
18337+
MMRAs.push_back({Tag, "global"});
18338+
if (Mask & (1 << 2))
18339+
MMRAs.push_back({Tag, "image"});
18340+
Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
18341+
}
18342+
1832218343
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1832318344
const CallExpr *E) {
1832418345
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -18991,6 +19012,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1899119012
EmitScalarExpr(E->getArg(1)), AO, SSID);
1899219013
return Builder.CreateFence(AO, SSID);
1899319014
}
19015+
case AMDGPU::BI__builtin_amdgcn_fence_opencl: {
19016+
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(1)),
19017+
EmitScalarExpr(E->getArg(2)), AO, SSID);
19018+
FenceInst *Fence = Builder.CreateFence(AO, SSID);
19019+
AddAMDGCNAddressSpaceMMRA(Fence, EmitScalarExpr(E->getArg(0)));
19020+
return Fence;
19021+
}
1899419022
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1899519023
case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1899619024
case AMDGPU::BI__builtin_amdgcn_atomic_dec32:

clang/lib/CodeGen/CodeGenFunction.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4542,6 +4542,8 @@ class CodeGenFunction : public CodeGenTypeCache {
45424542
llvm::Value *EmitHexagonBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
45434543
llvm::Value *EmitRISCVBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
45444544
ReturnValueSlot ReturnValue);
4545+
4546+
void AddAMDGCNAddressSpaceMMRA(llvm::Instruction *Inst, llvm::Value *ASMask);
45454547
void ProcessOrderScopeAMDGCN(llvm::Value *Order, llvm::Value *Scope,
45464548
llvm::AtomicOrdering &AO,
45474549
llvm::SyncScope::ID &SSID);

clang/lib/Sema/SemaChecking.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5681,6 +5681,10 @@ bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
56815681
OrderIndex = 0;
56825682
ScopeIndex = 1;
56835683
break;
5684+
case AMDGPU::BI__builtin_amdgcn_fence_opencl:
5685+
OrderIndex = 1;
5686+
ScopeIndex = 2;
5687+
break;
56845688
default:
56855689
return false;
56865690
}
@@ -5703,7 +5707,8 @@ bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
57035707
switch (static_cast<llvm::AtomicOrderingCABI>(Ord)) {
57045708
case llvm::AtomicOrderingCABI::relaxed:
57055709
case llvm::AtomicOrderingCABI::consume:
5706-
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_fence)
5710+
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_fence ||
5711+
BuiltinID == AMDGPU::BI__builtin_amdgcn_fence_opencl)
57075712
return Diag(ArgExpr->getBeginLoc(),
57085713
diag::warn_atomic_op_has_invalid_memory_order)
57095714
<< 0 << ArgExpr->getSourceRange();
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --version 3
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 %s -emit-llvm -O0 -o - \
4+
// RUN: -triple=amdgcn-amd-amdhsa -Qn -mcode-object-version=none | FileCheck %s
5+
6+
#define LOCAL_MASK (1 << 0)
7+
#define GLOBAL_MASK (1 << 1)
8+
#define IMAGE_MASK (1 << 2)
9+
10+
//.
11+
// CHECK: @.str = private unnamed_addr addrspace(4) constant [10 x i8] c"workgroup\00", align 1
12+
// CHECK: @.str.1 = private unnamed_addr addrspace(4) constant [6 x i8] c"agent\00", align 1
13+
// CHECK: @.str.2 = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1
14+
//.
15+
// CHECK-LABEL: define dso_local void @_Z10test_localv(
16+
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
17+
// CHECK-NEXT: entry:
18+
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META1:![0-9]+]]
19+
// CHECK-NEXT: fence syncscope("agent") acquire, !mmra [[META1]]
20+
// CHECK-NEXT: fence seq_cst, !mmra [[META1]]
21+
// CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META1]]
22+
// CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META1]]
23+
// CHECK-NEXT: ret void
24+
//
25+
void test_local() {
26+
27+
__builtin_amdgcn_fence_opencl(LOCAL_MASK, __ATOMIC_SEQ_CST, "workgroup");
28+
29+
__builtin_amdgcn_fence_opencl(LOCAL_MASK,__ATOMIC_ACQUIRE, "agent");
30+
31+
__builtin_amdgcn_fence_opencl(LOCAL_MASK,__ATOMIC_SEQ_CST, "");
32+
33+
__builtin_amdgcn_fence_opencl(LOCAL_MASK, 4, "agent");
34+
35+
__builtin_amdgcn_fence_opencl(LOCAL_MASK, 3, "workgroup");
36+
}
37+
38+
// CHECK-LABEL: define dso_local void @_Z11test_globalv(
39+
// CHECK-SAME: ) #[[ATTR0]] {
40+
// CHECK-NEXT: entry:
41+
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META2:![0-9]+]]
42+
// CHECK-NEXT: fence syncscope("agent") acquire, !mmra [[META2]]
43+
// CHECK-NEXT: fence seq_cst, !mmra [[META2]]
44+
// CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META2]]
45+
// CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META2]]
46+
// CHECK-NEXT: ret void
47+
//
48+
void test_global() {
49+
50+
__builtin_amdgcn_fence_opencl(GLOBAL_MASK, __ATOMIC_SEQ_CST, "workgroup");
51+
52+
__builtin_amdgcn_fence_opencl(GLOBAL_MASK,__ATOMIC_ACQUIRE, "agent");
53+
54+
__builtin_amdgcn_fence_opencl(GLOBAL_MASK,__ATOMIC_SEQ_CST, "");
55+
56+
__builtin_amdgcn_fence_opencl(GLOBAL_MASK, 4, "agent");
57+
58+
__builtin_amdgcn_fence_opencl(GLOBAL_MASK, 3, "workgroup");
59+
}
60+
61+
// CHECK-LABEL: define dso_local void @_Z10test_imagev(
62+
// CHECK-SAME: ) #[[ATTR0]] {
63+
// CHECK-NEXT: entry:
64+
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3:![0-9]+]]
65+
// CHECK-NEXT: fence syncscope("agent") acquire, !mmra [[META3]]
66+
// CHECK-NEXT: fence seq_cst, !mmra [[META2]]
67+
// CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]]
68+
// CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META3]]
69+
// CHECK-NEXT: ret void
70+
//
71+
void test_image() {
72+
73+
__builtin_amdgcn_fence_opencl(IMAGE_MASK, __ATOMIC_SEQ_CST, "workgroup");
74+
75+
__builtin_amdgcn_fence_opencl(IMAGE_MASK,__ATOMIC_ACQUIRE, "agent");
76+
77+
__builtin_amdgcn_fence_opencl(GLOBAL_MASK,__ATOMIC_SEQ_CST, "");
78+
79+
__builtin_amdgcn_fence_opencl(IMAGE_MASK, 4, "agent");
80+
81+
__builtin_amdgcn_fence_opencl(IMAGE_MASK, 3, "workgroup");
82+
}
83+
84+
// CHECK-LABEL: define dso_local void @_Z10test_mixedv(
85+
// CHECK-SAME: ) #[[ATTR0]] {
86+
// CHECK-NEXT: entry:
87+
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META4:![0-9]+]]
88+
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5:![0-9]+]]
89+
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5]]
90+
// CHECK-NEXT: ret void
91+
//
92+
void test_mixed() {
93+
94+
__builtin_amdgcn_fence_opencl(IMAGE_MASK | GLOBAL_MASK, __ATOMIC_SEQ_CST, "workgroup");
95+
__builtin_amdgcn_fence_opencl(IMAGE_MASK | GLOBAL_MASK | LOCAL_MASK, __ATOMIC_SEQ_CST, "workgroup");
96+
97+
__builtin_amdgcn_fence_opencl(0xFF,__ATOMIC_SEQ_CST, "workgroup");
98+
}
99+
//.
100+
// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
101+
//.
102+
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
103+
// CHECK: [[META1]] = !{!"opencl-fence-mem", !"local"}
104+
// CHECK: [[META2]] = !{!"opencl-fence-mem", !"global"}
105+
// CHECK: [[META3]] = !{!"opencl-fence-mem", !"image"}
106+
// CHECK: [[META4]] = !{[[META2]], [[META3]]}
107+
// CHECK: [[META5]] = !{[[META1]], [[META2]], [[META3]]}
108+
//.

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "llvm/CodeGen/MachineBasicBlock.h"
2222
#include "llvm/CodeGen/MachineFunctionPass.h"
2323
#include "llvm/IR/DiagnosticInfo.h"
24+
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
2425
#include "llvm/Support/AtomicOrdering.h"
2526
#include "llvm/TargetParser/TargetParser.h"
2627

@@ -2535,12 +2536,29 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
25352536
AtomicPseudoMIs.push_back(MI);
25362537
bool Changed = false;
25372538

2539+
// Refine based on MMRAs. They can override the OrderingAddrSpace
2540+
auto OrderingAddrSpace = MOI.getOrderingAddrSpace();
2541+
2542+
// TODO: Use an enum/parse this sooner?
2543+
// TODO: Do we need to handle these MMRAs on load/stores/atomicrmw as well?
2544+
if (auto MMRA = MMRAMetadata(MI->getMMRAMetadata())) {
2545+
SIAtomicAddrSpace NewAddrSpace = SIAtomicAddrSpace::NONE;
2546+
if (MMRA.hasTag("opencl-fence-mem", "global"))
2547+
NewAddrSpace |= SIAtomicAddrSpace::GLOBAL;
2548+
if (MMRA.hasTag("opencl-fence-mem", "local"))
2549+
NewAddrSpace |= SIAtomicAddrSpace::LDS;
2550+
if (MMRA.hasTag("opencl-fence-mem", "image"))
2551+
NewAddrSpace |= SIAtomicAddrSpace::SCRATCH;
2552+
2553+
if (NewAddrSpace != SIAtomicAddrSpace::NONE)
2554+
OrderingAddrSpace = NewAddrSpace;
2555+
}
2556+
25382557
if (MOI.isAtomic()) {
25392558
if (MOI.getOrdering() == AtomicOrdering::Acquire)
2540-
Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2541-
SIMemOp::LOAD | SIMemOp::STORE,
2542-
MOI.getIsCrossAddressSpaceOrdering(),
2543-
Position::BEFORE);
2559+
Changed |= CC->insertWait(
2560+
MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2561+
MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE);
25442562

25452563
if (MOI.getOrdering() == AtomicOrdering::Release ||
25462564
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
@@ -2552,8 +2570,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
25522570
/// generate a fence. Could add support in this file for
25532571
/// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
25542572
/// adding S_WAITCNT before a S_BARRIER.
2555-
Changed |= CC->insertRelease(MI, MOI.getScope(),
2556-
MOI.getOrderingAddrSpace(),
2573+
Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
25572574
MOI.getIsCrossAddressSpaceOrdering(),
25582575
Position::BEFORE);
25592576

@@ -2565,8 +2582,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
25652582
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
25662583
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
25672584
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2568-
Changed |= CC->insertAcquire(MI, MOI.getScope(),
2569-
MOI.getOrderingAddrSpace(),
2585+
Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
25702586
Position::BEFORE);
25712587

25722588
return Changed;

0 commit comments

Comments
 (0)