-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Add amdgpu-as MMRA for fences #78572
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a2aec99
9191488
eb7d410
8a490d0
eade534
c6ba3da
a05d4cd
5fe2210
cd7df2f
e18a663
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,6 +57,7 @@ | |
#include "llvm/IR/IntrinsicsX86.h" | ||
#include "llvm/IR/MDBuilder.h" | ||
#include "llvm/IR/MatrixBuilder.h" | ||
#include "llvm/IR/MemoryModelRelaxationAnnotations.h" | ||
#include "llvm/Support/ConvertUTF.h" | ||
#include "llvm/Support/MathExtras.h" | ||
#include "llvm/Support/ScopedPrinter.h" | ||
|
@@ -18327,6 +18328,29 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, | |
return nullptr; | ||
} | ||
|
||
void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, | ||
const CallExpr *E) { | ||
constexpr const char *Tag = "amdgpu-as"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just bikeshedding a bit, but do we really need the "amdgpu" prefix on the tag? Clang will only generate these for AMDGPU anyway. It's not a blocker, but feels like we are being cautious for no reason. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it's just more readable this way, but I have no strong preference either. |
||
|
||
LLVMContext &Ctx = Inst->getContext(); | ||
SmallVector<MMRAMetadata::TagT, 3> MMRAs; | ||
for (unsigned K = 2; K < E->getNumArgs(); ++K) { | ||
llvm::Value *V = EmitScalarExpr(E->getArg(K)); | ||
StringRef AS; | ||
if (llvm::getConstantStringInfo(V, AS)) { | ||
MMRAs.push_back({Tag, AS}); | ||
// TODO: Delete the resulting unused constant? | ||
continue; | ||
} | ||
CGM.Error(E->getExprLoc(), | ||
"expected an address space name as a string literal"); | ||
} | ||
|
||
llvm::sort(MMRAs); | ||
MMRAs.erase(llvm::unique(MMRAs), MMRAs.end()); | ||
Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs)); | ||
} | ||
|
||
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, | ||
const CallExpr *E) { | ||
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; | ||
|
@@ -18997,7 +19021,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, | |
case AMDGPU::BI__builtin_amdgcn_fence: { | ||
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)), | ||
EmitScalarExpr(E->getArg(1)), AO, SSID); | ||
return Builder.CreateFence(AO, SSID); | ||
FenceInst *Fence = Builder.CreateFence(AO, SSID); | ||
if (E->getNumArgs() > 2) | ||
AddAMDGPUFenceAddressSpaceMMRA(Fence, E); | ||
return Fence; | ||
} | ||
case AMDGPU::BI__builtin_amdgcn_atomic_inc32: | ||
case AMDGPU::BI__builtin_amdgcn_atomic_inc64: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,22 +1,111 @@ | ||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 | ||
// REQUIRES: amdgpu-registered-target | ||
// RUN: %clang_cc1 %s -emit-llvm -O0 -o - \ | ||
// RUN: -triple=amdgcn-amd-amdhsa | opt -S | FileCheck %s | ||
// RUN: -triple=amdgcn-amd-amdhsa | FileCheck %s | ||
|
||
// CHECK-LABEL: define dso_local void @_Z25test_memory_fence_successv( | ||
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: fence syncscope("workgroup") seq_cst | ||
// CHECK-NEXT: fence syncscope("agent") acquire | ||
// CHECK-NEXT: fence seq_cst | ||
// CHECK-NEXT: fence syncscope("agent") acq_rel | ||
// CHECK-NEXT: fence syncscope("workgroup") release | ||
// CHECK-NEXT: ret void | ||
// | ||
void test_memory_fence_success() { | ||
// CHECK-LABEL: test_memory_fence_success | ||
|
||
// CHECK: fence syncscope("workgroup") seq_cst | ||
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); | ||
|
||
// CHECK: fence syncscope("agent") acquire | ||
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); | ||
|
||
// CHECK: fence seq_cst | ||
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, ""); | ||
|
||
// CHECK: fence syncscope("agent") acq_rel | ||
__builtin_amdgcn_fence(4, "agent"); | ||
|
||
// CHECK: fence syncscope("workgroup") release | ||
__builtin_amdgcn_fence(3, "workgroup"); | ||
} | ||
|
||
// CHECK-LABEL: define dso_local void @_Z10test_localv( | ||
// CHECK-SAME: ) #[[ATTR0]] { | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3:![0-9]+]] | ||
// CHECK-NEXT: fence syncscope("agent") acquire, !mmra [[META3]] | ||
// CHECK-NEXT: fence seq_cst, !mmra [[META3]] | ||
// CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]] | ||
// CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META3]] | ||
// CHECK-NEXT: ret void | ||
// | ||
void test_local() { | ||
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local"); | ||
|
||
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent", "local"); | ||
|
||
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "", "local"); | ||
|
||
__builtin_amdgcn_fence(4, "agent", "local"); | ||
|
||
__builtin_amdgcn_fence(3, "workgroup", "local"); | ||
} | ||
|
||
|
||
// CHECK-LABEL: define dso_local void @_Z11test_globalv( | ||
// CHECK-SAME: ) #[[ATTR0]] { | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META4:![0-9]+]] | ||
// CHECK-NEXT: fence syncscope("agent") acquire, !mmra [[META4]] | ||
// CHECK-NEXT: fence seq_cst, !mmra [[META4]] | ||
// CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META4]] | ||
// CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META4]] | ||
// CHECK-NEXT: ret void | ||
// | ||
void test_global() { | ||
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "global"); | ||
|
||
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent", "global"); | ||
|
||
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "", "global"); | ||
|
||
__builtin_amdgcn_fence(4, "agent", "global"); | ||
|
||
__builtin_amdgcn_fence(3, "workgroup", "global"); | ||
} | ||
|
||
// CHECK-LABEL: define dso_local void @_Z10test_imagev( | ||
// CHECK-SAME: ) #[[ATTR0]] { | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3]] | ||
// CHECK-NEXT: fence syncscope("agent") acquire, !mmra [[META3]] | ||
// CHECK-NEXT: fence seq_cst, !mmra [[META3]] | ||
// CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]] | ||
// CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META3]] | ||
// CHECK-NEXT: ret void | ||
// | ||
void test_image() { | ||
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local"); | ||
|
||
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent", "local"); | ||
|
||
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "", "local"); | ||
|
||
__builtin_amdgcn_fence(4, "agent", "local"); | ||
|
||
__builtin_amdgcn_fence(3, "workgroup", "local"); | ||
} | ||
|
||
// CHECK-LABEL: define dso_local void @_Z10test_mixedv( | ||
// CHECK-SAME: ) #[[ATTR0]] { | ||
// CHECK-NEXT: entry: | ||
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5:![0-9]+]] | ||
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5]] | ||
// CHECK-NEXT: ret void | ||
// | ||
void test_mixed() { | ||
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local", "global"); | ||
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local", "local", "global", "local", "local"); | ||
} | ||
Pierre-vh marked this conversation as resolved.
Show resolved
Hide resolved
|
||
//. | ||
// CHECK: [[META3]] = !{!"amdgpu-as", !"local"} | ||
// CHECK: [[META4]] = !{!"amdgpu-as", !"global"} | ||
// CHECK: [[META5]] = !{[[META4]], [[META3]]} | ||
//. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,9 +18,11 @@ | |
#include "GCNSubtarget.h" | ||
#include "MCTargetDesc/AMDGPUMCTargetDesc.h" | ||
#include "llvm/ADT/BitmaskEnum.h" | ||
#include "llvm/ADT/StringExtras.h" | ||
#include "llvm/CodeGen/MachineBasicBlock.h" | ||
#include "llvm/CodeGen/MachineFunctionPass.h" | ||
#include "llvm/IR/DiagnosticInfo.h" | ||
#include "llvm/IR/MemoryModelRelaxationAnnotations.h" | ||
#include "llvm/Support/AtomicOrdering.h" | ||
#include "llvm/TargetParser/TargetParser.h" | ||
|
||
|
@@ -678,6 +680,49 @@ class SIMemoryLegalizer final : public MachineFunctionPass { | |
bool runOnMachineFunction(MachineFunction &MF) override; | ||
}; | ||
|
||
static const StringMap<SIAtomicAddrSpace> ASNames = {{ | ||
{"global", SIAtomicAddrSpace::GLOBAL}, | ||
{"local", SIAtomicAddrSpace::LDS}, | ||
}}; | ||
|
||
void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) { | ||
const MachineFunction *MF = MI.getMF(); | ||
const Function &Fn = MF->getFunction(); | ||
SmallString<128> Str; | ||
raw_svector_ostream OS(Str); | ||
OS << "unknown address space '" << AS << "'; expected one of "; | ||
ListSeparator LS; | ||
for (const auto &[Name, Val] : ASNames) | ||
OS << LS << '\'' << Name << '\''; | ||
DiagnosticInfoUnsupported BadTag(Fn, Str.str(), MI.getDebugLoc(), DS_Warning); | ||
Fn.getContext().diagnose(BadTag); | ||
} | ||
|
||
/// Reads \p MI's MMRAs to parse the "amdgpu-as" MMRA. | ||
/// If this tag isn't present, or if it has no meaningful values, returns \p | ||
/// Default. Otherwise returns all the address spaces concerned by the MMRA. | ||
static SIAtomicAddrSpace getFenceAddrSpaceMMRA(const MachineInstr &MI, | ||
SIAtomicAddrSpace Default) { | ||
static constexpr StringLiteral FenceASPrefix = "amdgpu-as"; | ||
|
||
auto MMRA = MMRAMetadata(MI.getMMRAMetadata()); | ||
if (!MMRA) | ||
return Default; | ||
|
||
SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE; | ||
for (const auto &[Prefix, Suffix] : MMRA) { | ||
if (Prefix != FenceASPrefix) | ||
continue; | ||
|
||
if (auto It = ASNames.find(Suffix); It != ASNames.end()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wow, I have never considered using a semicolon like this before. But it makes so much sense! :) |
||
Result |= It->second; | ||
else | ||
diagnoseUnknownMMRAASName(MI, Suffix); | ||
} | ||
|
||
return (Result != SIAtomicAddrSpace::NONE) ? Result : Default; | ||
} | ||
|
||
} // end namespace anonymous | ||
|
||
void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, | ||
|
@@ -2535,12 +2580,17 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, | |
AtomicPseudoMIs.push_back(MI); | ||
bool Changed = false; | ||
|
||
// Refine fenced address space based on MMRAs. | ||
// | ||
// TODO: Should we support this MMRA on other atomic operations? | ||
auto OrderingAddrSpace = | ||
getFenceAddrSpaceMMRA(*MI, MOI.getOrderingAddrSpace()); | ||
|
||
if (MOI.isAtomic()) { | ||
if (MOI.getOrdering() == AtomicOrdering::Acquire) | ||
Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), | ||
SIMemOp::LOAD | SIMemOp::STORE, | ||
MOI.getIsCrossAddressSpaceOrdering(), | ||
Position::BEFORE); | ||
Changed |= CC->insertWait( | ||
MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE, | ||
MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); | ||
|
||
if (MOI.getOrdering() == AtomicOrdering::Release || | ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease || | ||
|
@@ -2552,8 +2602,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, | |
/// generate a fence. Could add support in this file for | ||
/// barrier. SIInsertWaitcnt.cpp could then stop unconditionally | ||
/// adding S_WAITCNT before a S_BARRIER. | ||
Changed |= CC->insertRelease(MI, MOI.getScope(), | ||
MOI.getOrderingAddrSpace(), | ||
Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace, | ||
MOI.getIsCrossAddressSpaceOrdering(), | ||
Position::BEFORE); | ||
|
||
|
@@ -2565,8 +2614,7 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, | |
if (MOI.getOrdering() == AtomicOrdering::Acquire || | ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease || | ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) | ||
Changed |= CC->insertAcquire(MI, MOI.getScope(), | ||
MOI.getOrderingAddrSpace(), | ||
Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace, | ||
Position::BEFORE); | ||
|
||
return Changed; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We randomly change between HSA and OpenCL terminology. Maybe we should call "local" "groupsegment"? I guess the ISA manuals call it "local data share"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wasusing the OpenCL terminology: https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/atomic_work_item_fence.html
Thinking about it, maybe it's better to use names consistent with our AMDGPUAS definitions, so image would just be "private" but local/global stay. What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure we can get away without image
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is just for internal use right now, so we can use anything we want. Device libs can then map the IMAGE flag to a "private" operand here.