Skip to content

Commit 7d73a7a

Browse files
committed
[NVPTX] Load/Store syncscope support
1 parent 7c188ab commit 7d73a7a

14 files changed

+5259
-731
lines changed

llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp

Lines changed: 42 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -233,46 +233,68 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
233233
auto Ordering = NVPTX::Ordering(Imm);
234234
switch (Ordering) {
235235
case NVPTX::Ordering::NotAtomic:
236-
break;
237-
case NVPTX::Ordering::Volatile:
238-
O << ".volatile";
239-
break;
236+
return;
240237
case NVPTX::Ordering::Relaxed:
241-
O << ".relaxed.sys";
242-
break;
238+
O << ".relaxed";
239+
return;
243240
case NVPTX::Ordering::Acquire:
244-
O << ".acquire.sys";
245-
break;
241+
O << ".acquire";
242+
return;
246243
case NVPTX::Ordering::Release:
247-
O << ".release.sys";
248-
break;
244+
O << ".release";
245+
return;
246+
case NVPTX::Ordering::Volatile:
247+
O << ".volatile";
248+
return;
249249
case NVPTX::Ordering::RelaxedMMIO:
250-
O << ".mmio.relaxed.sys";
251-
break;
250+
O << ".mmio.relaxed";
251+
return;
252252
default:
253253
report_fatal_error(formatv(
254-
"NVPTX LdStCode Printer does not support \"{}\" sem modifier.",
255-
OrderingToCString(Ordering)));
254+
"NVPTX LdStCode Printer does not support \"{}\" sem modifier. "
255+
"Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.",
256+
OrderingToString(Ordering)));
257+
}
258+
} else if (!strcmp(Modifier, "sco")) {
259+
auto S = NVPTX::Scope(Imm);
260+
switch (S) {
261+
case NVPTX::Scope::Thread:
262+
return;
263+
case NVPTX::Scope::System:
264+
O << ".sys";
265+
return;
266+
case NVPTX::Scope::Block:
267+
O << ".cta";
268+
return;
269+
case NVPTX::Scope::Cluster:
270+
O << ".cluster";
271+
return;
272+
case NVPTX::Scope::Device:
273+
O << ".gpu";
274+
return;
256275
}
276+
report_fatal_error(formatv(
277+
"NVPTX LdStCode Printer does not support \"{}\" sco modifier.",
278+
ScopeToString(S)));
257279
} else if (!strcmp(Modifier, "addsp")) {
258280
switch (Imm) {
259281
case NVPTX::PTXLdStInstCode::GLOBAL:
260282
O << ".global";
261-
break;
283+
return;
262284
case NVPTX::PTXLdStInstCode::SHARED:
263285
O << ".shared";
264-
break;
286+
return;
265287
case NVPTX::PTXLdStInstCode::LOCAL:
266288
O << ".local";
267-
break;
289+
return;
268290
case NVPTX::PTXLdStInstCode::PARAM:
269291
O << ".param";
270-
break;
292+
return;
271293
case NVPTX::PTXLdStInstCode::CONSTANT:
272294
O << ".const";
273-
break;
295+
return;
274296
case NVPTX::PTXLdStInstCode::GENERIC:
275-
break;
297+
return;
276298
default:
277299
llvm_unreachable("Wrong Address Space");
278300
}

llvm/lib/Target/NVPTX/NVPTX.h

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -117,12 +117,22 @@ enum Ordering : OrderingUnderlyingType {
117117
// Consume = 3, // Unimplemented in LLVM; NVPTX would map to "Acquire"
118118
Acquire = (OrderingUnderlyingType)AtomicOrdering::Acquire,
119119
Release = (OrderingUnderlyingType)AtomicOrdering::Release,
120-
// AcquireRelease = 6, // TODO
120+
AcquireRelease = (OrderingUnderlyingType)AtomicOrdering::AcquireRelease,
121121
SequentiallyConsistent =
122122
(OrderingUnderlyingType)AtomicOrdering::SequentiallyConsistent,
123123
Volatile = SequentiallyConsistent + 1,
124124
RelaxedMMIO = Volatile + 1,
125-
LAST = RelaxedMMIO
125+
LASTORDERING = RelaxedMMIO
126+
};
127+
128+
// Integer type underlying the NVPTX::Scope enumeration.
using ScopeUnderlyingType = unsigned int;
129+
// Synchronization scopes attached to NVPTX memory operations and fences.
// The instruction printer's "sco" modifier casts its immediate to this enum
// and prints nothing for Thread, ".sys" for System, ".cta" for Block,
// ".cluster" for Cluster and ".gpu" for Device.
enum Scope : ScopeUnderlyingType {
130+
Thread = 0,
131+
System = 1,
132+
Block = 2,
133+
Cluster = 3,
134+
Device = 4,
135+
// Alias for the highest-valued scope enumerator.
LASTSCOPE = Device
126136
};
127137

128138
namespace PTXLdStInstCode {

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 156 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,7 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
5959

6060
// Per-function setup for instruction selection: caches the NVPTX subtarget of
// MF, rebuilds the sync-scope lookup table from the LLVMContext that owns the
// function, then delegates to SelectionDAGISel::runOnMachineFunction and
// returns its result.
bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
6161
Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
62+
// Scopes must be populated before selection, since NVPTXScopes::operator[]
// reports a fatal error on an empty table.
Scopes = NVPTXScopes(MF.getFunction().getContext());
6263
return SelectionDAGISel::runOnMachineFunction(MF);
6364
}
6465

@@ -106,6 +107,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
106107
if (tryStore(N))
107108
return;
108109
break;
110+
case ISD::ATOMIC_FENCE:
111+
if (tryFence(N))
112+
return;
113+
break;
109114
case ISD::EXTRACT_VECTOR_ELT:
110115
if (tryEXTRACT_VECTOR_ELEMENT(N))
111116
return;
@@ -915,6 +920,42 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
915920

916921
} // namespace
917922

923+
// Computes the NVPTX scope for memory operation N, given the ordering Ord
// already chosen for it.
//
//  - NotAtomic and Volatile orderings return Thread scope.
//  - RelaxedMMIO returns System scope.
//  - Relaxed, Acquire, Release, AcquireRelease and SequentiallyConsistent map
//    N's sync-scope ID through Scopes. A Thread result is a fatal error, a
//    Cluster result calls Subtarget->requireClusters, and a volatile N is
//    then forced to System scope.
NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
924+
NVPTX::Ordering Ord) const {
925+
switch (Ord) {
926+
case NVPTX::Ordering::NotAtomic:
927+
case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
928+
// NVPTX uses Thread scope as the scope of non-atomic operations.
929+
return NVPTX::Scope::Thread;
930+
case NVPTX::Ordering::RelaxedMMIO:
931+
// RelaxedMMIO operations are always system scope.
932+
// If a RelaxedMMIO order was generated from an atomic volatile operation
933+
// with a smaller thread scope, we bump it here to system scope.
934+
return NVPTX::Scope::System;
935+
case NVPTX::Ordering::Relaxed:
936+
case NVPTX::Ordering::Acquire:
937+
case NVPTX::Ordering::Release:
938+
case NVPTX::Ordering::AcquireRelease:
939+
case NVPTX::Ordering::SequentiallyConsistent:
940+
// Translate the IR-level sync-scope ID into an NVPTX scope; the lookup
// itself reports a fatal error for IDs that are not in the table.
auto S = Scopes[N->getSyncScopeID()];
941+
942+
// Atomic operations must have a scope greater than thread.
943+
if (S == NVPTX::Scope::Thread)
944+
report_fatal_error(
945+
formatv("Atomics need scope > \"{}\".", ScopeToString(S)));
946+
947+
// If scope is cluster, clusters must be supported.
// NOTE(review): this check runs before the volatile override below, so a
// volatile cluster-scoped atomic still goes through requireClusters even
// though it is ultimately given system scope — confirm this is intended.
948+
if (S == NVPTX::Scope::Cluster)
949+
Subtarget->requireClusters("cluster scope");
950+
951+
// If operation is volatile, then its scope is system.
952+
if (N->isVolatile())
953+
S = NVPTX::Scope::System;
954+
955+
return S;
956+
}
957+
}
958+
918959
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
919960
unsigned CodeAddrSpace, MachineFunction *F) {
920961
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
@@ -957,33 +998,86 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
957998
});
958999
}
9591000

960-
NVPTX::Ordering NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL,
961-
SDValue &Chain,
962-
MemSDNode *N) {
1001+
// Picks the machine opcode for a fence with ordering O at scope S on
// subtarget T.
//
//  - Acquire, Release and AcquireRelease all select the acq_rel fence opcode
//    for the given scope; SequentiallyConsistent selects the seq_cst one.
//  - For System, Block and Device scope, when T->hasMemoryOrdering() is false
//    the INT_MEMBAR_SYS / INT_MEMBAR_CTA / INT_MEMBAR_GL opcode is returned
//    instead.
//  - Cluster scope first calls T->requireClusters and has no membar
//    alternative in this function.
//  - Thread scope, and the NotAtomic / Relaxed / Volatile / RelaxedMMIO
//    orderings, are reported as fatal errors.
static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
1002+
NVPTXSubtarget const *T) {
1003+
// Checked up front, before the ordering is examined.
if (S == NVPTX::Scope::Cluster)
1004+
T->requireClusters(".cluster scope fence");
1005+
1006+
switch (O) {
1007+
case NVPTX::Ordering::Acquire:
1008+
case NVPTX::Ordering::Release:
1009+
case NVPTX::Ordering::AcquireRelease: {
1010+
switch (S) {
1011+
case NVPTX::Scope::System:
1012+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
1013+
: NVPTX::INT_MEMBAR_SYS;
1014+
case NVPTX::Scope::Block:
1015+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
1016+
: NVPTX::INT_MEMBAR_CTA;
1017+
case NVPTX::Scope::Cluster:
1018+
return NVPTX::atomic_thread_fence_acq_rel_cluster;
1019+
case NVPTX::Scope::Device:
1020+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
1021+
: NVPTX::INT_MEMBAR_GL;
1022+
case NVPTX::Scope::Thread:
1023+
report_fatal_error(
1024+
formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
1025+
ScopeToString(S)));
1026+
}
1027+
// NOTE(review): every inner case above returns or calls report_fatal_error,
// so control presumably never falls through into the next ordering case —
// this relies on report_fatal_error not returning; confirm.
}
1028+
case NVPTX::Ordering::SequentiallyConsistent: {
1029+
switch (S) {
1030+
case NVPTX::Scope::System:
1031+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
1032+
: NVPTX::INT_MEMBAR_SYS;
1033+
case NVPTX::Scope::Block:
1034+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
1035+
: NVPTX::INT_MEMBAR_CTA;
1036+
case NVPTX::Scope::Cluster:
1037+
return NVPTX::atomic_thread_fence_seq_cst_cluster;
1038+
case NVPTX::Scope::Device:
1039+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
1040+
: NVPTX::INT_MEMBAR_GL;
1041+
case NVPTX::Scope::Thread:
1042+
report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.",
1043+
ScopeToString(S)));
1044+
}
1045+
}
1046+
// Orderings for which this function selects no fence opcode.
case NVPTX::Ordering::NotAtomic:
1047+
case NVPTX::Ordering::Relaxed:
1048+
case NVPTX::Ordering::Volatile:
1049+
case NVPTX::Ordering::RelaxedMMIO:
1050+
report_fatal_error(
1051+
formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
1052+
OrderingToString(O), ScopeToString(S)));
1053+
}
1054+
}
1055+
1056+
std::pair<NVPTX::Ordering, NVPTX::Scope>
1057+
NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
1058+
MemSDNode *N) {
9631059
// Some memory instructions - loads, stores, atomics - need an extra fence
9641060
// instruction. Get the memory order of the instruction, and that of its
9651061
// fence, if any.
9661062
auto [InstructionOrdering, FenceOrdering] =
9671063
getOperationOrderings(N, Subtarget);
1064+
auto Scope = getOperationScope(N, InstructionOrdering);
9681065

9691066
// If a fence is required before the operation, insert it:
9701067
switch (NVPTX::Ordering(FenceOrdering)) {
9711068
case NVPTX::Ordering::NotAtomic:
9721069
break;
9731070
case NVPTX::Ordering::SequentiallyConsistent: {
974-
unsigned Op = Subtarget->hasMemoryOrdering()
975-
? NVPTX::atomic_thread_fence_seq_cst_sys
976-
: NVPTX::INT_MEMBAR_SYS;
1071+
auto Op = getFenceOp(FenceOrdering, Scope, Subtarget);
9771072
Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
9781073
break;
9791074
}
9801075
default:
9811076
report_fatal_error(
9821077
formatv("Unexpected fence ordering: \"{}\".",
983-
OrderingToCString(NVPTX::Ordering(FenceOrdering))));
1078+
OrderingToString(NVPTX::Ordering(FenceOrdering))));
9841079
}
985-
986-
return InstructionOrdering;
1080+
return std::make_pair(InstructionOrdering, Scope);
9871081
}
9881082

9891083
bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
@@ -1154,7 +1248,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
11541248

11551249
SDLoc DL(N);
11561250
SDValue Chain = N->getOperand(0);
1157-
auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, LD);
1251+
auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
11581252

11591253
// Type Setting: fromType + fromTypeWidth
11601254
//
@@ -1189,7 +1283,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
11891283
std::optional<unsigned> Opcode;
11901284
MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
11911285

1192-
SmallVector<SDValue, 12> Ops({getI32Imm(InstructionOrdering, DL),
1286+
SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
11931287
getI32Imm(CodeAddrSpace, DL),
11941288
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
11951289
getI32Imm(FromTypeWidth, DL)});
@@ -1266,7 +1360,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
12661360

12671361
SDLoc DL(N);
12681362
SDValue Chain = N->getOperand(0);
1269-
auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD);
1363+
auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
12701364

12711365
// Vector Setting
12721366
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1319,7 +1413,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
13191413
std::optional<unsigned> Opcode;
13201414
SDNode *LD;
13211415

1322-
SmallVector<SDValue, 12> Ops({getI32Imm(InstructionOrdering, DL),
1416+
SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
13231417
getI32Imm(CodeAddrSpace, DL),
13241418
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
13251419
getI32Imm(FromTypeWidth, DL)});
@@ -1895,7 +1989,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
18951989

18961990
SDLoc DL(N);
18971991
SDValue Chain = ST->getChain();
1898-
auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, ST);
1992+
auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
18991993

19001994
// Vector Setting
19011995
MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1923,10 +2017,10 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
19232017
MVT::SimpleValueType SourceVT =
19242018
Value.getNode()->getSimpleValueType(0).SimpleTy;
19252019

1926-
SmallVector<SDValue, 12> Ops({Value, getI32Imm(InstructionOrdering, DL),
1927-
getI32Imm(CodeAddrSpace, DL),
1928-
getI32Imm(VecType, DL), getI32Imm(ToType, DL),
1929-
getI32Imm(ToTypeWidth, DL)});
2020+
SmallVector<SDValue, 12> Ops(
2021+
{Value, getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
2022+
getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
2023+
getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});
19302024

19312025
if (SelectDirectAddr(BasePtr, Addr)) {
19322026
Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
@@ -2005,7 +2099,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
20052099

20062100
SDLoc DL(N);
20072101
SDValue Chain = N->getOperand(0);
2008-
auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD);
2102+
auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
20092103

20102104
// Type Setting: toType + toTypeWidth
20112105
// - for integer type, always use 'u'
@@ -2044,9 +2138,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
20442138
ToTypeWidth = 32;
20452139
}
20462140

2047-
Ops.append({getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL),
2048-
getI32Imm(VecType, DL), getI32Imm(ToType, DL),
2049-
getI32Imm(ToTypeWidth, DL)});
2141+
Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
2142+
getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
2143+
getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});
20502144

20512145
if (SelectDirectAddr(N2, Addr)) {
20522146
switch (N->getOpcode()) {
@@ -4064,3 +4158,43 @@ unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
40644158
}
40654159
}
40664160
}
4161+
4162+
// Selects an ISD::ATOMIC_FENCE node N. Constant operand 1 is interpreted as
// the ordering and constant operand 2 as the sync-scope ID (translated
// through Scopes); getFenceOp turns the pair into a machine opcode. N is then
// replaced by a machine node of that opcode chained on operand 0.
// Always returns true; unsupported ordering/scope combinations are reported
// as fatal errors inside getFenceOp and the Scopes lookup.
bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
4163+
SDLoc DL(N);
4164+
assert(N->getOpcode() == ISD::ATOMIC_FENCE);
4165+
unsigned int FenceOp =
4166+
getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
4167+
Scopes[N->getConstantOperandVal(2)], Subtarget);
4168+
// Operand 0 is used as the incoming chain of the new fence node.
SDValue Chain = N->getOperand(0);
4169+
SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
4170+
ReplaceNode(N, FenceNode);
4171+
return true;
4172+
}
4173+
4174+
// Builds the sync-scope lookup table for context C and keeps a pointer to C
// for later diagnostics. Each name is registered with (or fetched from) the
// context via getOrInsertSyncScopeID and mapped to an NVPTX scope:
//   "singlethread" -> Thread, "" -> System, "block" -> Block,
//   "cluster" -> Cluster, "device" -> Device.
NVPTXScopes::NVPTXScopes(LLVMContext &C) : CTX(&C) {
4175+
Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread;
4176+
Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System;
4177+
Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block;
4178+
Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster;
4179+
Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device;
4180+
}
4181+
4182+
// Returns the NVPTX scope registered for sync-scope ID.
// Reports a fatal error if the table is empty, or if ID has no entry; in the
// latter case the message includes the numeric ID and, when available, the
// scope's name as known to the LLVMContext.
NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
4183+
if (Scopes.empty())
4184+
report_fatal_error("NVPTX Scopes must be initialized before calling "
4185+
"NVPTXScopes::operator[]");
4186+
4187+
auto S = Scopes.find(ID);
4188+
if (S == Scopes.end()) {
4189+
SmallVector<StringRef, 8> ScopeNames;
4190+
assert(CTX != nullptr && "CTX is nullptr");
4191+
CTX->getSyncScopeNames(ScopeNames);
4192+
StringRef Unknown{"unknown"};
4193+
// NOTE(review): presumably getSyncScopeNames fills the vector so that it can
// be indexed by sync-scope ID — confirm against LLVMContext. IDs past the
// end fall back to "unknown".
auto Name = ID < ScopeNames.size() ? ScopeNames[ID] : Unknown;
4194+
report_fatal_error(
4195+
formatv("Could not find scope ID={} with name \"{}\".", int(ID), Name));
4196+
}
4197+
return S->second;
4198+
}
4199+
4200+
// Returns true when no sync-scope mappings have been registered.
bool NVPTXScopes::empty() const { return Scopes.size() == 0; }

0 commit comments

Comments
 (0)