Skip to content

Commit 98449e9

Browse files
committed
[NVPTX] Volta SequentiallyConsistent Load/Store Ops
1 parent 657ba9c commit 98449e9

File tree

5 files changed

+210
-101
lines changed

5 files changed

+210
-101
lines changed

llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,16 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
254254
report_fatal_error(OS.str());
255255
break;
256256
}
257+
} else if (!strcmp(Modifier, "sc")) {
258+
switch (Imm) {
259+
// TODO: move fence insertion into ISelDAGToDAG instead of doing it here
260+
// as part of implementing atomicrmw seq_cst.
261+
case NVPTX::PTXLdStInstCode::SeqCstFence:
262+
O << "fence.sc.sys;\n\t";
263+
break;
264+
default:
265+
break;
266+
}
257267
} else if (!strcmp(Modifier, "addsp")) {
258268
switch (Imm) {
259269
case NVPTX::PTXLdStInstCode::GLOBAL:

llvm/lib/Target/NVPTX/NVPTX.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,8 @@ enum MemorySemantic {
113113
Relaxed = 2,
114114
Acquire = 3,
115115
Release = 4,
116-
RelaxedMMIO = 5
116+
RelaxedMMIO = 5,
117+
SeqCstFence = 6,
117118
};
118119
enum AddressSpace {
119120
GENERIC = 0,

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 71 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -714,21 +714,24 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
714714
return NVPTX::PTXLdStInstCode::GENERIC;
715715
}
716716

717-
static unsigned int getCodeMemorySemantic(MemSDNode *N,
718-
const NVPTXSubtarget *Subtarget) {
717+
// Pair of codes returned by getCodeMemorySemantic(): the memory-semantic code
// for the load/store itself plus an optional sequentially-consistent fence
// code. Callers unpack it with structured bindings
// (`auto [CodeMemorySem, SeqCstFence] = ...`), so the declaration order of the
// two public members below must not change.
struct MemorySemantic {
718+
// Memory-semantic code of the operation. The -1 initializer converts to the
// maximum unsigned value; both constructors overwrite it.
unsigned int sem = -1;
719+
// Fence code that callers pass as an extra i32 immediate operand. It stays at
// the -1 (maximum unsigned) default unless the two-argument constructor is
// used; presumably that default means "no fence requested" -- TODO confirm
// against the instruction printer.
unsigned int sc_fence = -1;
720+
// Intentionally non-explicit: builds a result with only `sem` set, leaving
// `sc_fence` at its default.
MemorySemantic(unsigned int s) : sem(s) {}
721+
// Builds a result carrying both the semantic code `s` and the fence code `f`.
MemorySemantic(unsigned int s, unsigned int f) : sem(s), sc_fence(f) {}
722+
};
723+
724+
static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
725+
const NVPTXSubtarget *Subtarget) {
719726
AtomicOrdering Ordering = N->getSuccessOrdering();
720727
auto CodeAddrSpace = getCodeAddrSpace(N);
721728

722729
bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
723730
bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
724731

725-
// TODO: lowering for SequentiallyConsistent Operations: for now, we error.
726-
// TODO: lowering for AcquireRelease Operations: for now, we error.
727-
//
728-
729732
// clang-format off
730733

731-
// Lowering for non-SequentiallyConsistent Operations
734+
// Lowering for Load/Store Operations (note: AcquireRelease loads and stores are rejected with an error).
732735
//
733736
// | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
734737
// |---------|----------|--------------------|------------|------------------------------|
@@ -749,6 +752,18 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
749752
// | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
750753
// | | | / Global [0] | | |
751754

755+
// Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
756+
// by following the ABI proven sound in:
757+
// Lustig et al., A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
758+
// https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
759+
//
760+
// | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
761+
// |-----------------------------------------------------------------------------|-----------------------------------------|
762+
// | cuda::atomic_thread_fence(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; |
763+
// | cuda::atomic_load(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; ld.acquire.<scope>; |
764+
// | cuda::atomic_store(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; st.release.<scope>; |
765+
// | cuda::atomic_fetch_<op>(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; atom.acq_rel.<scope>; |
766+
752767
// clang-format on
753768

754769
// [0]: volatile and atomics are only supported on global or shared
@@ -788,7 +803,6 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
788803
// - the "weak" memory instruction we are currently lowering to, and
789804
// - some other instruction that preserves the side-effect, e.g.,
790805
// a dead dummy volatile load.
791-
792806
if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
793807
CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
794808
CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
@@ -870,16 +884,32 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
870884
N->print(OS);
871885
report_fatal_error(OS.str());
872886
}
873-
case AtomicOrdering::SequentiallyConsistent:
874-
// TODO: support AcquireRelease and SequentiallyConsistent
875-
SmallString<256> Msg;
876-
raw_svector_ostream OS(Msg);
877-
OS << "NVPTX backend does not support AtomicOrdering \""
878-
<< toIRString(Ordering) << "\" yet.";
879-
report_fatal_error(OS.str());
887+
case AtomicOrdering::SequentiallyConsistent: {
888+
unsigned int sem;
889+
if (N->readMem()) {
890+
sem = NVPTX::PTXLdStInstCode::Acquire;
891+
} else if (N->writeMem()) {
892+
sem = NVPTX::PTXLdStInstCode::Release;
893+
} else {
894+
SmallString<256> Msg;
895+
raw_svector_ostream OS(Msg);
896+
OS << "NVPTX does not support SequentiallyConsistent Ordering on "
897+
"read-modify-writes yet: "
898+
<< N->getOperationName();
899+
N->print(OS);
900+
report_fatal_error(OS.str());
901+
}
902+
return addrGenericOrGlobalOrShared
903+
? MemorySemantic(sem, NVPTX::PTXLdStInstCode::SeqCstFence)
904+
: MemorySemantic(NVPTX::PTXLdStInstCode::NotAtomic);
905+
}
880906
}
881907

882-
llvm_unreachable("unexpected unhandled case");
908+
SmallString<256> Msg;
909+
raw_svector_ostream OS(Msg);
910+
OS << "NVPTX backend does not support AtomicOrdering \""
911+
<< toIRString(Ordering) << "\" yet.";
912+
report_fatal_error(OS.str());
883913
}
884914

885915
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
@@ -1091,7 +1121,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
10911121
}
10921122

10931123
// Memory Semantic Setting
1094-
unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget);
1124+
auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(LD, Subtarget);
10951125

10961126
unsigned int PointerSize =
10971127
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
@@ -1136,7 +1166,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
11361166
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
11371167
if (!Opcode)
11381168
return false;
1139-
SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
1169+
SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
1170+
getI32Imm(CodeMemorySem, dl),
11401171
getI32Imm(CodeAddrSpace, dl),
11411172
getI32Imm(vecType, dl),
11421173
getI32Imm(fromType, dl),
@@ -1151,7 +1182,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
11511182
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
11521183
if (!Opcode)
11531184
return false;
1154-
SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
1185+
SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
1186+
getI32Imm(CodeMemorySem, dl),
11551187
getI32Imm(CodeAddrSpace, dl),
11561188
getI32Imm(vecType, dl),
11571189
getI32Imm(fromType, dl),
@@ -1173,7 +1205,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
11731205
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
11741206
if (!Opcode)
11751207
return false;
1176-
SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
1208+
SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
1209+
getI32Imm(CodeMemorySem, dl),
11771210
getI32Imm(CodeAddrSpace, dl),
11781211
getI32Imm(vecType, dl),
11791212
getI32Imm(fromType, dl),
@@ -1194,7 +1227,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
11941227
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
11951228
if (!Opcode)
11961229
return false;
1197-
SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
1230+
SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
1231+
getI32Imm(CodeMemorySem, dl),
11981232
getI32Imm(CodeAddrSpace, dl),
11991233
getI32Imm(vecType, dl),
12001234
getI32Imm(fromType, dl),
@@ -1238,7 +1272,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
12381272
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
12391273

12401274
// Memory Semantic Setting
1241-
unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
1275+
auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget);
12421276

12431277
// Vector Setting
12441278
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1305,7 +1339,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
13051339
}
13061340
if (!Opcode)
13071341
return false;
1308-
SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
1342+
SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
1343+
getI32Imm(CodeMemorySem, DL),
13091344
getI32Imm(CodeAddrSpace, DL),
13101345
getI32Imm(VecType, DL),
13111346
getI32Imm(FromType, DL),
@@ -1334,7 +1369,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
13341369
}
13351370
if (!Opcode)
13361371
return false;
1337-
SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
1372+
SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
1373+
getI32Imm(CodeMemorySem, DL),
13381374
getI32Imm(CodeAddrSpace, DL),
13391375
getI32Imm(VecType, DL),
13401376
getI32Imm(FromType, DL),
@@ -1384,7 +1420,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
13841420
}
13851421
if (!Opcode)
13861422
return false;
1387-
SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
1423+
SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
1424+
getI32Imm(CodeMemorySem, DL),
13881425
getI32Imm(CodeAddrSpace, DL),
13891426
getI32Imm(VecType, DL),
13901427
getI32Imm(FromType, DL),
@@ -1434,7 +1471,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
14341471
}
14351472
if (!Opcode)
14361473
return false;
1437-
SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
1474+
SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
1475+
getI32Imm(CodeMemorySem, DL),
14381476
getI32Imm(CodeAddrSpace, DL),
14391477
getI32Imm(VecType, DL),
14401478
getI32Imm(FromType, DL),
@@ -1889,7 +1927,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
18891927
CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
18901928

18911929
// Memory Semantic Setting
1892-
unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget);
1930+
auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(ST, Subtarget);
18931931

18941932
// Vector Setting
18951933
MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1926,6 +1964,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
19261964
if (!Opcode)
19271965
return false;
19281966
SDValue Ops[] = {Value,
1967+
getI32Imm(SeqCstFence, dl),
19291968
getI32Imm(CodeMemorySem, dl),
19301969
getI32Imm(CodeAddrSpace, dl),
19311970
getI32Imm(vecType, dl),
@@ -1943,6 +1982,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
19431982
if (!Opcode)
19441983
return false;
19451984
SDValue Ops[] = {Value,
1985+
getI32Imm(SeqCstFence, dl),
19461986
getI32Imm(CodeMemorySem, dl),
19471987
getI32Imm(CodeAddrSpace, dl),
19481988
getI32Imm(vecType, dl),
@@ -1968,6 +2008,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
19682008
return false;
19692009

19702010
SDValue Ops[] = {Value,
2011+
getI32Imm(SeqCstFence, dl),
19712012
getI32Imm(CodeMemorySem, dl),
19722013
getI32Imm(CodeAddrSpace, dl),
19732014
getI32Imm(vecType, dl),
@@ -1990,6 +2031,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
19902031
if (!Opcode)
19912032
return false;
19922033
SDValue Ops[] = {Value,
2034+
getI32Imm(SeqCstFence, dl),
19932035
getI32Imm(CodeMemorySem, dl),
19942036
getI32Imm(CodeAddrSpace, dl),
19952037
getI32Imm(vecType, dl),
@@ -2030,7 +2072,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
20302072
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
20312073

20322074
// Memory Semantic Setting
2033-
unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
2075+
auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget);
20342076

20352077
// Type Setting: toType + toTypeWidth
20362078
// - for integer type, always use 'u'
@@ -2072,6 +2114,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
20722114
ToTypeWidth = 32;
20732115
}
20742116

2117+
StOps.push_back(getI32Imm(SeqCstFence, DL));
20752118
StOps.push_back(getI32Imm(CodeMemorySem, DL));
20762119
StOps.push_back(getI32Imm(CodeAddrSpace, DL));
20772120
StOps.push_back(getI32Imm(VecType, DL));

0 commit comments

Comments
 (0)