Skip to content

Commit 93b5524

Browse files
committed
[NVPTX] Volta SequentiallyConsistent Load/Store Ops
1 parent 021fd6d commit 93b5524

File tree

5 files changed

+205
-94
lines changed

5 files changed

+205
-94
lines changed

llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,16 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
254254
report_fatal_error(OS.str());
255255
break;
256256
}
257+
} else if (!strcmp(Modifier, "sc")) {
258+
switch (Imm) {
259+
// TODO: refactor fence insertion into ISelDAGToDAG instead of here
260+
// as part of implementing atomicrmw seq_cst.
261+
case NVPTX::PTXLdStInstCode::SeqCstFence:
262+
O << "fence.sc.sys;\n\t";
263+
break;
264+
default:
265+
break;
266+
}
257267
} else if (!strcmp(Modifier, "addsp")) {
258268
switch (Imm) {
259269
case NVPTX::PTXLdStInstCode::GLOBAL:

llvm/lib/Target/NVPTX/NVPTX.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,8 @@ enum MemorySemantic {
113113
Relaxed = 2,
114114
Acquire = 3,
115115
Release = 4,
116-
RelaxedMMIO = 5
116+
RelaxedMMIO = 5,
117+
SeqCstFence = 6,
117118
};
118119
enum AddressSpace {
119120
GENERIC = 0,

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 66 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -714,21 +714,24 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
714714
return NVPTX::PTXLdStInstCode::GENERIC;
715715
}
716716

717-
static unsigned int getCodeMemorySemantic(MemSDNode *N,
718-
const NVPTXSubtarget *Subtarget) {
717+
struct MemorySemantic {
718+
unsigned int sem = -1;
719+
unsigned int sc_fence = -1;
720+
MemorySemantic(unsigned int s) : sem(s) {}
721+
MemorySemantic(unsigned int s, unsigned int f) : sem(s), sc_fence(f) {}
722+
};
723+
724+
static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
725+
const NVPTXSubtarget *Subtarget) {
719726
AtomicOrdering Ordering = N->getSuccessOrdering();
720727
auto CodeAddrSpace = getCodeAddrSpace(N);
721728

722729
bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
723730
bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
724731

725-
// TODO: lowering for SequentiallyConsistent Operations: for now, we error.
726-
// TODO: lowering for AcquireRelease Operations: for now, we error.
727-
//
728-
729732
// clang-format off
730733

731-
// Lowering for non-SequentiallyConsistent Operations
734+
// Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
732735
//
733736
// | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
734737
// |---------|----------|--------------------|------------|------------------------------|
@@ -748,6 +751,18 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
748751
// | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
749752
// | | | / Global [0] | | |
750753

754+
// Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
755+
// by following the ABI proven sound in:
756+
// Lustig et al., A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
757+
// https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
758+
//
759+
// | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
760+
// |-----------------------------------------------------------------------------|-----------------------------------------|
761+
// | cuda::atomic_thread_fence(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; |
762+
// | cuda::atomic_load(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; ld.acquire.<scope>; |
763+
// | cuda::atomic_store(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; st.release.<scope>; |
764+
// | cuda::atomic_fetch_<op>(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; atom.acq_rel.<scope>; |
765+
751766
// clang-format on
752767

753768
// [0]: volatile and atomics are only supported on global or shared
@@ -787,7 +802,6 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
787802
// - the "weak" memory instruction we are currently lowering to, and
788803
// - some other instruction that preserves the side-effect, e.g.,
789804
// a dead dummy volatile load.
790-
791805
if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
792806
CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
793807
CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
@@ -865,7 +879,25 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
865879
N->print(OS);
866880
report_fatal_error(OS.str());
867881
}
868-
case AtomicOrdering::SequentiallyConsistent:
882+
case AtomicOrdering::SequentiallyConsistent: {
883+
unsigned int sem;
884+
if (N->readMem()) {
885+
sem = NVPTX::PTXLdStInstCode::Acquire;
886+
} else if (N->writeMem()) {
887+
sem = NVPTX::PTXLdStInstCode::Release;
888+
} else {
889+
SmallString<256> Msg;
890+
raw_svector_ostream OS(Msg);
891+
OS << "NVPTX does not support SequentiallyConsistent Ordering on "
892+
"read-modify-writes yet: "
893+
<< N->getOperationName();
894+
N->print(OS);
895+
report_fatal_error(OS.str());
896+
}
897+
return addrGenericOrGlobalOrShared
898+
? MemorySemantic(sem, NVPTX::PTXLdStInstCode::SeqCstFence)
899+
: MemorySemantic(NVPTX::PTXLdStInstCode::NotAtomic);
900+
}
869901
case AtomicOrdering::Unordered:
870902
// TODO: support AcquireRelease and SequentiallyConsistent
871903
SmallString<256> Msg;
@@ -1087,7 +1119,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
10871119
}
10881120

10891121
// Memory Semantic Setting
1090-
unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget);
1122+
auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(LD, Subtarget);
10911123

10921124
unsigned int PointerSize =
10931125
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
@@ -1132,7 +1164,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
11321164
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
11331165
if (!Opcode)
11341166
return false;
1135-
SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
1167+
SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
1168+
getI32Imm(CodeMemorySem, dl),
11361169
getI32Imm(CodeAddrSpace, dl),
11371170
getI32Imm(vecType, dl),
11381171
getI32Imm(fromType, dl),
@@ -1147,7 +1180,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
11471180
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
11481181
if (!Opcode)
11491182
return false;
1150-
SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
1183+
SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
1184+
getI32Imm(CodeMemorySem, dl),
11511185
getI32Imm(CodeAddrSpace, dl),
11521186
getI32Imm(vecType, dl),
11531187
getI32Imm(fromType, dl),
@@ -1169,7 +1203,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
11691203
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
11701204
if (!Opcode)
11711205
return false;
1172-
SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
1206+
SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
1207+
getI32Imm(CodeMemorySem, dl),
11731208
getI32Imm(CodeAddrSpace, dl),
11741209
getI32Imm(vecType, dl),
11751210
getI32Imm(fromType, dl),
@@ -1190,7 +1225,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
11901225
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
11911226
if (!Opcode)
11921227
return false;
1193-
SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
1228+
SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
1229+
getI32Imm(CodeMemorySem, dl),
11941230
getI32Imm(CodeAddrSpace, dl),
11951231
getI32Imm(vecType, dl),
11961232
getI32Imm(fromType, dl),
@@ -1234,7 +1270,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
12341270
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
12351271

12361272
// Memory Semantic Setting
1237-
unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
1273+
auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget);
12381274

12391275
// Vector Setting
12401276
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1301,7 +1337,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
13011337
}
13021338
if (!Opcode)
13031339
return false;
1304-
SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
1340+
SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
1341+
getI32Imm(CodeMemorySem, DL),
13051342
getI32Imm(CodeAddrSpace, DL),
13061343
getI32Imm(VecType, DL),
13071344
getI32Imm(FromType, DL),
@@ -1330,7 +1367,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
13301367
}
13311368
if (!Opcode)
13321369
return false;
1333-
SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
1370+
SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
1371+
getI32Imm(CodeMemorySem, DL),
13341372
getI32Imm(CodeAddrSpace, DL),
13351373
getI32Imm(VecType, DL),
13361374
getI32Imm(FromType, DL),
@@ -1380,7 +1418,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
13801418
}
13811419
if (!Opcode)
13821420
return false;
1383-
SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
1421+
SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
1422+
getI32Imm(CodeMemorySem, DL),
13841423
getI32Imm(CodeAddrSpace, DL),
13851424
getI32Imm(VecType, DL),
13861425
getI32Imm(FromType, DL),
@@ -1430,7 +1469,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
14301469
}
14311470
if (!Opcode)
14321471
return false;
1433-
SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
1472+
SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
1473+
getI32Imm(CodeMemorySem, DL),
14341474
getI32Imm(CodeAddrSpace, DL),
14351475
getI32Imm(VecType, DL),
14361476
getI32Imm(FromType, DL),
@@ -1885,7 +1925,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
18851925
CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
18861926

18871927
// Memory Semantic Setting
1888-
unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget);
1928+
auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(ST, Subtarget);
18891929

18901930
// Vector Setting
18911931
MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1922,6 +1962,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
19221962
if (!Opcode)
19231963
return false;
19241964
SDValue Ops[] = {Value,
1965+
getI32Imm(SeqCstFence, dl),
19251966
getI32Imm(CodeMemorySem, dl),
19261967
getI32Imm(CodeAddrSpace, dl),
19271968
getI32Imm(vecType, dl),
@@ -1939,6 +1980,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
19391980
if (!Opcode)
19401981
return false;
19411982
SDValue Ops[] = {Value,
1983+
getI32Imm(SeqCstFence, dl),
19421984
getI32Imm(CodeMemorySem, dl),
19431985
getI32Imm(CodeAddrSpace, dl),
19441986
getI32Imm(vecType, dl),
@@ -1964,6 +2006,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
19642006
return false;
19652007

19662008
SDValue Ops[] = {Value,
2009+
getI32Imm(SeqCstFence, dl),
19672010
getI32Imm(CodeMemorySem, dl),
19682011
getI32Imm(CodeAddrSpace, dl),
19692012
getI32Imm(vecType, dl),
@@ -1986,6 +2029,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
19862029
if (!Opcode)
19872030
return false;
19882031
SDValue Ops[] = {Value,
2032+
getI32Imm(SeqCstFence, dl),
19892033
getI32Imm(CodeMemorySem, dl),
19902034
getI32Imm(CodeAddrSpace, dl),
19912035
getI32Imm(vecType, dl),
@@ -2026,7 +2070,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
20262070
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
20272071

20282072
// Memory Semantic Setting
2029-
unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
2073+
auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget);
20302074

20312075
// Type Setting: toType + toTypeWidth
20322076
// - for integer type, always use 'u'
@@ -2068,6 +2112,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
20682112
ToTypeWidth = 32;
20692113
}
20702114

2115+
StOps.push_back(getI32Imm(SeqCstFence, DL));
20712116
StOps.push_back(getI32Imm(CodeMemorySem, DL));
20722117
StOps.push_back(getI32Imm(CodeAddrSpace, DL));
20732118
StOps.push_back(getI32Imm(VecType, DL));

0 commit comments

Comments
 (0)