@@ -714,21 +714,24 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
714
714
return NVPTX::PTXLdStInstCode::GENERIC;
715
715
}
716
716
717
- static unsigned int getCodeMemorySemantic (MemSDNode *N,
718
- const NVPTXSubtarget *Subtarget) {
717
/// Pair of instruction codes produced when lowering the atomic ordering of a
/// memory operation.
///
/// Callers unpack this with structured bindings, so the declaration order of
/// the two fields (memory semantic first, fence second) must not change.
struct MemorySemantic {
  /// Memory-semantic code attached to the load/store instruction itself.
  unsigned int sem;
  /// Code of the fence paired with the access; holds the all-ones value
  /// (unsigned -1) when only a memory semantic was supplied.
  unsigned int sc_fence;

  /// Builds a result carrying both a memory semantic and a fence code.
  MemorySemantic(unsigned int s, unsigned int f) : sem{s}, sc_fence{f} {}

  /// Builds a result carrying only a memory semantic; the fence field is left
  /// at the all-ones value. Deliberately non-explicit so that a bare code
  /// converts implicitly.
  MemorySemantic(unsigned int s) : MemorySemantic(s, ~0u) {}
};
723
+
724
+ static MemorySemantic getCodeMemorySemantic (MemSDNode *N,
725
+ const NVPTXSubtarget *Subtarget) {
719
726
AtomicOrdering Ordering = N->getSuccessOrdering ();
720
727
auto CodeAddrSpace = getCodeAddrSpace (N);
721
728
722
729
bool HasMemoryOrdering = Subtarget->hasMemoryOrdering ();
723
730
bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO ();
724
731
725
- // TODO: lowering for SequentiallyConsistent Operations: for now, we error.
726
- // TODO: lowering for AcquireRelease Operations: for now, we error.
727
- //
728
-
729
732
// clang-format off
730
733
731
- // Lowering for non-SequentiallyConsistent Operations
734
+ // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
732
735
//
733
736
// | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
734
737
// |---------|----------|--------------------|------------|------------------------------|
@@ -748,6 +751,18 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
748
751
// | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
749
752
// | | | / Global [0] | | |
750
753
754
+ // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
755
+ // by following the ABI proven sound in:
756
+ // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
757
+ // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
758
+ //
759
+ // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
760
+ // |-----------------------------------------------------------------------------|-----------------------------------------|
761
+ // | cuda::atomic_thread_fence(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; |
762
+ // | cuda::atomic_load(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; ld.acquire.<scope>; |
763
+ // | cuda::atomic_store(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; st.release.<scope>; |
764
+ // | cuda::atomic_fetch_<op>(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; atom.acq_rel.<scope>; |
765
+
751
766
// clang-format on
752
767
753
768
// [0]: volatile and atomics are only supported on global or shared
@@ -787,7 +802,6 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
787
802
// - the "weak" memory instruction we are currently lowering to, and
788
803
// - some other instruction that preserves the side-effect, e.g.,
789
804
// a dead dummy volatile load.
790
-
791
805
if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
792
806
CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
793
807
CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
@@ -865,7 +879,25 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
865
879
N->print (OS);
866
880
report_fatal_error (OS.str ());
867
881
}
868
- case AtomicOrdering::SequentiallyConsistent:
882
+ case AtomicOrdering::SequentiallyConsistent: {
883
+ unsigned int sem;
884
+ if (N->readMem ()) {
885
+ sem = NVPTX::PTXLdStInstCode::Acquire;
886
+ } else if (N->writeMem ()) {
887
+ sem = NVPTX::PTXLdStInstCode::Release;
888
+ } else {
889
+ SmallString<256 > Msg;
890
+ raw_svector_ostream OS (Msg);
891
+ OS << " NVPTX does not support SequentiallyConsistent Ordering on "
892
+ " read-modify-writes yet: "
893
+ << N->getOperationName ();
894
+ N->print (OS);
895
+ report_fatal_error (OS.str ());
896
+ }
897
+ return addrGenericOrGlobalOrShared
898
+ ? MemorySemantic (sem, NVPTX::PTXLdStInstCode::SeqCstFence)
899
+ : MemorySemantic (NVPTX::PTXLdStInstCode::NotAtomic);
900
+ }
869
901
case AtomicOrdering::Unordered:
870
902
// TODO: support AcquireRelease, and SequentiallyConsistent read-modify-writes
871
903
SmallString<256 > Msg;
@@ -1087,7 +1119,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1087
1119
}
1088
1120
1089
1121
// Memory Semantic Setting
1090
- unsigned int CodeMemorySem = getCodeMemorySemantic (LD, Subtarget);
1122
+ auto [ CodeMemorySem, SeqCstFence] = getCodeMemorySemantic (LD, Subtarget);
1091
1123
1092
1124
unsigned int PointerSize =
1093
1125
CurDAG->getDataLayout ().getPointerSizeInBits (LD->getAddressSpace ());
@@ -1132,7 +1164,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1132
1164
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
1133
1165
if (!Opcode)
1134
1166
return false ;
1135
- SDValue Ops[] = {getI32Imm (CodeMemorySem, dl),
1167
+ SDValue Ops[] = {getI32Imm (SeqCstFence, dl),
1168
+ getI32Imm (CodeMemorySem, dl),
1136
1169
getI32Imm (CodeAddrSpace, dl),
1137
1170
getI32Imm (vecType, dl),
1138
1171
getI32Imm (fromType, dl),
@@ -1147,7 +1180,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1147
1180
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
1148
1181
if (!Opcode)
1149
1182
return false ;
1150
- SDValue Ops[] = {getI32Imm (CodeMemorySem, dl),
1183
+ SDValue Ops[] = {getI32Imm (SeqCstFence, dl),
1184
+ getI32Imm (CodeMemorySem, dl),
1151
1185
getI32Imm (CodeAddrSpace, dl),
1152
1186
getI32Imm (vecType, dl),
1153
1187
getI32Imm (fromType, dl),
@@ -1169,7 +1203,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1169
1203
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
1170
1204
if (!Opcode)
1171
1205
return false ;
1172
- SDValue Ops[] = {getI32Imm (CodeMemorySem, dl),
1206
+ SDValue Ops[] = {getI32Imm (SeqCstFence, dl),
1207
+ getI32Imm (CodeMemorySem, dl),
1173
1208
getI32Imm (CodeAddrSpace, dl),
1174
1209
getI32Imm (vecType, dl),
1175
1210
getI32Imm (fromType, dl),
@@ -1190,7 +1225,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1190
1225
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
1191
1226
if (!Opcode)
1192
1227
return false ;
1193
- SDValue Ops[] = {getI32Imm (CodeMemorySem, dl),
1228
+ SDValue Ops[] = {getI32Imm (SeqCstFence, dl),
1229
+ getI32Imm (CodeMemorySem, dl),
1194
1230
getI32Imm (CodeAddrSpace, dl),
1195
1231
getI32Imm (vecType, dl),
1196
1232
getI32Imm (fromType, dl),
@@ -1234,7 +1270,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1234
1270
CurDAG->getDataLayout ().getPointerSizeInBits (MemSD->getAddressSpace ());
1235
1271
1236
1272
// Memory Semantic Setting
1237
- unsigned int CodeMemorySem = getCodeMemorySemantic (MemSD, Subtarget);
1273
+ auto [ CodeMemorySem, SeqCstFence] = getCodeMemorySemantic (MemSD, Subtarget);
1238
1274
1239
1275
// Vector Setting
1240
1276
MVT SimpleVT = LoadedVT.getSimpleVT ();
@@ -1301,7 +1337,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1301
1337
}
1302
1338
if (!Opcode)
1303
1339
return false ;
1304
- SDValue Ops[] = {getI32Imm (CodeMemorySem, DL),
1340
+ SDValue Ops[] = {getI32Imm (SeqCstFence, DL),
1341
+ getI32Imm (CodeMemorySem, DL),
1305
1342
getI32Imm (CodeAddrSpace, DL),
1306
1343
getI32Imm (VecType, DL),
1307
1344
getI32Imm (FromType, DL),
@@ -1330,7 +1367,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1330
1367
}
1331
1368
if (!Opcode)
1332
1369
return false ;
1333
- SDValue Ops[] = {getI32Imm (CodeMemorySem, DL),
1370
+ SDValue Ops[] = {getI32Imm (SeqCstFence, DL),
1371
+ getI32Imm (CodeMemorySem, DL),
1334
1372
getI32Imm (CodeAddrSpace, DL),
1335
1373
getI32Imm (VecType, DL),
1336
1374
getI32Imm (FromType, DL),
@@ -1380,7 +1418,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1380
1418
}
1381
1419
if (!Opcode)
1382
1420
return false ;
1383
- SDValue Ops[] = {getI32Imm (CodeMemorySem, DL),
1421
+ SDValue Ops[] = {getI32Imm (SeqCstFence, DL),
1422
+ getI32Imm (CodeMemorySem, DL),
1384
1423
getI32Imm (CodeAddrSpace, DL),
1385
1424
getI32Imm (VecType, DL),
1386
1425
getI32Imm (FromType, DL),
@@ -1430,7 +1469,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1430
1469
}
1431
1470
if (!Opcode)
1432
1471
return false ;
1433
- SDValue Ops[] = {getI32Imm (CodeMemorySem, DL),
1472
+ SDValue Ops[] = {getI32Imm (SeqCstFence, DL),
1473
+ getI32Imm (CodeMemorySem, DL),
1434
1474
getI32Imm (CodeAddrSpace, DL),
1435
1475
getI32Imm (VecType, DL),
1436
1476
getI32Imm (FromType, DL),
@@ -1885,7 +1925,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1885
1925
CurDAG->getDataLayout ().getPointerSizeInBits (ST->getAddressSpace ());
1886
1926
1887
1927
// Memory Semantic Setting
1888
- unsigned int CodeMemorySem = getCodeMemorySemantic (ST, Subtarget);
1928
+ auto [ CodeMemorySem, SeqCstFence] = getCodeMemorySemantic (ST, Subtarget);
1889
1929
1890
1930
// Vector Setting
1891
1931
MVT SimpleVT = StoreVT.getSimpleVT ();
@@ -1922,6 +1962,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1922
1962
if (!Opcode)
1923
1963
return false ;
1924
1964
SDValue Ops[] = {Value,
1965
+ getI32Imm (SeqCstFence, dl),
1925
1966
getI32Imm (CodeMemorySem, dl),
1926
1967
getI32Imm (CodeAddrSpace, dl),
1927
1968
getI32Imm (vecType, dl),
@@ -1939,6 +1980,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1939
1980
if (!Opcode)
1940
1981
return false ;
1941
1982
SDValue Ops[] = {Value,
1983
+ getI32Imm (SeqCstFence, dl),
1942
1984
getI32Imm (CodeMemorySem, dl),
1943
1985
getI32Imm (CodeAddrSpace, dl),
1944
1986
getI32Imm (vecType, dl),
@@ -1964,6 +2006,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1964
2006
return false ;
1965
2007
1966
2008
SDValue Ops[] = {Value,
2009
+ getI32Imm (SeqCstFence, dl),
1967
2010
getI32Imm (CodeMemorySem, dl),
1968
2011
getI32Imm (CodeAddrSpace, dl),
1969
2012
getI32Imm (vecType, dl),
@@ -1986,6 +2029,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1986
2029
if (!Opcode)
1987
2030
return false ;
1988
2031
SDValue Ops[] = {Value,
2032
+ getI32Imm (SeqCstFence, dl),
1989
2033
getI32Imm (CodeMemorySem, dl),
1990
2034
getI32Imm (CodeAddrSpace, dl),
1991
2035
getI32Imm (vecType, dl),
@@ -2026,7 +2070,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
2026
2070
CurDAG->getDataLayout ().getPointerSizeInBits (MemSD->getAddressSpace ());
2027
2071
2028
2072
// Memory Semantic Setting
2029
- unsigned int CodeMemorySem = getCodeMemorySemantic (MemSD, Subtarget);
2073
+ auto [ CodeMemorySem, SeqCstFence] = getCodeMemorySemantic (MemSD, Subtarget);
2030
2074
2031
2075
// Type Setting: toType + toTypeWidth
2032
2076
// - for integer type, always use 'u'
@@ -2068,6 +2112,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
2068
2112
ToTypeWidth = 32 ;
2069
2113
}
2070
2114
2115
+ StOps.push_back (getI32Imm (SeqCstFence, DL));
2071
2116
StOps.push_back (getI32Imm (CodeMemorySem, DL));
2072
2117
StOps.push_back (getI32Imm (CodeAddrSpace, DL));
2073
2118
StOps.push_back (getI32Imm (VecType, DL));
0 commit comments