@@ -714,21 +714,24 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
714
714
return NVPTX::PTXLdStInstCode::GENERIC;
715
715
}
716
716
717
- static unsigned int getCodeMemorySemantic (MemSDNode *N,
718
- const NVPTXSubtarget *Subtarget) {
717
+ struct MemorySemantic { // Value returned by getCodeMemorySemantic: a memory-semantic code plus a fence code; callers unpack it with structured bindings.
718
+ unsigned int sem = -1 ; // Memory-semantic code (e.g. NVPTX::PTXLdStInstCode::Acquire / Release / NotAtomic). Both constructors overwrite this default.
719
+ unsigned int sc_fence = -1 ; // Fence code; -1 converts to UINT_MAX and is kept unless the two-argument constructor sets it (e.g. to NVPTX::PTXLdStInstCode::SeqCstFence).
720
+ MemorySemantic (unsigned int s) : sem(s) {} // Not explicit, so a bare unsigned code converts implicitly; sc_fence keeps its default.
721
+ MemorySemantic (unsigned int s, unsigned int f) : sem(s), sc_fence(f) {} // Sets both the memory-semantic code and the fence code.
722
+ };
723
+
724
+ static MemorySemantic getCodeMemorySemantic (MemSDNode *N,
725
+ const NVPTXSubtarget *Subtarget) {
719
726
AtomicOrdering Ordering = N->getSuccessOrdering ();
720
727
auto CodeAddrSpace = getCodeAddrSpace (N);
721
728
722
729
bool HasMemoryOrdering = Subtarget->hasMemoryOrdering ();
723
730
bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO ();
724
731
725
- // TODO: lowering for SequentiallyConsistent Operations: for now, we error.
726
- // TODO: lowering for AcquireRelease Operations: for now, we error.
727
- //
728
-
729
732
// clang-format off
730
733
731
- // Lowering for non-SequentiallyConsistent Operations
734
+ // Lowering for Load/Store Operations (note: AcquireRelease loads and stores are rejected with an error).
732
735
//
733
736
// | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
734
737
// |---------|----------|--------------------|------------|------------------------------|
@@ -749,6 +752,18 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
749
752
// | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
750
753
// | | | / Global [0] | | |
751
754
755
+ // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
756
+ // by following the ABI proven sound in:
757
+ // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
758
+ // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
759
+ //
760
+ // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
761
+ // |-----------------------------------------------------------------------------|-----------------------------------------|
762
+ // | cuda::atomic_thread_fence(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; |
763
+ // | cuda::atomic_load(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; ld.acquire.<scope>; |
764
+ // | cuda::atomic_store(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; st.release.<scope>; |
765
+ // | cuda::atomic_fetch_<op>(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; atom.acq_rel.<scope>; |
766
+
752
767
// clang-format on
753
768
754
769
// [0]: volatile and atomics are only supported on global or shared
@@ -788,7 +803,6 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
788
803
// - the "weak" memory instruction we are currently lowering to, and
789
804
// - some other instruction that preserves the side-effect, e.g.,
790
805
// a dead dummy volatile load.
791
-
792
806
if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
793
807
CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
794
808
CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
@@ -870,16 +884,32 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
870
884
N->print (OS);
871
885
report_fatal_error (OS.str ());
872
886
}
873
- case AtomicOrdering::SequentiallyConsistent:
874
- // TODO: support AcquireRelease and SequentiallyConsistent
875
- SmallString<256 > Msg;
876
- raw_svector_ostream OS (Msg);
877
- OS << " NVPTX backend does not support AtomicOrdering \" "
878
- << toIRString (Ordering) << " \" yet." ;
879
- report_fatal_error (OS.str ());
887
+ case AtomicOrdering::SequentiallyConsistent: {
888
+ unsigned int sem;
889
+ if (N->readMem ()) {
890
+ sem = NVPTX::PTXLdStInstCode::Acquire;
891
+ } else if (N->writeMem ()) {
892
+ sem = NVPTX::PTXLdStInstCode::Release;
893
+ } else {
894
+ SmallString<256 > Msg;
895
+ raw_svector_ostream OS (Msg);
896
+ OS << " NVPTX does not support SequentiallyConsistent Ordering on "
897
+ " read-modify-writes yet: "
898
+ << N->getOperationName ();
899
+ N->print (OS);
900
+ report_fatal_error (OS.str ());
901
+ }
902
+ return addrGenericOrGlobalOrShared
903
+ ? MemorySemantic (sem, NVPTX::PTXLdStInstCode::SeqCstFence)
904
+ : MemorySemantic (NVPTX::PTXLdStInstCode::NotAtomic);
905
+ }
880
906
}
881
907
882
- llvm_unreachable (" unexpected unhandled case" );
908
+ SmallString<256 > Msg;
909
+ raw_svector_ostream OS (Msg);
910
+ OS << " NVPTX backend does not support AtomicOrdering \" "
911
+ << toIRString (Ordering) << " \" yet." ;
912
+ report_fatal_error (OS.str ());
883
913
}
884
914
885
915
static bool canLowerToLDG (MemSDNode *N, const NVPTXSubtarget &Subtarget,
@@ -1091,7 +1121,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1091
1121
}
1092
1122
1093
1123
// Memory Semantic Setting
1094
- unsigned int CodeMemorySem = getCodeMemorySemantic (LD, Subtarget);
1124
+ auto [ CodeMemorySem, SeqCstFence] = getCodeMemorySemantic (LD, Subtarget);
1095
1125
1096
1126
unsigned int PointerSize =
1097
1127
CurDAG->getDataLayout ().getPointerSizeInBits (LD->getAddressSpace ());
@@ -1136,7 +1166,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1136
1166
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
1137
1167
if (!Opcode)
1138
1168
return false ;
1139
- SDValue Ops[] = {getI32Imm (CodeMemorySem, dl),
1169
+ SDValue Ops[] = {getI32Imm (SeqCstFence, dl),
1170
+ getI32Imm (CodeMemorySem, dl),
1140
1171
getI32Imm (CodeAddrSpace, dl),
1141
1172
getI32Imm (vecType, dl),
1142
1173
getI32Imm (fromType, dl),
@@ -1151,7 +1182,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1151
1182
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
1152
1183
if (!Opcode)
1153
1184
return false ;
1154
- SDValue Ops[] = {getI32Imm (CodeMemorySem, dl),
1185
+ SDValue Ops[] = {getI32Imm (SeqCstFence, dl),
1186
+ getI32Imm (CodeMemorySem, dl),
1155
1187
getI32Imm (CodeAddrSpace, dl),
1156
1188
getI32Imm (vecType, dl),
1157
1189
getI32Imm (fromType, dl),
@@ -1173,7 +1205,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1173
1205
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
1174
1206
if (!Opcode)
1175
1207
return false ;
1176
- SDValue Ops[] = {getI32Imm (CodeMemorySem, dl),
1208
+ SDValue Ops[] = {getI32Imm (SeqCstFence, dl),
1209
+ getI32Imm (CodeMemorySem, dl),
1177
1210
getI32Imm (CodeAddrSpace, dl),
1178
1211
getI32Imm (vecType, dl),
1179
1212
getI32Imm (fromType, dl),
@@ -1194,7 +1227,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1194
1227
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
1195
1228
if (!Opcode)
1196
1229
return false ;
1197
- SDValue Ops[] = {getI32Imm (CodeMemorySem, dl),
1230
+ SDValue Ops[] = {getI32Imm (SeqCstFence, dl),
1231
+ getI32Imm (CodeMemorySem, dl),
1198
1232
getI32Imm (CodeAddrSpace, dl),
1199
1233
getI32Imm (vecType, dl),
1200
1234
getI32Imm (fromType, dl),
@@ -1238,7 +1272,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1238
1272
CurDAG->getDataLayout ().getPointerSizeInBits (MemSD->getAddressSpace ());
1239
1273
1240
1274
// Memory Semantic Setting
1241
- unsigned int CodeMemorySem = getCodeMemorySemantic (MemSD, Subtarget);
1275
+ auto [ CodeMemorySem, SeqCstFence] = getCodeMemorySemantic (MemSD, Subtarget);
1242
1276
1243
1277
// Vector Setting
1244
1278
MVT SimpleVT = LoadedVT.getSimpleVT ();
@@ -1305,7 +1339,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1305
1339
}
1306
1340
if (!Opcode)
1307
1341
return false ;
1308
- SDValue Ops[] = {getI32Imm (CodeMemorySem, DL),
1342
+ SDValue Ops[] = {getI32Imm (SeqCstFence, DL),
1343
+ getI32Imm (CodeMemorySem, DL),
1309
1344
getI32Imm (CodeAddrSpace, DL),
1310
1345
getI32Imm (VecType, DL),
1311
1346
getI32Imm (FromType, DL),
@@ -1334,7 +1369,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1334
1369
}
1335
1370
if (!Opcode)
1336
1371
return false ;
1337
- SDValue Ops[] = {getI32Imm (CodeMemorySem, DL),
1372
+ SDValue Ops[] = {getI32Imm (SeqCstFence, DL),
1373
+ getI32Imm (CodeMemorySem, DL),
1338
1374
getI32Imm (CodeAddrSpace, DL),
1339
1375
getI32Imm (VecType, DL),
1340
1376
getI32Imm (FromType, DL),
@@ -1384,7 +1420,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1384
1420
}
1385
1421
if (!Opcode)
1386
1422
return false ;
1387
- SDValue Ops[] = {getI32Imm (CodeMemorySem, DL),
1423
+ SDValue Ops[] = {getI32Imm (SeqCstFence, DL),
1424
+ getI32Imm (CodeMemorySem, DL),
1388
1425
getI32Imm (CodeAddrSpace, DL),
1389
1426
getI32Imm (VecType, DL),
1390
1427
getI32Imm (FromType, DL),
@@ -1434,7 +1471,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1434
1471
}
1435
1472
if (!Opcode)
1436
1473
return false ;
1437
- SDValue Ops[] = {getI32Imm (CodeMemorySem, DL),
1474
+ SDValue Ops[] = {getI32Imm (SeqCstFence, DL),
1475
+ getI32Imm (CodeMemorySem, DL),
1438
1476
getI32Imm (CodeAddrSpace, DL),
1439
1477
getI32Imm (VecType, DL),
1440
1478
getI32Imm (FromType, DL),
@@ -1889,7 +1927,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1889
1927
CurDAG->getDataLayout ().getPointerSizeInBits (ST->getAddressSpace ());
1890
1928
1891
1929
// Memory Semantic Setting
1892
- unsigned int CodeMemorySem = getCodeMemorySemantic (ST, Subtarget);
1930
+ auto [ CodeMemorySem, SeqCstFence] = getCodeMemorySemantic (ST, Subtarget);
1893
1931
1894
1932
// Vector Setting
1895
1933
MVT SimpleVT = StoreVT.getSimpleVT ();
@@ -1926,6 +1964,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1926
1964
if (!Opcode)
1927
1965
return false ;
1928
1966
SDValue Ops[] = {Value,
1967
+ getI32Imm (SeqCstFence, dl),
1929
1968
getI32Imm (CodeMemorySem, dl),
1930
1969
getI32Imm (CodeAddrSpace, dl),
1931
1970
getI32Imm (vecType, dl),
@@ -1943,6 +1982,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1943
1982
if (!Opcode)
1944
1983
return false ;
1945
1984
SDValue Ops[] = {Value,
1985
+ getI32Imm (SeqCstFence, dl),
1946
1986
getI32Imm (CodeMemorySem, dl),
1947
1987
getI32Imm (CodeAddrSpace, dl),
1948
1988
getI32Imm (vecType, dl),
@@ -1968,6 +2008,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1968
2008
return false ;
1969
2009
1970
2010
SDValue Ops[] = {Value,
2011
+ getI32Imm (SeqCstFence, dl),
1971
2012
getI32Imm (CodeMemorySem, dl),
1972
2013
getI32Imm (CodeAddrSpace, dl),
1973
2014
getI32Imm (vecType, dl),
@@ -1990,6 +2031,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1990
2031
if (!Opcode)
1991
2032
return false ;
1992
2033
SDValue Ops[] = {Value,
2034
+ getI32Imm (SeqCstFence, dl),
1993
2035
getI32Imm (CodeMemorySem, dl),
1994
2036
getI32Imm (CodeAddrSpace, dl),
1995
2037
getI32Imm (vecType, dl),
@@ -2030,7 +2072,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
2030
2072
CurDAG->getDataLayout ().getPointerSizeInBits (MemSD->getAddressSpace ());
2031
2073
2032
2074
// Memory Semantic Setting
2033
- unsigned int CodeMemorySem = getCodeMemorySemantic (MemSD, Subtarget);
2075
+ auto [ CodeMemorySem, SeqCstFence] = getCodeMemorySemantic (MemSD, Subtarget);
2034
2076
2035
2077
// Type Setting: toType + toTypeWidth
2036
2078
// - for integer type, always use 'u'
@@ -2072,6 +2114,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
2072
2114
ToTypeWidth = 32 ;
2073
2115
}
2074
2116
2117
+ StOps.push_back (getI32Imm (SeqCstFence, DL));
2075
2118
StOps.push_back (getI32Imm (CodeMemorySem, DL));
2076
2119
StOps.push_back (getI32Imm (CodeAddrSpace, DL));
2077
2120
StOps.push_back (getI32Imm (VecType, DL));
0 commit comments