@@ -59,6 +59,7 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
 
 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
+  Scopes = NVPTXScopes(MF.getFunction().getContext());
   return SelectionDAGISel::runOnMachineFunction(MF);
 }
 
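Note: sync-scope IDs are interned per LLVMContext, so the Scopes table is rebuilt from the current function's context on every runOnMachineFunction rather than kept as a static table. A minimal sketch of that dependency, using the NVPTXScopes added at the bottom of this diff (the driver-side names here are illustrative assumptions):

    // An ID from getOrInsertSyncScopeID is only meaningful within this context:
    LLVMContext &C = MF.getFunction().getContext();
    SyncScope::ID DeviceID = C.getOrInsertSyncScopeID("device");
    NVPTXScopes Scopes(C);
    NVPTX::Scope S = Scopes[DeviceID]; // NVPTX::Scope::Device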
@@ -106,6 +107,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
     if (tryStore(N))
       return;
     break;
+  case ISD::ATOMIC_FENCE:
+    if (tryFence(N))
+      return;
+    break;
   case ISD::EXTRACT_VECTOR_ELT:
     if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
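Note: an ISD::ATOMIC_FENCE node carries its parameters as constant operands — operand 0 is the chain, operand 1 the atomic ordering, operand 2 the SyncScope::ID. That is exactly what tryFence (added at the end of this diff) decodes; a sketch:

    // Decoding an ATOMIC_FENCE node N (mirrors tryFence below):
    SDValue Chain = N->getOperand(0);                        // chain
    auto Ord = NVPTX::Ordering(N->getConstantOperandVal(1)); // ordering
    NVPTX::Scope Sc = Scopes[N->getConstantOperandVal(2)];   // sync scope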
@@ -915,6 +920,42 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
 
 } // namespace
 
+NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
+                                                  NVPTX::Ordering Ord) const {
+  switch (Ord) {
+  case NVPTX::Ordering::NotAtomic:
+  case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
+    // NVPTX uses Thread scope as the scope of non-atomic operations.
+    return NVPTX::Scope::Thread;
+  case NVPTX::Ordering::RelaxedMMIO:
+    // RelaxedMMIO operations are always system scope.
+    // If a RelaxedMMIO order was generated from an atomic volatile operation
+    // with a smaller thread scope, we bump it here to system scope.
+    return NVPTX::Scope::System;
+  case NVPTX::Ordering::Relaxed:
+  case NVPTX::Ordering::Acquire:
+  case NVPTX::Ordering::Release:
+  case NVPTX::Ordering::AcquireRelease:
+  case NVPTX::Ordering::SequentiallyConsistent:
+    auto S = Scopes[N->getSyncScopeID()];
+
+    // Atomic operations must have a scope greater than thread.
+    if (S == NVPTX::Scope::Thread)
+      report_fatal_error(
+          formatv("Atomics need scope > \"{}\".", ScopeToString(S)));
+
+    // If scope is cluster, clusters must be supported.
+    if (S == NVPTX::Scope::Cluster)
+      Subtarget->requireClusters("cluster scope");
+
+    // If operation is volatile, then its scope is system.
+    if (N->isVolatile())
+      S = NVPTX::Scope::System;
+
+    return S;
+  }
+}
+
 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                           unsigned CodeAddrSpace, MachineFunction *F) {
   // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
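Note: getOperationScope is essentially a small promotion table over the (ordering, syncscope) pair. A standalone restatement with local types, RelaxedMMIO and the error paths omitted (illustrative only; the real enums live in the NVPTX backend headers):

    enum class Scope { Thread, Block, Cluster, Device, System };

    // Non-atomic operations run at thread scope, volatile atomics are
    // promoted to system scope, everything else keeps the requested scope.
    static Scope operationScope(Scope Requested, bool IsAtomic, bool IsVolatile) {
      if (!IsAtomic)
        return Scope::Thread;
      if (IsVolatile)
        return Scope::System;
      return Requested;
    }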
@@ -957,33 +998,86 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
   });
 }
 
-NVPTX::Ordering NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL,
-                                                                SDValue &Chain,
-                                                                MemSDNode *N) {
+static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
+                               NVPTXSubtarget const *T) {
+  if (S == NVPTX::Scope::Cluster)
+    T->requireClusters(".cluster scope fence");
+
+  switch (O) {
+  case NVPTX::Ordering::Acquire:
+  case NVPTX::Ordering::Release:
+  case NVPTX::Ordering::AcquireRelease: {
+    switch (S) {
+    case NVPTX::Scope::System:
+      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
+                                    : NVPTX::INT_MEMBAR_SYS;
+    case NVPTX::Scope::Block:
+      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
+                                    : NVPTX::INT_MEMBAR_CTA;
+    case NVPTX::Scope::Cluster:
+      return NVPTX::atomic_thread_fence_acq_rel_cluster;
+    case NVPTX::Scope::Device:
+      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
+                                    : NVPTX::INT_MEMBAR_GL;
+    case NVPTX::Scope::Thread:
+      report_fatal_error(
+          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
+                  ScopeToString(S)));
+    }
+  }
+  case NVPTX::Ordering::SequentiallyConsistent: {
+    switch (S) {
+    case NVPTX::Scope::System:
+      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
+                                    : NVPTX::INT_MEMBAR_SYS;
+    case NVPTX::Scope::Block:
+      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
+                                    : NVPTX::INT_MEMBAR_CTA;
+    case NVPTX::Scope::Cluster:
+      return NVPTX::atomic_thread_fence_seq_cst_cluster;
+    case NVPTX::Scope::Device:
+      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
+                                    : NVPTX::INT_MEMBAR_GL;
+    case NVPTX::Scope::Thread:
+      report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.",
+                                 ScopeToString(S)));
+    }
+  }
+  case NVPTX::Ordering::NotAtomic:
+  case NVPTX::Ordering::Relaxed:
+  case NVPTX::Ordering::Volatile:
+  case NVPTX::Ordering::RelaxedMMIO:
+    report_fatal_error(
+        formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
+                OrderingToString(O), ScopeToString(S)));
+  }
+}
+
+std::pair<NVPTX::Ordering, NVPTX::Scope>
+NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
+                                                MemSDNode *N) {
   // Some memory instructions - loads, stores, atomics - need an extra fence
   // instruction. Get the memory order of the instruction, and that of its
   // fence, if any.
   auto [InstructionOrdering, FenceOrdering] =
       getOperationOrderings(N, Subtarget);
+  auto Scope = getOperationScope(N, InstructionOrdering);
 
   // If a fence is required before the operation, insert it:
   switch (NVPTX::Ordering(FenceOrdering)) {
   case NVPTX::Ordering::NotAtomic:
     break;
   case NVPTX::Ordering::SequentiallyConsistent: {
-    unsigned Op = Subtarget->hasMemoryOrdering()
-                      ? NVPTX::atomic_thread_fence_seq_cst_sys
-                      : NVPTX::INT_MEMBAR_SYS;
+    auto Op = getFenceOp(FenceOrdering, Scope, Subtarget);
     Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
     break;
   }
   default:
     report_fatal_error(
         formatv("Unexpected fence ordering: \"{}\".",
-                OrderingToCString(NVPTX::Ordering(FenceOrdering))));
+                OrderingToString(NVPTX::Ordering(FenceOrdering))));
   }
-
-  return InstructionOrdering;
+  return std::make_pair(InstructionOrdering, Scope);
 }
 
 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
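Note: getFenceOp is now the single ordering × scope → opcode table, and insertMemoryInstructionFence returns the (ordering, scope) pair so that call sites can encode both immediates. For orientation, the selected opcodes should correspond to PTX roughly as follows (assumed mapping, not verified against the .td patterns):

    // sm_70+ (hasMemoryOrdering): acq_rel/Device -> fence.acq_rel.gpu
    //                             seq_cst/System -> fence.sc.sys
    // pre-sm_70 fallback:         acq_rel/Device -> membar.gl
    //                             seq_cst/System -> membar.sys
    // Cluster scope has no pre-sm_90 fallback, hence the requireClusters() check.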
@@ -1154,7 +1248,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
 
   SDLoc DL(N);
   SDValue Chain = N->getOperand(0);
-  auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, LD);
+  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
 
   // Type Setting: fromType + fromTypeWidth
   //
@@ -1189,7 +1283,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   std::optional<unsigned> Opcode;
   MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
 
-  SmallVector<SDValue, 12> Ops({getI32Imm(InstructionOrdering, DL),
+  SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
                                 getI32Imm(CodeAddrSpace, DL),
                                 getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                                 getI32Imm(FromTypeWidth, DL)});
@@ -1266,7 +1360,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
 
   SDLoc DL(N);
   SDValue Chain = N->getOperand(0);
-  auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD);
+  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
 
   // Vector Setting
   MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1319,7 +1413,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   std::optional<unsigned> Opcode;
   SDNode *LD;
 
-  SmallVector<SDValue, 12> Ops({getI32Imm(InstructionOrdering, DL),
+  SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
                                 getI32Imm(CodeAddrSpace, DL),
                                 getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                                 getI32Imm(FromTypeWidth, DL)});
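Note: every ld/st MachineSDNode now takes the scope immediate directly after the ordering; the store paths below follow the same layout with Value in front. An annotated sketch of the operand list (comments are mine, names as in this file):

    SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL),        // memory order
                                  getI32Imm(Scope, DL),           // new: scope
                                  getI32Imm(CodeAddrSpace, DL),   // address space
                                  getI32Imm(VecType, DL),         // scalar/v2/v4
                                  getI32Imm(FromType, DL),        // value kind
                                  getI32Imm(FromTypeWidth, DL)}); // width in bits
    // Addressing-mode operands and the chain are appended afterwards.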
@@ -1895,7 +1989,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
 
   SDLoc DL(N);
   SDValue Chain = ST->getChain();
-  auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, ST);
+  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
 
   // Vector Setting
   MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1923,10 +2017,10 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   MVT::SimpleValueType SourceVT =
       Value.getNode()->getSimpleValueType(0).SimpleTy;
 
-  SmallVector<SDValue, 12> Ops({Value, getI32Imm(InstructionOrdering, DL),
-                                getI32Imm(CodeAddrSpace, DL),
-                                getI32Imm(VecType, DL), getI32Imm(ToType, DL),
-                                getI32Imm(ToTypeWidth, DL)});
+  SmallVector<SDValue, 12> Ops(
+      {Value, getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
+       getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
+       getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});
 
   if (SelectDirectAddr(BasePtr, Addr)) {
     Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
@@ -2005,7 +2099,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
 
   SDLoc DL(N);
   SDValue Chain = N->getOperand(0);
-  auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD);
+  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
 
   // Type Setting: toType + toTypeWidth
   // - for integer type, always use 'u'
@@ -2044,9 +2138,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
     ToTypeWidth = 32;
   }
 
-  Ops.append({getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL),
-              getI32Imm(VecType, DL), getI32Imm(ToType, DL),
-              getI32Imm(ToTypeWidth, DL)});
+  Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
+              getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
+              getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});
 
   if (SelectDirectAddr(N2, Addr)) {
     switch (N->getOpcode()) {
@@ -4064,3 +4158,43 @@ unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
     }
   }
 }
+
+bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
+  SDLoc DL(N);
+  assert(N->getOpcode() == ISD::ATOMIC_FENCE);
+  unsigned int FenceOp =
+      getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
+                 Scopes[N->getConstantOperandVal(2)], Subtarget);
+  SDValue Chain = N->getOperand(0);
+  SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
+  ReplaceNode(N, FenceNode);
+  return true;
+}
+
+NVPTXScopes::NVPTXScopes(LLVMContext &C) : CTX(&C) {
+  Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread;
+  Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System;
+  Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block;
+  Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster;
+  Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device;
+}
+
+NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
+  if (Scopes.empty())
+    report_fatal_error("NVPTX Scopes must be initialized before calling "
+                       "NVPTXScopes::operator[]");
+
+  auto S = Scopes.find(ID);
+  if (S == Scopes.end()) {
+    SmallVector<StringRef, 8> ScopeNames;
+    assert(CTX != nullptr && "CTX is nullptr");
+    CTX->getSyncScopeNames(ScopeNames);
+    StringRef Unknown{"unknown"};
+    auto Name = ID < ScopeNames.size() ? ScopeNames[ID] : Unknown;
+    report_fatal_error(
+        formatv("Could not find scope ID={} with name \"{}\".", int(ID), Name));
+  }
+  return S->second;
+}
+
+bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
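Note: a minimal usage sketch for NVPTXScopes (hypothetical driver code, not part of the patch):

    LLVMContext Ctx;
    NVPTXScopes Scopes(Ctx);
    // The empty scope string is LLVM's default, i.e. system-wide:
    NVPTX::Scope S = Scopes[Ctx.getOrInsertSyncScopeID("")]; // Scope::System
    // Looking up an ID that was never interned takes the fatal-error path
    // in operator[] and reports the offending ID and name.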