@@ -16939,19 +16939,60 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16939
16939
case AtomicRMWInst::UDecWrap: {
16940
16940
if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16941
16941
AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16942
- // Always expand system scope atomics.
16943
- if (HasSystemScope) {
16944
- if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16945
- Op == AtomicRMWInst::Xor) {
16946
- // Atomic sub/or/xor do not work over PCI express, but atomic add
16947
- // does. InstCombine transforms these with 0 to or, so undo that.
16948
- if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16949
- ConstVal && ConstVal->isNullValue())
16950
- return AtomicExpansionKind::Expand;
16951
- }
16952
-
16953
- return AtomicExpansionKind::CmpXChg;
16942
+ // On most subtargets, for atomicrmw operations other than add/xchg,
16943
+ // whether or not the instructions will behave correctly depends on where
16944
+ // the address physically resides and what interconnect is used in the
16945
+ // system configuration. On some targets the instruction will nop,
16946
+ // and in others synchronization will only occur at degraded device scope.
16947
+ //
16948
+ // If the allocation is known local to the device, the instructions should
16949
+ // work correctly.
16950
+ if (RMW->hasMetadata("amdgpu.no.remote.memory"))
16951
+ return atomicSupportedIfLegalIntType(RMW);
16952
+
16953
+ // If fine-grained remote memory works at device scope, we don't need to
16954
+ // do anything.
16955
+ if (!HasSystemScope &&
16956
+ Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
16957
+ return atomicSupportedIfLegalIntType(RMW);
16958
+
16959
+ // If we are targeting a remote allocated address, it depends what kind of
16960
+ // allocation the address belongs to.
16961
+ //
16962
+ // If the allocation is fine-grained (in host memory, or in PCIe peer
16963
+ // device memory), the operation will fail depending on the target.
16964
+ //
16965
+ // Note fine-grained host memory access does work on APUs or if XGMI is
16966
+ // used, but we do not know if we are targeting an APU or the system
16967
+ // configuration from the ISA version/target-cpu.
16968
+ if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
16969
+ return atomicSupportedIfLegalIntType(RMW);
16970
+
16971
+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16972
+ Op == AtomicRMWInst::Xor) {
16973
+ // Atomic sub/or/xor do not work over PCI express, but atomic add
16974
+ // does. InstCombine transforms these with 0 to or, so undo that.
16975
+ if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16976
+ ConstVal && ConstVal->isNullValue())
16977
+ return AtomicExpansionKind::Expand;
16954
16978
}
16979
+
16980
+ // If the allocation could be in remote, fine-grained memory, the rmw
16981
+ // instructions may fail. cmpxchg should work, so emit that. On some
16982
+ // system configurations, PCIe atomics aren't supported so cmpxchg won't
16983
+ // even work, so you're out of luck anyway.
16984
+
16985
+ // In summary:
16986
+ //
16987
+ // Cases that may fail:
16988
+ // - fine-grained pinned host memory
16989
+ // - fine-grained migratable host memory
16990
+ // - fine-grained PCIe peer device
16991
+ //
16992
+ // Cases that should work, but may be treated overly conservatively:
16993
+ // - fine-grained host memory on an APU
16994
+ // - fine-grained XGMI peer device
16995
+ return AtomicExpansionKind::CmpXChg;
16955
16996
}
16956
16997
16957
16998
return atomicSupportedIfLegalIntType(RMW);
0 commit comments