@@ -730,31 +730,63 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
// Lowering for non-SequentiallyConsistent Operations
//
- // | Atomic  | Volatile | Statespace                    | Lowering sm_60- | Lowering sm_70+                                      |
- // |---------|----------|-------------------------------|-----------------|------------------------------------------------------|
- // | No      | No       | All                           | plain           | .weak                                                |
- // | No      | Yes      | Generic / Shared / Global [0] | .volatile       | .volatile                                            |
- // | No      | Yes      | Local / Const / Param         | plain [1]       | .weak [1]                                            |
- // | Relaxed | No       | Generic / Shared / Global [0] | .volatile       | <atomic sem>                                         |
- // | Other   | No       | Generic / Shared / Global [0] | Error [2]       | <atomic sem>                                         |
- // | Yes     | No       | Local / Const / Param         | plain [1]       | .weak [1]                                            |
- // | Relaxed | Yes      | Generic / Shared [0]          | .volatile       | .volatile                                            |
- // | Relaxed | Yes      | Global [0]                    | .volatile       | .mmio.relaxed.sys (PTX 8.2+) or .volatile (PTX 8.1-) |
- // | Relaxed | Yes      | Local / Const / Param         | plain [1]       | .weak [1]                                            |
- // | Other   | Yes      | Generic / Shared / Global [0] | Error [2]       | <atomic sem> [3]                                     |
+ // | Atomic  | Volatile | Statespace         | PTX sm_60- | PTX sm_70+                   |
+ // |---------|----------|--------------------|------------|------------------------------|
+ // | No      | No       | All                | plain      | .weak                        |
+ // | No      | Yes      | Generic,Shared,    | .volatile  | .volatile                    |
+ // |         |          |   Global [0]       |            |                              |
+ // | No      | Yes      | Local,Const,Param  | plain [1]  | .weak [1]                    |
+ // | Relaxed | No       | Generic,Shared,    |            |                              |
+ // |         |          |   Global [0]       | .volatile  | <atomic sem>                 |
+ // | Other   | No       | Generic,Shared,    | Error [2]  | <atomic sem>                 |
+ // |         |          |   Global [0]       |            |                              |
+ // | Yes     | No       | Local,Const,Param  | plain [1]  | .weak [1]                    |
+ // | Relaxed | Yes      | Generic,Shared [0] | .volatile  | .volatile                    |
+ // | Relaxed | Yes      | Global [0]         | .volatile  | .mmio.relaxed.sys (PTX 8.2+) |
+ // |         |          |                    |            |   or .volatile (PTX 8.1-)    |
+ // | Relaxed | Yes      | Local,Const,Param  | plain [1]  | .weak [1]                    |
+ // | Other   | Yes      | Generic,Shared,    | Error [2]  | <atomic sem> [3]             |
+ // |         |          |   Global [0]       |            |                              |
// clang-format on
- // [0]: volatile and atomics are only supported on generic addressing to
- //      shared or global, or shared, or global.
- //      MMIO requires generic addressing to global or global, but
- //      (TODO) we only implement it for global.
+ // [0]: volatile and atomics are only supported on global or shared
+ //      memory locations, accessed via generic/shared/global pointers.
+ //      MMIO is only supported on global memory locations,
+ //      accessed via generic/global pointers.
+ //      TODO: Implement MMIO access via generic pointer to global.
+ //      Currently implemented for global pointers only.
- // [1]: TODO: this implementation exhibits PTX Undefined Behavior; it
- //      fails to preserve the side-effects of atomics and volatile
- //      accesses in LLVM IR to local / const / param, causing
- //      well-formed LLVM-IR & CUDA C++ programs to be miscompiled
- //      in sm_70+.
+ // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
+ //      PTX instructions fails to preserve their C++ side-effects.
+ //
+ //      Example (https://github.com/llvm/llvm-project/issues/62057):
+ //
+ //        void example() {
+ //          std::atomic<bool> True = true;
+ //          while (True.load(std::memory_order_relaxed));
+ //        }
+ //
+ //      A C++ program that calls "example" is well-defined: the infinite
+ //      loop performs an atomic operation. By lowering volatile/atomics to
+ //      "weak" memory operations, we are transforming the above into:
+ //
+ //        void undefined_behavior() {
+ //          bool True = true;
+ //          while (True);
+ //        }
+ //
+ //      which exhibits undefined behavior in both C++ and PTX.
+ //
+ //      Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
+ //      behavior due to the lack of Independent Forward Progress. Lowering
+ //      these to weak memory operations in sm_60- is therefore fine.
+ //
+ //      TODO: lower atomic and volatile operations on memory locations in
+ //      local, const, and param space to two PTX operations in sm_70+:
+ //      - the "weak" memory operation we are currently lowering to, and
+ //      - some other memory operation that preserves the side-effect, e.g.,
+ //        a dummy volatile load.
if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
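
As a reading aid for the table above, here is a minimal C++ sketch of two of its rows. This is not code from the patch: the function and variable names are invented, and the PTX mnemonics in the comments are assumptions based on the PTX ISA rather than verified compiler output.

```c++
#include <atomic>

// Hypothetical illustration of two table rows; assumes both parameters
// refer to locations in the global statespace.
void table_rows(std::atomic<int> &flag, volatile int &status) {
  // Row "Relaxed | No | Global": expected to lower to ld.volatile.global
  // on sm_60-, and on sm_70+ to an instruction carrying the atomic
  // semantics, e.g. ld.relaxed.sys.global.
  int observed_flag = flag.load(std::memory_order_relaxed);
  (void)observed_flag;

  // Row "No | Yes | Global": a plain volatile access lowers to a
  // .volatile operation on both sm_60- and sm_70+.
  int observed_status = status;
  (void)observed_status;
}
```

The `.mmio.relaxed.sys` row describes an access that is both volatile and relaxed atomic; plain `std::atomic` has no direct spelling for that combination, so it is omitted from the sketch.
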
@@ -835,17 +867,15 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
}
case AtomicOrdering::SequentiallyConsistent:
case AtomicOrdering::Unordered:
- default: {
// TODO: support AcquireRelease and SequentiallyConsistent
SmallString<256> Msg;
raw_svector_ostream OS(Msg);
OS << "NVPTX backend does not support AtomicOrdering \""
   << toIRString(Ordering) << "\" yet.";
report_fatal_error(OS.str());
}
- }
- report_fatal_error("unreachable");
+ llvm_unreachable("unexpected unhandled case");
}

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
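
For context on the second hunk, the diagnostic pattern it converges on can be reproduced as a self-contained sketch. The helper below is hypothetical, written only for illustration; the headers and the calls it makes are the ones visible in the hunk.

```c++
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical helper mirroring the hunk: emit a formatted fatal error for
// atomic orderings the backend does not handle yet, instead of silently
// miscompiling them.
[[noreturn]] static void reportUnsupportedOrdering(AtomicOrdering Ordering) {
  SmallString<256> Msg;
  raw_svector_ostream OS(Msg);
  OS << "NVPTX backend does not support AtomicOrdering \""
     << toIRString(Ordering) << "\" yet.";
  report_fatal_error(OS.str());
}
```

Swapping the trailing `report_fatal_error("unreachable")` for `llvm_unreachable("unexpected unhandled case")` keeps the message in assert-enabled builds while letting release builds treat the path as genuinely unreachable.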