AtomicExpand: Fix creating invalid ptrmask for fat pointers #94955


Merged: 4 commits into llvm:main, Jun 12, 2024

Conversation

@arsenm (Contributor) commented Jun 10, 2024:

The ptrmask intrinsic requires the integer mask to be the index size, not the pointer size.
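
For illustration (a minimal sketch; the function name is made up), a well-formed use of the intrinsic for an AMDGPU buffer fat pointer, where the pointer is 160 bits wide but addrspace(7) has a 32-bit index type:

declare ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7), i32)

define ptr addrspace(7) @align_down_to_4(ptr addrspace(7) %p) {
  ; The mask is i32 (the index size); an i160 mask (the pointer size)
  ; would fail the IR verifier's type check.
  %aligned = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %p, i32 -4)
  ret ptr addrspace(7) %aligned
}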

@llvmbot (Member) commented Jun 10, 2024:

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

The ptrmask intrinsic requires the integer mask to be the index size, not the pointer size.


Patch is 22.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/94955.diff

4 Files Affected:

  • (modified) llvm/lib/CodeGen/AtomicExpandPass.cpp (+1-1)
  • (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll (+104)
  • (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll (+104)
  • (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll (+130)
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index d2b756e82964e..7728cc50fc9f9 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -765,7 +765,7 @@ static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
   assert(ValueSize < MinWordSize);
 
   PointerType *PtrTy = cast<PointerType>(Addr->getType());
-  IntegerType *IntTy = DL.getIntPtrType(Ctx, PtrTy->getAddressSpace());
+  IntegerType *IntTy = DL.getIndexType(Ctx, PtrTy->getAddressSpace());
   Value *PtrLSB;
 
   if (AddrAlign < MinWordSize) {
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
index 0acb8f8d0fcf6..b8196cfcc3510 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
@@ -1262,6 +1262,110 @@ define bfloat @test_atomicrmw_xchg_bf16_global_agent_align4(ptr addrspace(1) %pt
   ret bfloat %res
 }
 
+define i16 @test_atomicrmw_xchg_i16_buffer_fat_agent(ptr addrspace(7) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_xchg_i16_buffer_fat_agent(
+; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[PTR]] to i32
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]]
+; CHECK-NEXT:    [[TMP7:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT:    ret i16 [[EXTRACTED]]
+;
+  %res = atomicrmw xchg ptr addrspace(7) %ptr, i16 %value syncscope("agent") seq_cst
+  ret i16 %res
+}
+
+define i16 @test_atomicrmw_xchg_i16_buffer_fat_agent_align4(ptr addrspace(7) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_xchg_i16_buffer_fat_agent_align4(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(7) [[PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[LOADED]], i32 [[TMP4]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT:    ret i16 [[EXTRACTED]]
+;
+  %res = atomicrmw xchg ptr addrspace(7) %ptr, i16 %value syncscope("agent") seq_cst, align 4
+  ret i16 %res
+}
+
+define i16 @test_atomicrmw_add_i16_buffer_fat_agent(ptr addrspace(7) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_add_i16_buffer_fat_agent(
+; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[PTR]] to i32
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT:    ret i16 [[EXTRACTED]]
+;
+  %res = atomicrmw add ptr addrspace(7) %ptr, i16 %value syncscope("agent") seq_cst
+  ret i16 %res
+}
+
+define i16 @test_atomicrmw_add_i16_buffer_fat_agent_align4(ptr addrspace(7) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_add_i16_buffer_fat_agent_align4(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(7) [[PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[NEW]], 65535
+; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[LOADED]], i32 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT:    ret i16 [[EXTRACTED]]
+;
+  %res = atomicrmw add ptr addrspace(7) %ptr, i16 %value syncscope("agent") seq_cst, align 4
+  ret i16 %res
+}
+
 !0 = !{}
 !1 = !{!"foo", !"bar"}
 !2 = !{!3}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
index 97651c8d23a1e..590ee63001615 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
@@ -1608,3 +1608,107 @@ define i8 @test_atomicrmw_dec_i8_flat_agent_align4(ptr %ptr, i8 %value) {
   %res = atomicrmw udec_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst, align 4
   ret i8 %res
 }
+
+define i8 @test_atomicrmw_xchg_i8_buffer_fat_agent(ptr addrspace(7) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_xchg_i8_buffer_fat_agent(
+; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[PTR]] to i32
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]]
+; CHECK-NEXT:    [[TMP7:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT:    ret i8 [[EXTRACTED]]
+;
+  %res = atomicrmw xchg ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst
+  ret i8 %res
+}
+
+define i8 @test_atomicrmw_xchg_i8_buffer_fat_agent_align4(ptr addrspace(7) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_xchg_i8_buffer_fat_agent_align4(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(7) [[PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[LOADED]], i32 [[TMP4]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT:    ret i8 [[EXTRACTED]]
+;
+  %res = atomicrmw xchg ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst, align 4
+  ret i8 %res
+}
+
+define i8 @test_atomicrmw_add_i8_buffer_fat_agent(ptr addrspace(7) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_add_i8_buffer_fat_agent(
+; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[PTR]] to i32
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT:    ret i8 [[EXTRACTED]]
+;
+  %res = atomicrmw add ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst
+  ret i8 %res
+}
+
+define i8 @test_atomicrmw_add_i8_buffer_fat_agent_align4(ptr addrspace(7) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_add_i8_buffer_fat_agent_align4(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(7) [[PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[NEW]], 255
+; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[LOADED]], i32 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT:    ret i8 [[EXTRACTED]]
+;
+  %res = atomicrmw add ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst, align 4
+  ret i8 %res
+}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index 17318b2c62ca8..34c6cdfc8d9c1 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -4669,6 +4669,136 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr,
   ret void
 }
 
+define half @buffer_atomicrmw_fadd_f16_agent(ptr addrspace(7) %ptr, half %f) {
+; ALL-LABEL: @buffer_atomicrmw_fadd_f16_agent(
+; ALL-NEXT:    [[P:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 4
+; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[P]], i32 -4)
+; ALL-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[P]] to i32
+; ALL-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; ALL-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; ALL-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; ALL-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
+; ALL-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[F:%.*]]
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; ALL-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
+; ALL-NEXT:    ret half [[TMP7]]
+;
+  %p = getelementptr half, ptr addrspace(7) %ptr, i32 4
+  %fadd = atomicrmw fadd ptr addrspace(7) %p, half %f syncscope("agent") seq_cst
+  ret half %fadd
+}
+
+define half @buffer_atomicrmw_fadd_f16_align4_agent(ptr addrspace(7) %ptr, half %f) {
+; ALL-LABEL: @buffer_atomicrmw_fadd_f16_align4_agent(
+; ALL-NEXT:    [[P:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 4
+; ALL-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(7) [[P]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
+; ALL-NEXT:    [[NEW:%.*]] = fadd half [[TMP2]], [[F:%.*]]
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast half [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[P]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
+; ALL-NEXT:    ret half [[TMP5]]
+;
+  %p = getelementptr half, ptr addrspace(7) %ptr, i32 4
+  %fadd = atomicrmw fadd ptr addrspace(7) %p, half %f syncscope("agent") seq_cst, align 4
+  ret half %fadd
+}
+
+define bfloat @buffer_atomicrmw_fadd_bf16_agent(ptr addrspace(7) %ptr, bfloat %f) {
+; ALL-LABEL: @buffer_atomicrmw_fadd_bf16_agent(
+; ALL-NEXT:    [[P:%.*]] = getelementptr bfloat, ptr addrspace(7) [[PTR:%.*]], i32 4
+; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[P]], i32 -4)
+; ALL-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[P]] to i32
+; ALL-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; ALL-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; ALL-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; ALL-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[F:%.*]]
+; ALL-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; ALL-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1...
[truncated]

Review thread on llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll:

@@ -1262,6 +1262,110 @@ define bfloat @test_atomicrmw_xchg_bf16_global_agent_align4(ptr addrspace(1) %pt
  ret bfloat %res
}

define i16 @test_atomicrmw_xchg_i16_buffer_fat_agent(ptr addrspace(7) %ptr, i16 %value) {
; CHECK-LABEL: @test_atomicrmw_xchg_i16_buffer_fat_agent(
; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4)
arsenm (author):

Not sure this is really sensible. I'm surprised this is i32 and not i64, and since this is zero-extended, this is probably nonsense.

arsenm (author):

The codegen seems correct, but mostly because the pre-processing IR pass splits these into the i128 + i32 structs.

Contributor:

This doesn't actually guarantee an aligned pointer, since a ptr addrspace(7) is a concat(i128 rsrc, i32 offset), or, more relevantly, a concat(i32 flags, i32 extent, i16 flags, i48/ptr base_address, i32 offset).

The address an atomic operates on is add(base_address, zext(offset)) for these pointers, so if base_address is unaligned, clearing the low bits of the offset won't do anything.

I'm not sure what, if anything, we'd want to do about this.
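
A worked sketch of that concern, with made-up numbers: if base_address = 0x1002 (not 4-byte aligned) and offset = 7, masking the offset down to 4 yields an effective address of 0x1002 + 0x4 = 0x1006, which is still misaligned. In IR terms, the mask below only touches the 32-bit offset field:

declare ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7), i32)

define ptr addrspace(7) @mask_clears_offset_only(ptr addrspace(7) %p) {
  ; Clears the low two bits of the offset; the base_address inside the
  ; buffer descriptor is left untouched.
  %masked = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %p, i32 -4)
  ret ptr addrspace(7) %masked
}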

arsenm (author):

Unsigned offsets are so problematic. I guess we could interpret this as a mask on the base address instead? But for that, we would want this to be the 64/48-bit mask and not the 32-bit indexing type

Contributor:

AMD only supports unsigned offsets (unless you count those weird SRC2 LDS instructions), but they're also never larger than 32 bits.

Speaking of LDS, does any of this change for those instructions? IIUC, the AtomicExpand pass works for both.

arsenm (author):

global/scratch instructions are supposed to support signed offsets, but it was broken for a few generations.

None of this matters for LDS or those cases since they are normal pointers

@tschuett:

For fat pointers, the type is more complicated. You have to consider the address part and the other components.

@tschuett:

Or forbid ptrmask for fat pointers?

@arsenm (author) commented Jun 10, 2024:

> Or forbid ptrmask for fat pointers?

I do need this to codegen the sub-dword atomicrmw

@arsenm (author) commented Jun 10, 2024:

> For fat pointers, the type is more complicated. You have to consider the address part and the other components.

Right, that's supposed to be expressed by the index type. Looking at the codegen for p7/p8 fat pointers, I think this actually works as expected.

@tschuett:

Is the address part guaranteed to be the lower bits?

@arsenm (author) commented Jun 10, 2024:

> Is the address part guaranteed to be the lower bits?

It shouldn't be, but a target that wants something different should deal with that in codegen

@jyknight (Member):

I'm not familiar with the intended semantics of a fat pointer. Does llvm.ptrmask not actually work to align a fat-pointer? What does "ptrtoint" mean on one of these fat pointers?

@arsenm (author) commented Jun 10, 2024:

> What does "ptrtoint" mean on one of these fat pointers?

Originally, it was a verifier error but that changed in ac81cb7
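
One consistent reading after that change (a sketch, not normative text): ptrtoint exposes the pointer's full bit representation, and converting to a narrower integer truncates, so for a 160-bit addrspace(7) pointer a conversion to i32 yields the low 32 bits, i.e. the offset field:

define i32 @offset_bits(ptr addrspace(7) %p) {
  %full = ptrtoint ptr addrspace(7) %p to i160 ; full rsrc+offset representation
  %off = trunc i160 %full to i32               ; low 32 bits: the offset
  ret i32 %off
}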

@tschuett:

Should you invoke an AMD intrinsic to align a potentially fat pointer instead of using ptrmask?

@jrtc27 (Collaborator) commented Jun 10, 2024:

Part of the whole point behind ptrmask’s existence is to work with fat pointers

@jrtc27 (Collaborator) commented Jun 10, 2024:

Anyway, I encourage people in this thread to go and read LangRef (https://llvm.org/docs/LangRef.html#llvm-ptrmask-intrinsic), which is clear on the matter
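
Paraphrasing the LangRef text being cited (a sketch of the documented equivalence, not the verbatim wording): the mask operand must match the index type of the pointer's address space, only the low (index) bits of the pointer are affected, and the result is based on the same underlying object. Roughly:

define ptr addrspace(7) @ptrmask_sketch(ptr addrspace(7) %p, i32 %mask) {
  %addr = ptrtoint ptr addrspace(7) %p to i32 ; low (index) bits of %p
  %masked = and i32 %addr, %mask
  %diff = sub i32 %masked, %addr
  ; Offset %p by the difference; provenance is preserved.
  %r = getelementptr i8, ptr addrspace(7) %p, i32 %diff
  ret ptr addrspace(7) %r
}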

@tschuett:

LangRef makes me believe it only masks the lower bits. That might not work for all fat pointers.

@jrtc27 (Collaborator) commented Jun 10, 2024:

It’s written to the IR model that the address is in the low bits, because that’s a sensible thing to do. Even if your hardware doesn’t actually do that you can still model it in the IR as doing so and move bits around as needed in the backend. And if someone really wants to expose hardware that doesn’t do that to the IR they can be the one to update LangRef to be more general. Fundamentally ptrmask is about masking the address, it’s just written that way because it’s simplifying.

@krzysz00 (Contributor):

Right, but ptr addrspace(7) isn't a fat pointer in the CHERI [context] (+) [address] style. The point is to track the offset to a ptr addrspace(8) (aka V#), which has a base address in its lower 48 bits ... but the hardware doesn't want you operating on that address. You're supposed to construct a 32-bit offset instead, which is entirely separate (for one thing, a V# must be wave-uniform, the offset doesn't need to be).

@jrtc27 (Collaborator) commented Jun 10, 2024:

In fact, our original CHERI-MIPS was an example of exactly that. Our old 256-bit capability representation had the address in the middle of the representation, but this scheme would have lowered to it just fine.

@krzysz00 (Contributor):

@arsenm That being said, this gets much simpler if we say "the base part of a ptr addrspace(7)'s buffer descriptor must be word-aligned on pain of UB" by analogy with image descriptors

@jrtc27 (Collaborator) commented Jun 10, 2024:

> Right, but ptr addrspace(7) isn't a fat pointer in the CHERI [context] (+) [address] style. The point is to track the offset to a ptr addrspace(8) (aka V#), which has a base address in its lower 48 bits ... but the hardware doesn't want you operating on that address. You're supposed to construct a 32-bit offset instead, which is entirely separate (for one thing, a V# must be wave-uniform, the offset doesn't need to be).

I can’t speak for AMDGPU specifics, just that this code change is correct with respect to LangRef, and that LangRef’s model works for CHERI’s fat pointers.

@arsenm (author) commented Jun 10, 2024:

> I can’t speak for AMDGPU specifics, just that this code change is correct with respect to LangRef, and that LangRef’s model works for CHERI’s fat pointers.

Right, this currently avoids a verifier error and in theory should work. Whether the system as a whole works is an AMDGPU codegen issue for another time

@jyknight (Member):

> Right, this currently avoids a verifier error and in theory should work.

It's certainly clear that getIndexType is the correct type for ptrmask. So, this change is definitely "more correct" in that regard. But it's less clear to me (yet) that this code will actually work on architectures where there's a relevant difference.

> It’s written to the IR model that the address is in the low bits

Where is that in the LangRef?

I always thought it should be the case, but I don't see where it says that it is. The behavior of ptrmask is specified in terms of ptrtoint. And ptrtoint doesn't seem to say anything about the low-bits of the result representing the low-bits of the underlying address?

@arsenm (author) commented Jun 10, 2024:

The intention is certainly to handle anything exotic, but the current SelectionDAGBuilder implementation just inserts an extend and an and, which a target would need some way of overriding.

I also suspect this extension code in the DAGBuilder is dead code. I believe the verifier check that the mask type matches the index type is new since I last looked at this code

@jrtc27 (Collaborator) commented Jun 10, 2024:

> > Right, this currently avoids a verifier error and in theory should work.
>
> It's certainly clear that getIndexType is the correct type for ptrmask. So, this change is definitely "more correct" in that regard. But it's less clear to me (yet) that this code will actually work on architectures where there's a relevant difference.
>
> > It’s written to the IR model that the address is in the low bits
>
> Where is that in the LangRef?
>
> I always thought it should be the case, but I don't see where it says that it is. The behavior of ptrmask is specified in terms of ptrtoint. And ptrtoint doesn't seem to say anything about the low-bits of the result representing the low-bits of the underlying address?

There's a lot that isn't there that should be, but that shouldn't prevent us from making incremental improvements like this. In terms of ptrtoint being used, that's just because there is no standard LLVM way to get just the address without the complication that is ptrtoint's semantics.

@jyknight (Member):

OK -- well, the 1-line change to AtomicExpand LGTM.

I'll defer to AMDGPU experts about the AMDGPU tests. If those are controversial or need more work, I'd also be OK with just taking the 1-line patch without those tests.

@arsenm (author) commented Jun 11, 2024:

> I'll defer to AMDGPU experts about the AMDGPU tests. If those are controversial or need more work, I'd also be OK with just taking the 1-line patch without those tests.

Even if the codegen were wrong here, step 1 would be to precommit these showing it's wrong anyway

@arichardson (Member) left a comment:

The change looks good to me and matches my understanding of what should be more explicit in the LangRef.

A while ago I tried to make the index<->address relationship more explicit (https://reviews.llvm.org/D135158), but there was no consensus to land that.

@arsenm merged commit f3afdc4 into llvm:main Jun 12, 2024
5 of 6 checks passed
@arsenm deleted the atomic-expand-fat-pointers branch June 12, 2024 08:45