Skip to content

Commit 8088aab

Browse files
committed
AMDGPU: Do not bitcast atomicrmw in IR
This is the first step toward eliminating shouldCastAtomicRMWIInIR. That hook and the other atomic expand casting hooks should be removed: they add duplicate legalization machinery and interfaces. Legalizing these operations is already what codegen is supposed to do, and it already does so for the promotion case. In the case of atomicrmw xchg, there seems to be some benefit to having the bitcasts moved outside of the cmpxchg loop on targets with separate int and FP registers, which we should be able to deal with by directly checking for the legality of the underlying operation. The casting path was also losing metadata when it recreated the instruction.
1 parent 30367cb commit 8088aab

File tree

7 files changed

+88
-64
lines changed

7 files changed

+88
-64
lines changed

llvm/lib/CodeGen/AtomicExpandPass.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -909,9 +909,10 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
909909
Value *ValOperand_Shifted = nullptr;
910910
if (Op == AtomicRMWInst::Xchg || Op == AtomicRMWInst::Add ||
911911
Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Nand) {
912+
Value *ValOp = Builder.CreateBitCast(AI->getValOperand(), PMV.IntValueType);
912913
ValOperand_Shifted =
913-
Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
914-
PMV.ShiftAmt, "ValOperand_Shifted");
914+
Builder.CreateShl(Builder.CreateZExt(ValOp, PMV.WordType), PMV.ShiftAmt,
915+
"ValOperand_Shifted");
915916
}
916917

917918
auto PerformPartwordOp = [&](IRBuilderBase &Builder, Value *Loaded) {

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5988,6 +5988,13 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
59885988
case AtomicRMWInst::FMax:
59895989
case AtomicRMWInst::FMin:
59905990
return AtomicExpansionKind::CmpXChg;
5991+
case AtomicRMWInst::Xchg: {
5992+
const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
5993+
unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
5994+
if (ValSize == 32 || ValSize == 64)
5995+
return AtomicExpansionKind::None;
5996+
return AtomicExpansionKind::CmpXChg;
5997+
}
59915998
default: {
59925999
if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
59936000
unsigned Size = IntTy->getBitWidth();

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,10 @@ class AMDGPUTargetLowering : public TargetLowering {
236236
return AtomicExpansionKind::None;
237237
}
238238

239+
AtomicExpansionKind shouldCastAtomicRMWIInIR(AtomicRMWInst *) const override {
240+
return AtomicExpansionKind::None;
241+
}
242+
239243
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
240244
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
241245

llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@
1616
define float @test_atomicrmw_xchg_f32_global_system(ptr addrspace(1) %ptr, float %value) {
1717
; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system(
1818
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
19-
; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
20-
; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
21-
; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float
19+
; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4
2220
; COMMON-NEXT: ret float [[RES]]
2321
;
2422
%res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst
@@ -29,9 +27,7 @@ define float @test_atomicrmw_xchg_f32_global_system(ptr addrspace(1) %ptr, float
2927
define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
3028
; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory(
3129
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
32-
; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
33-
; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
34-
; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float
30+
; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
3531
; COMMON-NEXT: ret float [[RES]]
3632
;
3733
%res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -42,9 +38,7 @@ define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memo
4238
define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
4339
; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory(
4440
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
45-
; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
46-
; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
47-
; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float
41+
; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
4842
; COMMON-NEXT: ret float [[RES]]
4943
;
5044
%res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0
@@ -55,9 +49,7 @@ define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory(ptr
5549
define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
5650
; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
5751
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
58-
; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
59-
; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
60-
; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float
52+
; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
6153
; COMMON-NEXT: ret float [[RES]]
6254
;
6355
%res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
@@ -268,7 +260,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
268260
;
269261
; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
270262
; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
271-
; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
263+
; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
272264
; GFX940-NEXT: ret float [[RES]]
273265
;
274266
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
@@ -3713,5 +3705,19 @@ attributes #1 = { "denormal-fp-mode-f32"="dynamic,dynamic" }
37133705

37143706
!0 = !{}
37153707
;.
3708+
; GFX803: [[META0]] = !{}
3709+
;.
3710+
; GFX906: [[META0]] = !{}
3711+
;.
3712+
; GFX908: [[META0]] = !{}
3713+
;.
3714+
; GFX90A: [[META0]] = !{}
3715+
;.
37163716
; GFX940: [[META0]] = !{}
37173717
;.
3718+
; GFX10: [[META0]] = !{}
3719+
;.
3720+
; GFX11: [[META0]] = !{}
3721+
;.
3722+
; GFX12: [[META0]] = !{}
3723+
;.

llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@
1616
define double @test_atomicrmw_xchg_f64_global_system(ptr addrspace(1) %ptr, double %value) {
1717
; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system(
1818
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
19-
; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
20-
; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
21-
; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double
19+
; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8
2220
; COMMON-NEXT: ret double [[RES]]
2321
;
2422
%res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst
@@ -29,9 +27,7 @@ define double @test_atomicrmw_xchg_f64_global_system(ptr addrspace(1) %ptr, doub
2927
define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
3028
; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory(
3129
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
32-
; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
33-
; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
34-
; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double
30+
; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
3531
; COMMON-NEXT: ret double [[RES]]
3632
;
3733
%res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -42,9 +38,7 @@ define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_mem
4238
define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
4339
; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory(
4440
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
45-
; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
46-
; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
47-
; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double
41+
; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
4842
; COMMON-NEXT: ret double [[RES]]
4943
;
5044
%res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0
@@ -55,9 +49,7 @@ define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory(pt
5549
define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
5650
; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
5751
; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
58-
; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
59-
; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
60-
; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double
52+
; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
6153
; COMMON-NEXT: ret double [[RES]]
6254
;
6355
%res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
@@ -268,7 +260,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
268260
;
269261
; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
270262
; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
271-
; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
263+
; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
272264
; GFX940-NEXT: ret double [[RES]]
273265
;
274266
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
@@ -1681,5 +1673,19 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
16811673

16821674
!0 = !{}
16831675
;.
1676+
; GFX803: [[META0]] = !{}
1677+
;.
1678+
; GFX906: [[META0]] = !{}
1679+
;.
1680+
; GFX908: [[META0]] = !{}
1681+
;.
1682+
; GFX90A: [[META0]] = !{}
1683+
;.
16841684
; GFX940: [[META0]] = !{}
16851685
;.
1686+
; GFX10: [[META0]] = !{}
1687+
;.
1688+
; GFX11: [[META0]] = !{}
1689+
;.
1690+
; GFX12: [[META0]] = !{}
1691+
;.

llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -697,15 +697,15 @@ define i16 @test_atomicrmw_dec_i16_flat_system_align4(ptr %ptr, i16 %value) {
697697

698698
define half @test_atomicrmw_xchg_f16_global_system(ptr addrspace(1) %ptr, half %value) {
699699
; CHECK-LABEL: @test_atomicrmw_xchg_f16_global_system(
700-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
701700
; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
702-
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
703-
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
704-
; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
705-
; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
701+
; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
702+
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
703+
; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
704+
; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
706705
; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
707706
; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
708-
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
707+
; CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
708+
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
709709
; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
710710
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
711711
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -752,15 +752,15 @@ define half @test_atomicrmw_xchg_f16_global_system_align4(ptr addrspace(1) %ptr,
752752

753753
define half @test_atomicrmw_xchg_f16_flat_system(ptr %ptr, half %value) {
754754
; CHECK-LABEL: @test_atomicrmw_xchg_f16_flat_system(
755-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
756755
; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
757-
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PTR]] to i64
758-
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
759-
; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
760-
; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
756+
; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
757+
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
758+
; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
759+
; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
761760
; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
762761
; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
763-
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
762+
; CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
763+
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
764764
; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
765765
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
766766
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -807,15 +807,15 @@ define half @test_atomicrmw_xchg_f16_flat_system_align4(ptr %ptr, half %value) {
807807

808808
define bfloat @test_atomicrmw_xchg_bf16_flat_system(ptr %ptr, bfloat %value) {
809809
; CHECK-LABEL: @test_atomicrmw_xchg_bf16_flat_system(
810-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
811810
; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
812-
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PTR]] to i64
813-
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
814-
; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
815-
; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
811+
; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
812+
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
813+
; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
814+
; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
816815
; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
817816
; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
818-
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
817+
; CHECK-NEXT: [[TMP3:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
818+
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
819819
; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
820820
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
821821
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]

llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1016,15 +1016,15 @@ define i16 @test_atomicrmw_dec_i16_flat_agent_align4(ptr %ptr, i16 %value) {
10161016

10171017
define half @test_atomicrmw_xchg_f16_global_agent(ptr addrspace(1) %ptr, half %value) {
10181018
; CHECK-LABEL: @test_atomicrmw_xchg_f16_global_agent(
1019-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
10201019
; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
1021-
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
1022-
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
1023-
; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
1024-
; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
1020+
; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
1021+
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
1022+
; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
1023+
; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
10251024
; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
10261025
; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
1027-
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
1026+
; CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
1027+
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
10281028
; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
10291029
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
10301030
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -1071,15 +1071,15 @@ define half @test_atomicrmw_xchg_f16_global_agent_align4(ptr addrspace(1) %ptr,
10711071

10721072
define half @test_atomicrmw_xchg_f16_flat_agent(ptr %ptr, half %value) {
10731073
; CHECK-LABEL: @test_atomicrmw_xchg_f16_flat_agent(
1074-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
10751074
; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
1076-
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PTR]] to i64
1077-
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
1078-
; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
1079-
; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
1075+
; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
1076+
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
1077+
; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
1078+
; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
10801079
; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
10811080
; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
1082-
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
1081+
; CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
1082+
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
10831083
; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
10841084
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
10851085
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -1126,15 +1126,15 @@ define half @test_atomicrmw_xchg_f16_flat_agent_align4(ptr %ptr, half %value) {
11261126

11271127
define bfloat @test_atomicrmw_xchg_bf16_global_agent(ptr addrspace(1) %ptr, bfloat %value) {
11281128
; CHECK-LABEL: @test_atomicrmw_xchg_bf16_global_agent(
1129-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
11301129
; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
1131-
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
1132-
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
1133-
; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
1134-
; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
1130+
; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
1131+
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
1132+
; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
1133+
; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
11351134
; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
11361135
; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
1137-
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
1136+
; CHECK-NEXT: [[TMP3:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
1137+
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
11381138
; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
11391139
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
11401140
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]

0 commit comments

Comments
 (0)