
Commit cc94ebe

AMDGPU: Do not bitcast atomicrmw in IR
This is the first step to eliminating shouldCastAtomicRMWIInIR. This hook and the other atomic expand casting hooks should be removed; they add duplicate legalization machinery and interfaces. This is already what codegen is supposed to do, and it already does so for the promotion case.

In the case of atomicrmw xchg, there seems to be some benefit to having the bitcasts moved outside of the cmpxchg loop on targets with separate int and FP registers, which we should be able to handle by directly checking the legality of the underlying operation.

The casting path was also losing metadata when it recreated the instruction.
1 parent 38f9c01 commit cc94ebe
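
In IR terms, the effect is visible in the updated f32 tests below: the cast hook used to rewrite a floating-point exchange through an integer temporary (dropping the operation's metadata in the process), whereas the atomicrmw is now left in its floating-point form. A minimal before/after sketch (value names are illustrative):

  ; before: rewritten through i32 by the cast hook, metadata lost
  %cast = bitcast float %value to i32
  %old = atomicrmw xchg ptr addrspace(1) %ptr, i32 %cast seq_cst, align 4
  %res = bitcast i32 %old to float

  ; after: the FP atomicrmw survives and keeps its metadata
  %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0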

File tree

7 files changed: +88, -64 lines

llvm/lib/CodeGen/AtomicExpandPass.cpp

Lines changed: 3 additions & 2 deletions
@@ -909,9 +909,10 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
   Value *ValOperand_Shifted = nullptr;
   if (Op == AtomicRMWInst::Xchg || Op == AtomicRMWInst::Add ||
       Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Nand) {
+    Value *ValOp = Builder.CreateBitCast(AI->getValOperand(), PMV.IntValueType);
     ValOperand_Shifted =
-        Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
-                          PMV.ShiftAmt, "ValOperand_Shifted");
+        Builder.CreateShl(Builder.CreateZExt(ValOp, PMV.WordType), PMV.ShiftAmt,
+                          "ValOperand_Shifted");
   }
 
   auto PerformPartwordOp = [&](IRBuilderBase &Builder, Value *Loaded) {
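
With this change, the partword expansion bitcasts the value operand to its integer equivalent right before widening and shifting it, instead of depending on a separate casting step. For a half exchange the emitted sequence looks roughly like the following sketch (names are illustrative; compare the expand-atomic-i16 tests further down):

  %intval = bitcast half %value to i16               ; PMV.IntValueType
  %wide = zext i16 %intval to i32                    ; PMV.WordType
  %ValOperand_Shifted = shl i32 %wide, %ShiftAmt     ; shifted into the masked word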

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 7 additions & 0 deletions
@@ -5988,6 +5988,13 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   case AtomicRMWInst::FMax:
   case AtomicRMWInst::FMin:
     return AtomicExpansionKind::CmpXChg;
+  case AtomicRMWInst::Xchg: {
+    const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
+    unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
+    if (ValSize == 32 || ValSize == 64)
+      return AtomicExpansionKind::None;
+    return AtomicExpansionKind::CmpXChg;
+  }
   default: {
     if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
       unsigned Size = IntTy->getBitWidth();
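
The new Xchg case keeps 32- and 64-bit exchanges intact for the backend to select, while other widths still take the cmpxchg expansion path. Roughly, based on the tests in this commit (a sketch, not an exhaustive rule):

  ; 32-bit value: AtomicExpansionKind::None, instruction left untouched
  %r32 = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, align 4

  ; 16-bit value: AtomicExpansionKind::CmpXChg, expanded into a masked cmpxchg loop
  %r16 = atomicrmw xchg ptr addrspace(1) %ptr, half %value seq_cst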

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 4 additions & 0 deletions
@@ -236,6 +236,10 @@ class AMDGPUTargetLowering : public TargetLowering {
     return AtomicExpansionKind::None;
   }
 
+  AtomicExpansionKind shouldCastAtomicRMWIInIR(AtomicRMWInst *) const override {
+    return AtomicExpansionKind::None;
+  }
+
   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
   static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);

llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll

Lines changed: 19 additions & 13 deletions
@@ -16,9 +16,7 @@
 define float @test_atomicrmw_xchg_f32_global_system(ptr addrspace(1) %ptr, float %value) {
 ; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
-; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
-; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4
 ; COMMON-NEXT: ret float [[RES]]
 ;
 %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst
@@ -29,9 +27,7 @@ define float @test_atomicrmw_xchg_f32_global_system(ptr addrspace(1) %ptr, float
 define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
 ; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
-; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
-; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
 ; COMMON-NEXT: ret float [[RES]]
 ;
 %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -42,9 +38,7 @@ define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memo
 define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
 ; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory_access(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
-; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
-; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
 ; COMMON-NEXT: ret float [[RES]]
 ;
 %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory.access !0
@@ -55,9 +49,7 @@ define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory_acc
 define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, float %value) {
 ; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
-; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
-; COMMON-NEXT: [[RES:%.*]] = bitcast i32 [[TMP2]] to float
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
 ; COMMON-NEXT: ret float [[RES]]
 ;
 %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
@@ -268,7 +260,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ;
 ; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
 ; GFX940-NEXT: ret float [[RES]]
 ;
 ; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
@@ -3713,5 +3705,19 @@ attributes #1 = { "denormal-fp-mode-f32"="dynamic,dynamic" }
 
 !0 = !{}
 ;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
 ; GFX940: [[META0]] = !{}
 ;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.

llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll

Lines changed: 19 additions & 13 deletions
@@ -16,9 +16,7 @@
 define double @test_atomicrmw_xchg_f64_global_system(ptr addrspace(1) %ptr, double %value) {
 ; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
-; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
-; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8
 ; COMMON-NEXT: ret double [[RES]]
 ;
 %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst
@@ -29,9 +27,7 @@ define double @test_atomicrmw_xchg_f64_global_system(ptr addrspace(1) %ptr, doub
 define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
 ; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
-; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
-; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
 ; COMMON-NEXT: ret double [[RES]]
 ;
 %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -42,9 +38,7 @@ define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_mem
 define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
 ; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory_access(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
-; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
-; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
 ; COMMON-NEXT: ret double [[RES]]
 ;
 %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory.access !0
@@ -55,9 +49,7 @@ define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory_ac
 define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(ptr addrspace(1) %ptr, double %value) {
 ; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory_access(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
-; COMMON-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
-; COMMON-NEXT: [[RES:%.*]] = bitcast i64 [[TMP2]] to double
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
 ; COMMON-NEXT: ret double [[RES]]
 ;
 %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory.access !0
@@ -268,7 +260,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ;
 ; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
 ; GFX940-NEXT: ret double [[RES]]
 ;
 ; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
@@ -1681,5 +1673,19 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
 
 !0 = !{}
 ;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
 ; GFX940: [[META0]] = !{}
 ;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.

llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll

Lines changed: 18 additions & 18 deletions
@@ -697,15 +697,15 @@ define i16 @test_atomicrmw_dec_i16_flat_system_align4(ptr %ptr, i16 %value) {
 
 define half @test_atomicrmw_xchg_f16_global_system(ptr addrspace(1) %ptr, half %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_f16_global_system(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
 ; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -752,15 +752,15 @@ define half @test_atomicrmw_xchg_f16_global_system_align4(ptr addrspace(1) %ptr,
 
 define half @test_atomicrmw_xchg_f16_flat_system(ptr %ptr, half %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_f16_flat_system(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
 ; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -807,15 +807,15 @@ define half @test_atomicrmw_xchg_f16_flat_system_align4(ptr %ptr, half %value) {
 
 define bfloat @test_atomicrmw_xchg_bf16_flat_system(ptr %ptr, bfloat %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_bf16_flat_system(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
 ; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]

llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll

Lines changed: 18 additions & 18 deletions
@@ -902,15 +902,15 @@ define i16 @test_atomicrmw_dec_i16_flat_agent_align4(ptr %ptr, i16 %value) {
 
 define half @test_atomicrmw_xchg_f16_global_agent(ptr addrspace(1) %ptr, half %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_f16_global_agent(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
 ; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -957,15 +957,15 @@ define half @test_atomicrmw_xchg_f16_global_agent_align4(ptr addrspace(1) %ptr,
 
 define half @test_atomicrmw_xchg_f16_flat_agent(ptr %ptr, half %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_f16_flat_agent(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
 ; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -1012,15 +1012,15 @@ define half @test_atomicrmw_xchg_f16_flat_agent_align4(ptr %ptr, half %value) {
 
 define bfloat @test_atomicrmw_xchg_bf16_global_agent(ptr addrspace(1) %ptr, bfloat %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_bf16_global_agent(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
 ; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
