AMDGPU: Do not bitcast atomicrmw in IR

arsenm · arsenm · commit 8088aab1c8c8 · 2024-05-06T10:27:37.000+02:00
This is the first step to eliminating shouldCastAtomicRMWIInIR. This
and the other atomic expand casting hooks should be removed. This adds
duplicate legalization machinery and interfaces. This is already what
codegen is supposed to do, and already does for the promotion case.

In the case of atomicrmw xchg, there seems to be some benefit to having the
bitcasts moved outside of the cmpxchg loop on targets with separate int and FP
registers, which we should be able to deal with by directly checking for
the legality of the underlying operation.

The casting path was also losing metadata when it recreated the instruction.
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -909,9 +909,10 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
   Value *ValOperand_Shifted = nullptr;
   if (Op == AtomicRMWInst::Xchg || Op == AtomicRMWInst::Add ||
       Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Nand) {
+    Value *ValOp = Builder.CreateBitCast(AI->getValOperand(), PMV.IntValueType);
     ValOperand_Shifted =
-        Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
-                          PMV.ShiftAmt, "ValOperand_Shifted");
+        Builder.CreateShl(Builder.CreateZExt(ValOp, PMV.WordType), PMV.ShiftAmt,
+                          "ValOperand_Shifted");
   }
 
   auto PerformPartwordOp = [&](IRBuilderBase &Builder, Value *Loaded) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5988,6 +5988,13 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   case AtomicRMWInst::FMax:
   case AtomicRMWInst::FMin:
     return AtomicExpansionKind::CmpXChg;
+  case AtomicRMWInst::Xchg: {
+    const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
+    unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
+    if (ValSize == 32 || ValSize == 64)
+      return AtomicExpansionKind::None;
+    return AtomicExpansionKind::CmpXChg;
+  }
   default: {
     if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
       unsigned Size = IntTy->getBitWidth();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -236,6 +236,10 @@ class AMDGPUTargetLowering : public TargetLowering {
     return AtomicExpansionKind::None;
   }
 
+  AtomicExpansionKind shouldCastAtomicRMWIInIR(AtomicRMWInst *) const override {
+    return AtomicExpansionKind::None;
+  }
+
   static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
   static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
 
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
@@ -16,9 +16,7 @@
 define float @test_atomicrmw_xchg_f32_global_system(ptr addrspace(1) %ptr, float %value) {
 ; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i32 [[TMP2]] to float
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4
 ; COMMON-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst
@@ -29,9 +27,7 @@ define float @test_atomicrmw_xchg_f32_global_system(ptr addrspace(1) %ptr, float
 define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
 ; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i32 [[TMP2]] to float
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
 ; COMMON-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -42,9 +38,7 @@ define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memo
 define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
 ; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i32 [[TMP2]] to float
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory.access [[META0]]
 ; COMMON-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
@@ -55,9 +49,7 @@ define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory(ptr
 define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
 ; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast float [[VALUE]] to i32
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i32 [[TMP1]] seq_cst, align 4
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i32 [[TMP2]] to float
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
 ; COMMON-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -268,7 +260,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
 ;
 ; GFX940-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
 ; GFX940-NEXT:    ret float [[RES]]
 ;
 ; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
@@ -3713,5 +3705,19 @@ attributes #1 = { "denormal-fp-mode-f32"="dynamic,dynamic" }
 
 !0 = !{}
 ;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
 ; GFX940: [[META0]] = !{}
 ;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
@@ -16,9 +16,7 @@
 define double @test_atomicrmw_xchg_f64_global_system(ptr addrspace(1) %ptr, double %value) {
 ; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i64 [[TMP2]] to double
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8
 ; COMMON-NEXT:    ret double [[RES]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst
@@ -29,9 +27,7 @@ define double @test_atomicrmw_xchg_f64_global_system(ptr addrspace(1) %ptr, doub
 define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
 ; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i64 [[TMP2]] to double
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
 ; COMMON-NEXT:    ret double [[RES]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -42,9 +38,7 @@ define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_mem
 define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
 ; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i64 [[TMP2]] to double
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory.access [[META0]]
 ; COMMON-NEXT:    ret double [[RES]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
@@ -55,9 +49,7 @@ define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory(pt
 define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
 ; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
 ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = bitcast double [[VALUE]] to i64
-; COMMON-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], i64 [[TMP1]] seq_cst, align 8
-; COMMON-NEXT:    [[RES:%.*]] = bitcast i64 [[TMP2]] to double
+; COMMON-NEXT:    [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory.access [[META0]]
 ; COMMON-NEXT:    ret double [[RES]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -268,7 +260,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
 ;
 ; GFX940-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
 ; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
 ; GFX940-NEXT:    ret double [[RES]]
 ;
 ; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
@@ -1681,5 +1673,19 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
 
 !0 = !{}
 ;.
+; GFX803: [[META0]] = !{}
+;.
+; GFX906: [[META0]] = !{}
+;.
+; GFX908: [[META0]] = !{}
+;.
+; GFX90A: [[META0]] = !{}
+;.
 ; GFX940: [[META0]] = !{}
 ;.
+; GFX10: [[META0]] = !{}
+;.
+; GFX11: [[META0]] = !{}
+;.
+; GFX12: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll
@@ -697,15 +697,15 @@ define i16 @test_atomicrmw_dec_i16_flat_system_align4(ptr %ptr, i16 %value) {
 
 define half @test_atomicrmw_xchg_f16_global_system(ptr addrspace(1) %ptr, half %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_f16_global_system(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -752,15 +752,15 @@ define half @test_atomicrmw_xchg_f16_global_system_align4(ptr addrspace(1) %ptr,
 
 define half @test_atomicrmw_xchg_f16_flat_system(ptr %ptr, half %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_f16_flat_system(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -807,15 +807,15 @@ define half @test_atomicrmw_xchg_f16_flat_system_align4(ptr %ptr, half %value) {
 
 define bfloat @test_atomicrmw_xchg_bf16_flat_system(ptr %ptr, bfloat %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_bf16_flat_system(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
@@ -1016,15 +1016,15 @@ define i16 @test_atomicrmw_dec_i16_flat_agent_align4(ptr %ptr, i16 %value) {
 
 define half @test_atomicrmw_xchg_f16_global_agent(ptr addrspace(1) %ptr, half %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_f16_global_agent(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1071,15 +1071,15 @@ define half @test_atomicrmw_xchg_f16_global_agent_align4(ptr addrspace(1) %ptr,
 
 define half @test_atomicrmw_xchg_f16_flat_agent(ptr %ptr, half %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_f16_flat_agent(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VALUE:%.*]] to i16
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast half [[VALUE:%.*]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -1126,15 +1126,15 @@ define half @test_atomicrmw_xchg_f16_flat_agent_align4(ptr %ptr, half %value) {
 
 define bfloat @test_atomicrmw_xchg_bf16_global_agent(ptr addrspace(1) %ptr, bfloat %value) {
 ; CHECK-LABEL: @test_atomicrmw_xchg_bf16_global_agent(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
 ; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
 ; CHECK-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast bfloat [[VALUE:%.*]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP4]], [[SHIFTAMT]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]

Original file line number	Diff line number	Diff line change
`@@ -236,6 +236,10 @@ class AMDGPUTargetLowering : public TargetLowering {`
`236`	`236`	`return AtomicExpansionKind::None;`
`237`	`237`	`}`
`238`	`238`
	`239`	`+ AtomicExpansionKind shouldCastAtomicRMWIInIR(AtomicRMWInst *) const override {`
	`240`	`+ return AtomicExpansionKind::None;`
	`241`	`+ }`
	`242`	`+`
`239`	`243`	`static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);`
`240`	`244`	`static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);`
`241`	`245`