
[LoongArch] Set isAsCheapAsAMove on ADDI.D and {X}ORI instructions #94733

Merged: 2 commits into llvm:main from heiher:ascheapasamove on Jun 12, 2024

Conversation

heiher
Member

@heiher heiher commented Jun 7, 2024

No description provided.

@heiher heiher requested review from wangleiat and SixWeining June 7, 2024 07:40
@heiher heiher self-assigned this Jun 7, 2024
@llvmbot
Member

llvmbot commented Jun 7, 2024

@llvm/pr-subscribers-backend-loongarch

Author: hev (heiher)

Changes

Patch is 140.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/94733.diff

9 Files Affected:

  • (modified) llvm/lib/Target/LoongArch/LoongArchInstrInfo.td (+5-5)
  • (modified) llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll (+1-1)
  • (modified) llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll (+10-13)
  • (modified) llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll (+628-1168)
  • (modified) llvm/test/CodeGen/LoongArch/jump-table.ll (+56-50)
  • (modified) llvm/test/CodeGen/LoongArch/register-coalescer-crash-pr79718.mir (+1-1)
  • (modified) llvm/test/CodeGen/LoongArch/sextw-removal.ll (+38-38)
  • (modified) llvm/test/CodeGen/LoongArch/vararg.ll (+4-4)
  • (modified) llvm/test/CodeGen/LoongArch/vector-fp-imm.ll (+90-97)
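
For context on what the flag in this patch buys: isAsCheapAsAMove tells target-independent passes (the register coalescer, MachineLICM, the allocator's rematerialization logic) that re-executing the instruction costs no more than a register copy, so such a definition can be re-emitted at each use instead of being kept live across calls. Below is a minimal sketch of how that query looks on the C++ side; it is simplified, not the exact LLVM pass code, and the real callers apply further checks.

```cpp
// Simplified sketch, not the exact LLVM pass code: how target-independent
// machinery consumes the flags this patch touches. The TableGen
// `isAsCheapAsAMove = 1` becomes MCID::CheapAsAMove on the MCInstrDesc.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

static bool preferRemat(const MachineInstr &DefMI, const TargetInstrInfo &TII) {
  // isReMaterializable gates legality (and is re-verified by the target's
  // isReallyTriviallyReMaterializable hook, as the ADDI_D comment in the
  // diff notes); isAsCheapAsAMove gates profitability.
  return TII.isTriviallyReMaterializable(DefMI) && TII.isAsCheapAsAMove(DefMI);
}
```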
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index f72f46e39e2a4..f4bdd7d1209cf 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -734,7 +734,7 @@ def ADD_W : ALU_3R<0x00100000>;
 def SUB_W : ALU_3R<0x00110000>;
 def ADDI_W : ALU_2RI12<0x02800000, simm12_addlike>;
 def ALSL_W : ALU_3RI2<0x00040000, uimm2_plus1>;
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
 def LU12I_W : ALU_1RI20<0x14000000, simm20_lu12iw>;
 }
 def SLT  : ALU_3R<0x00120000>;
@@ -751,7 +751,7 @@ def XOR  : ALU_3R<0x00158000>;
 def ANDN : ALU_3R<0x00168000>;
 def ORN  : ALU_3R<0x00160000>;
 def ANDI : ALU_2RI12<0x03400000, uimm12>;
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
 def ORI  : ALU_2RI12<0x03800000, uimm12_ori>;
 def XORI : ALU_2RI12<0x03c00000, uimm12>;
 }
@@ -858,7 +858,7 @@ def ADD_D : ALU_3R<0x00108000>;
 def SUB_D : ALU_3R<0x00118000>;
 // ADDI_D isn't always rematerializable, but isReMaterializable will be used as
 // a hint which is verified in isReallyTriviallyReMaterializable.
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
 def ADDI_D : ALU_2RI12<0x02c00000, simm12_addlike>;
 }
 def ADDU16I_D : ALU_2RI16<0x10000000, simm16>;
@@ -866,12 +866,12 @@ def ALSL_WU : ALU_3RI2<0x00060000, uimm2_plus1>;
 def ALSL_D  : ALU_3RI2<0x002c0000, uimm2_plus1>;
 let Constraints = "$rd = $dst" in {
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
-    isReMaterializable = 1 in
+    isReMaterializable = 1, isAsCheapAsAMove = 1 in
 def LU32I_D : Fmt1RI20<0x16000000, (outs GPR:$dst),
                        (ins GPR:$rd, simm20_lu32id:$imm20),
                        "$rd, $imm20">;
 }
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
 def LU52I_D : ALU_2RI12<0x03000000, simm12_lu52id>;
 }
 def PCADDU18I : ALU_1RI20<0x1e000000, simm20_pcaddu18i>;
diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll b/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll
index 5e89fd41d6a7c..cc6ba057019c6 100644
--- a/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll
+++ b/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll
@@ -28,9 +28,9 @@ define i64 @caller_float_in_fpr() nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi.d $sp, $sp, -16
 ; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT:    ori $a0, $zero, 1
 ; CHECK-NEXT:    movgr2fr.w $fa0, $zero
 ; CHECK-NEXT:    movgr2fr.d $fa1, $zero
+; CHECK-NEXT:    ori $a0, $zero, 1
 ; CHECK-NEXT:    bl %plt(callee_float_in_fpr)
 ; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
diff --git a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll
index a3d85f6bd06d8..49155a3966a84 100644
--- a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll
+++ b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll
@@ -16,7 +16,6 @@ define dso_local noundef signext i32 @main() nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi.d $sp, $sp, -272
 ; CHECK-NEXT:    st.d $ra, $sp, 264 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 256 # 8-byte Folded Spill
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
 ; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI0_0)
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
@@ -33,25 +32,23 @@ define dso_local noundef signext i32 @main() nounwind {
 ; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI0_3)
 ; CHECK-NEXT:    xvld $xr3, $a0, 0
 ; CHECK-NEXT:    xvst $xr3, $sp, 0 # 32-byte Folded Spill
-; CHECK-NEXT:    xvst $xr0, $sp, 128
-; CHECK-NEXT:    xvst $xr1, $sp, 160
-; CHECK-NEXT:    xvst $xr2, $sp, 192
-; CHECK-NEXT:    xvst $xr3, $sp, 224
-; CHECK-NEXT:    addi.d $fp, $sp, 128
-; CHECK-NEXT:    move $a0, $fp
+; CHECK-NEXT:    xvst $xr0, $sp, 136
+; CHECK-NEXT:    xvst $xr1, $sp, 168
+; CHECK-NEXT:    xvst $xr2, $sp, 200
+; CHECK-NEXT:    xvst $xr3, $sp, 232
+; CHECK-NEXT:    addi.d $a0, $sp, 136
 ; CHECK-NEXT:    bl %plt(foo)
 ; CHECK-NEXT:    xvld $xr0, $sp, 96 # 32-byte Folded Reload
-; CHECK-NEXT:    xvst $xr0, $sp, 128
+; CHECK-NEXT:    xvst $xr0, $sp, 136
 ; CHECK-NEXT:    xvld $xr0, $sp, 64 # 32-byte Folded Reload
-; CHECK-NEXT:    xvst $xr0, $sp, 160
+; CHECK-NEXT:    xvst $xr0, $sp, 168
 ; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
-; CHECK-NEXT:    xvst $xr0, $sp, 192
+; CHECK-NEXT:    xvst $xr0, $sp, 200
 ; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
-; CHECK-NEXT:    xvst $xr0, $sp, 224
-; CHECK-NEXT:    move $a0, $fp
+; CHECK-NEXT:    xvst $xr0, $sp, 232
+; CHECK-NEXT:    addi.d $a0, $sp, 136
 ; CHECK-NEXT:    bl %plt(bar)
 ; CHECK-NEXT:    move $a0, $zero
-; CHECK-NEXT:    ld.d $fp, $sp, 256 # 8-byte Folded Reload
 ; CHECK-NEXT:    ld.d $ra, $sp, 264 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 272
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
index 81cc29419a0e0..ea110e8a90c13 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
@@ -297,95 +297,68 @@ define float @float_fmax_acquire(ptr %p) nounwind {
 define double @double_fadd_acquire(ptr %p) nounwind {
 ; LA64F-LABEL: double_fadd_acquire:
 ; LA64F:       # %bb.0:
-; LA64F-NEXT:    addi.d $sp, $sp, -80
-; LA64F-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s2, $sp, 40 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s3, $sp, 32 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s4, $sp, 24 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s5, $sp, 16 # 8-byte Folded Spill
+; LA64F-NEXT:    addi.d $sp, $sp, -48
+; LA64F-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64F-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64F-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
 ; LA64F-NEXT:    move $fp, $a0
-; LA64F-NEXT:    ld.d $s5, $a0, 0
-; LA64F-NEXT:    lu52i.d $s0, $zero, 1023
-; LA64F-NEXT:    ori $s1, $zero, 8
-; LA64F-NEXT:    addi.d $s2, $sp, 8
-; LA64F-NEXT:    addi.d $s3, $sp, 0
-; LA64F-NEXT:    ori $s4, $zero, 2
+; LA64F-NEXT:    ld.d $s0, $a0, 0
 ; LA64F-NEXT:    .p2align 4, , 16
 ; LA64F-NEXT:  .LBB4_1: # %atomicrmw.start
 ; LA64F-NEXT:    # =>This Inner Loop Header: Depth=1
-; LA64F-NEXT:    move $a0, $s5
-; LA64F-NEXT:    move $a1, $s0
+; LA64F-NEXT:    lu52i.d $a1, $zero, 1023
+; LA64F-NEXT:    move $a0, $s0
 ; LA64F-NEXT:    bl %plt(__adddf3)
-; LA64F-NEXT:    st.d $s5, $sp, 8
-; LA64F-NEXT:    st.d $a0, $sp, 0
-; LA64F-NEXT:    move $a0, $s1
+; LA64F-NEXT:    st.d $s0, $sp, 16
+; LA64F-NEXT:    st.d $a0, $sp, 8
+; LA64F-NEXT:    ori $a0, $zero, 8
+; LA64F-NEXT:    addi.d $a2, $sp, 16
+; LA64F-NEXT:    addi.d $a3, $sp, 8
+; LA64F-NEXT:    ori $a4, $zero, 2
+; LA64F-NEXT:    ori $a5, $zero, 2
 ; LA64F-NEXT:    move $a1, $fp
-; LA64F-NEXT:    move $a2, $s2
-; LA64F-NEXT:    move $a3, $s3
-; LA64F-NEXT:    move $a4, $s4
-; LA64F-NEXT:    move $a5, $s4
 ; LA64F-NEXT:    bl %plt(__atomic_compare_exchange)
-; LA64F-NEXT:    ld.d $s5, $sp, 8
+; LA64F-NEXT:    ld.d $s0, $sp, 16
 ; LA64F-NEXT:    beqz $a0, .LBB4_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64F-NEXT:    move $a0, $s5
-; LA64F-NEXT:    ld.d $s5, $sp, 16 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s4, $sp, 24 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s3, $sp, 32 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s2, $sp, 40 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; LA64F-NEXT:    addi.d $sp, $sp, 80
+; LA64F-NEXT:    move $a0, $s0
+; LA64F-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64F-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64F-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64F-NEXT:    addi.d $sp, $sp, 48
 ; LA64F-NEXT:    ret
 ;
 ; LA64D-LABEL: double_fadd_acquire:
 ; LA64D:       # %bb.0:
-; LA64D-NEXT:    addi.d $sp, $sp, -80
-; LA64D-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $s2, $sp, 40 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $s3, $sp, 32 # 8-byte Folded Spill
+; LA64D-NEXT:    addi.d $sp, $sp, -48
+; LA64D-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64D-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
 ; LA64D-NEXT:    fst.d $fs0, $sp, 24 # 8-byte Folded Spill
 ; LA64D-NEXT:    move $fp, $a0
 ; LA64D-NEXT:    fld.d $fa0, $a0, 0
 ; LA64D-NEXT:    addi.d $a0, $zero, 1
 ; LA64D-NEXT:    movgr2fr.d $fa1, $a0
 ; LA64D-NEXT:    ffint.d.l $fs0, $fa1
-; LA64D-NEXT:    ori $s0, $zero, 8
-; LA64D-NEXT:    addi.d $s1, $sp, 16
-; LA64D-NEXT:    addi.d $s2, $sp, 8
-; LA64D-NEXT:    ori $s3, $zero, 2
 ; LA64D-NEXT:    .p2align 4, , 16
 ; LA64D-NEXT:  .LBB4_1: # %atomicrmw.start
 ; LA64D-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA64D-NEXT:    fadd.d $fa1, $fa0, $fs0
 ; LA64D-NEXT:    fst.d $fa0, $sp, 16
 ; LA64D-NEXT:    fst.d $fa1, $sp, 8
-; LA64D-NEXT:    move $a0, $s0
+; LA64D-NEXT:    ori $a0, $zero, 8
+; LA64D-NEXT:    addi.d $a2, $sp, 16
+; LA64D-NEXT:    addi.d $a3, $sp, 8
+; LA64D-NEXT:    ori $a4, $zero, 2
+; LA64D-NEXT:    ori $a5, $zero, 2
 ; LA64D-NEXT:    move $a1, $fp
-; LA64D-NEXT:    move $a2, $s1
-; LA64D-NEXT:    move $a3, $s2
-; LA64D-NEXT:    move $a4, $s3
-; LA64D-NEXT:    move $a5, $s3
 ; LA64D-NEXT:    bl %plt(__atomic_compare_exchange)
 ; LA64D-NEXT:    fld.d $fa0, $sp, 16
 ; LA64D-NEXT:    beqz $a0, .LBB4_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    fld.d $fs0, $sp, 24 # 8-byte Folded Reload
-; LA64D-NEXT:    ld.d $s3, $sp, 32 # 8-byte Folded Reload
-; LA64D-NEXT:    ld.d $s2, $sp, 40 # 8-byte Folded Reload
-; LA64D-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
-; LA64D-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
-; LA64D-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
-; LA64D-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; LA64D-NEXT:    addi.d $sp, $sp, 80
+; LA64D-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64D-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64D-NEXT:    addi.d $sp, $sp, 48
 ; LA64D-NEXT:    ret
   %v = atomicrmw fadd ptr %p, double 1.0 acquire, align 4
   ret double %v
@@ -394,95 +367,68 @@ define double @double_fadd_acquire(ptr %p) nounwind {
 define double @double_fsub_acquire(ptr %p) nounwind {
 ; LA64F-LABEL: double_fsub_acquire:
 ; LA64F:       # %bb.0:
-; LA64F-NEXT:    addi.d $sp, $sp, -80
-; LA64F-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s2, $sp, 40 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s3, $sp, 32 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s4, $sp, 24 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s5, $sp, 16 # 8-byte Folded Spill
+; LA64F-NEXT:    addi.d $sp, $sp, -48
+; LA64F-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64F-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64F-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
 ; LA64F-NEXT:    move $fp, $a0
-; LA64F-NEXT:    ld.d $s5, $a0, 0
-; LA64F-NEXT:    lu52i.d $s0, $zero, -1025
-; LA64F-NEXT:    ori $s1, $zero, 8
-; LA64F-NEXT:    addi.d $s2, $sp, 8
-; LA64F-NEXT:    addi.d $s3, $sp, 0
-; LA64F-NEXT:    ori $s4, $zero, 2
+; LA64F-NEXT:    ld.d $s0, $a0, 0
 ; LA64F-NEXT:    .p2align 4, , 16
 ; LA64F-NEXT:  .LBB5_1: # %atomicrmw.start
 ; LA64F-NEXT:    # =>This Inner Loop Header: Depth=1
-; LA64F-NEXT:    move $a0, $s5
-; LA64F-NEXT:    move $a1, $s0
+; LA64F-NEXT:    lu52i.d $a1, $zero, -1025
+; LA64F-NEXT:    move $a0, $s0
 ; LA64F-NEXT:    bl %plt(__adddf3)
-; LA64F-NEXT:    st.d $s5, $sp, 8
-; LA64F-NEXT:    st.d $a0, $sp, 0
-; LA64F-NEXT:    move $a0, $s1
+; LA64F-NEXT:    st.d $s0, $sp, 16
+; LA64F-NEXT:    st.d $a0, $sp, 8
+; LA64F-NEXT:    ori $a0, $zero, 8
+; LA64F-NEXT:    addi.d $a2, $sp, 16
+; LA64F-NEXT:    addi.d $a3, $sp, 8
+; LA64F-NEXT:    ori $a4, $zero, 2
+; LA64F-NEXT:    ori $a5, $zero, 2
 ; LA64F-NEXT:    move $a1, $fp
-; LA64F-NEXT:    move $a2, $s2
-; LA64F-NEXT:    move $a3, $s3
-; LA64F-NEXT:    move $a4, $s4
-; LA64F-NEXT:    move $a5, $s4
 ; LA64F-NEXT:    bl %plt(__atomic_compare_exchange)
-; LA64F-NEXT:    ld.d $s5, $sp, 8
+; LA64F-NEXT:    ld.d $s0, $sp, 16
 ; LA64F-NEXT:    beqz $a0, .LBB5_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64F-NEXT:    move $a0, $s5
-; LA64F-NEXT:    ld.d $s5, $sp, 16 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s4, $sp, 24 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s3, $sp, 32 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s2, $sp, 40 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; LA64F-NEXT:    addi.d $sp, $sp, 80
+; LA64F-NEXT:    move $a0, $s0
+; LA64F-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64F-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64F-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64F-NEXT:    addi.d $sp, $sp, 48
 ; LA64F-NEXT:    ret
 ;
 ; LA64D-LABEL: double_fsub_acquire:
 ; LA64D:       # %bb.0:
-; LA64D-NEXT:    addi.d $sp, $sp, -80
-; LA64D-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $s2, $sp, 40 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $s3, $sp, 32 # 8-byte Folded Spill
+; LA64D-NEXT:    addi.d $sp, $sp, -48
+; LA64D-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64D-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
 ; LA64D-NEXT:    fst.d $fs0, $sp, 24 # 8-byte Folded Spill
 ; LA64D-NEXT:    move $fp, $a0
 ; LA64D-NEXT:    fld.d $fa0, $a0, 0
 ; LA64D-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI5_0)
 ; LA64D-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI5_0)
 ; LA64D-NEXT:    fld.d $fs0, $a0, 0
-; LA64D-NEXT:    ori $s0, $zero, 8
-; LA64D-NEXT:    addi.d $s1, $sp, 16
-; LA64D-NEXT:    addi.d $s2, $sp, 8
-; LA64D-NEXT:    ori $s3, $zero, 2
 ; LA64D-NEXT:    .p2align 4, , 16
 ; LA64D-NEXT:  .LBB5_1: # %atomicrmw.start
 ; LA64D-NEXT:    # =>This Inner Loop Header: Depth=1
 ; LA64D-NEXT:    fadd.d $fa1, $fa0, $fs0
 ; LA64D-NEXT:    fst.d $fa0, $sp, 16
 ; LA64D-NEXT:    fst.d $fa1, $sp, 8
-; LA64D-NEXT:    move $a0, $s0
+; LA64D-NEXT:    ori $a0, $zero, 8
+; LA64D-NEXT:    addi.d $a2, $sp, 16
+; LA64D-NEXT:    addi.d $a3, $sp, 8
+; LA64D-NEXT:    ori $a4, $zero, 2
+; LA64D-NEXT:    ori $a5, $zero, 2
 ; LA64D-NEXT:    move $a1, $fp
-; LA64D-NEXT:    move $a2, $s1
-; LA64D-NEXT:    move $a3, $s2
-; LA64D-NEXT:    move $a4, $s3
-; LA64D-NEXT:    move $a5, $s3
 ; LA64D-NEXT:    bl %plt(__atomic_compare_exchange)
 ; LA64D-NEXT:    fld.d $fa0, $sp, 16
 ; LA64D-NEXT:    beqz $a0, .LBB5_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    fld.d $fs0, $sp, 24 # 8-byte Folded Reload
-; LA64D-NEXT:    ld.d $s3, $sp, 32 # 8-byte Folded Reload
-; LA64D-NEXT:    ld.d $s2, $sp, 40 # 8-byte Folded Reload
-; LA64D-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
-; LA64D-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
-; LA64D-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
-; LA64D-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; LA64D-NEXT:    addi.d $sp, $sp, 80
+; LA64D-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64D-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64D-NEXT:    addi.d $sp, $sp, 48
 ; LA64D-NEXT:    ret
   %v = atomicrmw fsub ptr %p, double 1.0 acquire, align 4
   ret double %v
@@ -491,71 +437,48 @@ define double @double_fsub_acquire(ptr %p) nounwind {
 define double @double_fmin_acquire(ptr %p) nounwind {
 ; LA64F-LABEL: double_fmin_acquire:
 ; LA64F:       # %bb.0:
-; LA64F-NEXT:    addi.d $sp, $sp, -80
-; LA64F-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s2, $sp, 40 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s3, $sp, 32 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s4, $sp, 24 # 8-byte Folded Spill
-; LA64F-NEXT:    st.d $s5, $sp, 16 # 8-byte Folded Spill
+; LA64F-NEXT:    addi.d $sp, $sp, -48
+; LA64F-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64F-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64F-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
 ; LA64F-NEXT:    move $fp, $a0
-; LA64F-NEXT:    ld.d $s5, $a0, 0
-; LA64F-NEXT:    lu52i.d $s0, $zero, 1023
-; LA64F-NEXT:    ori $s1, $zero, 8
-; LA64F-NEXT:    addi.d $s2, $sp, 8
-; LA64F-NEXT:    addi.d $s3, $sp, 0
-; LA64F-NEXT:    ori $s4, $zero, 2
+; LA64F-NEXT:    ld.d $s0, $a0, 0
 ; LA64F-NEXT:    .p2align 4, , 16
 ; LA64F-NEXT:  .LBB6_1: # %atomicrmw.start
 ; LA64F-NEXT:    # =>This Inner Loop Header: Depth=1
-; LA64F-NEXT:    move $a0, $s5
-; LA64F-NEXT:    move $a1, $s0
+; LA64F-NEXT:    lu52i.d $a1, $zero, 1023
+; LA64F-NEXT:    move $a0, $s0
 ; LA64F-NEXT:    bl %plt(fmin)
-; LA64F-NEXT:    st.d $s5, $sp, 8
-; LA64F-NEXT:    st.d $a0, $sp, 0
-; LA64F-NEXT:    move $a0, $s1
+; LA64F-NEXT:    st.d $s0, $sp, 16
+; LA64F-NEXT:    st.d $a0, $sp, 8
+; LA64F-NEXT:    ori $a0, $zero, 8
+; LA64F-NEXT:    addi.d $a2, $sp, 16
+; LA64F-NEXT:    addi.d $a3, $sp, 8
+; LA64F-NEXT:    ori $a4, $zero, 2
+; LA64F-NEXT:    ori $a5, $zero, 2
 ; LA64F-NEXT:    move $a1, $fp
-; LA64F-NEXT:    move $a2, $s2
-; LA64F-NEXT:    move $a3, $s3
-; LA64F-NEXT:    move $a4, $s4
-; LA64F-NEXT:    move $a5, $s4
 ; LA64F-NEXT:    bl %plt(__atomic_compare_exchange)
-; LA64F-NEXT:    ld.d $s5, $sp, 8
+; LA64F-NEXT:    ld.d $s0, $sp, 16
 ; LA64F-NEXT:    beqz $a0, .LBB6_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64F-NEXT:    move $a0, $s5
-; LA64F-NEXT:    ld.d $s5, $sp, 16 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s4, $sp, 24 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s3, $sp, 32 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s2, $sp, 40 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
-; LA64F-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; LA64F-NEXT:    addi.d $sp, $sp, 80
+; LA64F-NEXT:    move $a0, $s0
+; LA64F-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64F-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64F-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64F-NEXT:    addi.d $sp, $sp, 48
 ; LA64F-NEXT:    ret
 ;
 ; LA64D-LABEL: double_fmin_acquire:
 ; LA64D:       # %bb.0:
-; LA64D-NEXT:    addi.d $sp, $sp, -80
-; LA64D-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $s2, $sp, 40 # 8-byte Folded Spill
-; LA64D-NEXT:    st.d $s3, $sp, 32 # 8-byte Folded Spill
+; LA64D-NEXT:    addi.d $sp, $sp, -48
+; LA64D-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64D-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
 ; LA64D-NEXT:    fst.d $fs0, $sp, 24 # 8-byte Folded Spill
 ; LA64D-NEXT:    move $fp, $a0
 ; LA64D-NEXT:    fld.d $fa0, $a0, 0
 ; LA64D-NEXT:    addi.d $a0, $zero, 1
 ; LA64D-NEXT:    movgr2fr.d $fa1, $a0
 ; LA64D-NEXT:    ffint.d.l $fs0, $fa1
-; LA64D-NEXT:    ori $s0, $zero, 8
-; LA64D-NEXT:    addi.d $s1, $sp, 16
-; LA64D-NEXT:    addi.d $s2, $sp, 8
-; LA64D-NEXT:    ori $s3, $zero, 2
 ; LA64D-NEXT:    .p2align...
[truncated]
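
The test churn above follows one pattern: small constants and stack addresses (ori $a0, $zero, 8; addi.d $a2, $sp, 16; lu52i.d $a1, $zero, 1023; ...) are now rematerialized right before each call instead of being computed once into callee-saved registers ($s0..$s5), which removes the corresponding prologue/epilogue spills and reloads and shrinks the stack frames. A tiny hypothetical example (not from the test suite) showing the same effect:

```cpp
// Hypothetical example of the codegen effect visible throughout the diff:
// the constant argument is live across two calls. Once ORI is marked
// as-cheap-as-a-move, the allocator can re-emit `ori $a0, $zero, 8` before
// each call instead of staging the value in a callee-saved register,
// avoiding an $sX spill/reload in the prologue and epilogue.
extern "C" void callee(long value);

extern "C" void caller() {
  callee(8); // ori $a0, $zero, 8 materialized here...
  callee(8); // ...and re-emitted here, rather than `move $a0, $sX`.
}
```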

@SixWeining
Contributor

About the title: the instruction is lu12i.w, not lu12i.d.

@heiher heiher changed the title [LoongArch] Set isAsCheapAsAMove on LU{12,32,52}I.D/ADDI.D and {X}ORI instructions [LoongArch] Set isAsCheapAsAMove on LU{12,32,52}I/ADDI.D and {X}ORI instructions Jun 7, 2024
@SixWeining
Contributor

Seems lu32i.d is 4x slower than a move (or).

@heiher
Member Author

heiher commented Jun 7, 2024

About the title: the instruction is lu12i.w, not lu12i.d.

Good catch! Dropping the width suffix from the LU{12,32,52}I mnemonics doesn't make the title unclear, so I've removed it. :)

@heiher
Member Author

heiher commented Jun 11, 2024

Seems lu32i.d is 4x slower than a move (or).

Real-hardware tests do not show lu32i.d being slower than a move (or) when both carry the same data dependencies. However, I'd still drop the lu{12,32,52}i instructions from this change, since rematerializing them adds more instructions to the loop.
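
For reference, here is a sketch of the kind of dependency-chain microbenchmark this refers to (a hypothetical harness, not the one actually used): each inline-asm iteration reads and writes the same register, so the serialized loop time approximates the instruction's latency rather than its throughput.

```cpp
// Hypothetical latency microbenchmark for lu32i.d vs. the canonical move
// (`or rd, rs, $zero`) on a LoongArch64 host. Both instructions read and
// write %0, so each loop is a serial dependency chain.
#include <chrono>
#include <cstdint>
#include <cstdio>

template <typename Op>
static double timeLoop(Op op, long iters) {
  auto t0 = std::chrono::steady_clock::now();
  for (long i = 0; i < iters; ++i)
    op();
  std::chrono::duration<double> dt = std::chrono::steady_clock::now() - t0;
  return dt.count();
}

int main() {
  const long N = 100'000'000;
  uint64_t x = 1;
#if defined(__loongarch64)
  double tLu = timeLoop(
      [&] { asm volatile("lu32i.d %0, 1" : "+r"(x)); }, N);
  double tMove = timeLoop(
      [&] { asm volatile("or %0, %0, $zero" : "+r"(x)); }, N);
  std::printf("lu32i.d: %.3fs  move(or): %.3fs  (x=%llu)\n", tLu, tMove,
              (unsigned long long)x);
#else
  std::printf("run on a LoongArch64 host (x=%llu)\n", (unsigned long long)x);
#endif
  return 0;
}
```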

@heiher heiher changed the title [LoongArch] Set isAsCheapAsAMove on LU{12,32,52}I/ADDI.D and {X}ORI instructions [LoongArch] Set isAsCheapAsAMove on ADDI.D and {X}ORI instructions Jun 11, 2024
@heiher heiher merged commit 2b6c234 into llvm:main Jun 12, 2024
7 checks passed
@heiher heiher deleted the ascheapasamove branch June 12, 2024 14:02