Commit 9f02950

[ARM] Allow spilling FPSCR for MVE adc/sbc intrinsics (#115174)
The MVE VADC and VSBC instructions read and write a carry bit in FPSCR, which is exposed through the intrinsics. This makes it possible to write code in which the FPSCR carry is live across a function call, or in which the same carry value is used twice, so it needs to be possible to spill and reload it.

There is a missed optimisation in one of the test cases, where we reload the FPSCR from the stack even though its value is still live; I've not found a simple way to prevent the register allocator from doing this.
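For context, here is a minimal C sketch of the pattern this change supports, using the ACLE MVE intrinsics vadciq_u32/vadcq_u32 from arm_mve.h (use_int32x4_t is a hypothetical external function, mirroring the tests below). The carry produced by the first step is live across the call, so the compiler must be able to spill and reload FPSCR:

    #include <arm_mve.h>

    void use_int32x4_t(uint32x4_t v);

    void add_256(uint32x4_t a_low, uint32x4_t a_high,
                 uint32x4_t b_low, uint32x4_t b_high) {
      unsigned carry;
      /* vadciq starts a new chain: carry-in is 0, carry-out goes to *carry. */
      uint32x4_t low = vadciq_u32(a_low, b_low, &carry);
      use_int32x4_t(low); /* the carry bit in FPSCR is live across this call */
      /* vadcq consumes and updates the carry from the previous step. */
      uint32x4_t high = vadcq_u32(a_high, b_high, &carry);
      use_int32x4_t(high);
    }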
1 parent dd98ae3

4 files changed: +137 −4 lines
llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp

Lines changed: 15 additions & 0 deletions
@@ -1163,6 +1163,13 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
         .addImm(0)
         .addMemOperand(MMO)
         .add(predOps(ARMCC::AL));
+  } else if (ARM::cl_FPSCR_NZCVRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DebugLoc(), get(ARM::VSTR_FPSCR_NZCVQC_off))
+        .addReg(SrcReg, getKillRegState(isKill))
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addMemOperand(MMO)
+        .add(predOps(ARMCC::AL));
   } else
     llvm_unreachable("Unknown reg class!");
   break;
@@ -1326,6 +1333,7 @@ Register ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
   case ARM::VSTRD:
   case ARM::VSTRS:
   case ARM::VSTR_P0_off:
+  case ARM::VSTR_FPSCR_NZCVQC_off:
   case ARM::MVE_VSTRWU32:
     if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
         MI.getOperand(2).getImm() == 0) {
@@ -1417,6 +1425,12 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
         .addImm(0)
         .addMemOperand(MMO)
         .add(predOps(ARMCC::AL));
+  } else if (ARM::cl_FPSCR_NZCVRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(ARM::VLDR_FPSCR_NZCVQC_off), DestReg)
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addMemOperand(MMO)
+        .add(predOps(ARMCC::AL));
   } else
     llvm_unreachable("Unknown reg class!");
   break;
@@ -1577,6 +1591,7 @@ Register ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
   case ARM::VLDRD:
   case ARM::VLDRS:
   case ARM::VLDR_P0_off:
+  case ARM::VLDR_FPSCR_NZCVQC_off:
   case ARM::MVE_VLDRWU32:
     if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
         MI.getOperand(2).getImm() == 0) {

llvm/lib/Target/ARM/ARMInstrVFP.td

Lines changed: 9 additions & 4 deletions
@@ -2894,9 +2894,8 @@ multiclass vfp_vstrldr_sysreg<bit opc, bits<4> SysReg, string sysreg,
   }
 }
 
-let Defs = [FPSCR] in {
+let Uses = [FPSCR] in {
   defm VSTR_FPSCR : vfp_vstrldr_sysreg<0b0,0b0001, "fpscr">;
-  defm VSTR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b0,0b0010, "fpscr_nzcvqc">;
 
   let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
     defm VSTR_FPCXTNS : vfp_vstrldr_sysreg<0b0,0b1110, "fpcxtns">;
@@ -2918,12 +2917,18 @@ let Predicates = [HasV8_1MMainline, HasMVEInt] in {
                  (outs VCCR:$P0), (ins)>;
 }
 
-let Uses = [FPSCR] in {
+let Defs = [FPSCR] in {
   defm VLDR_FPSCR : vfp_vstrldr_sysreg<0b1,0b0001, "fpscr">;
-  defm VLDR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b1,0b0010, "fpscr_nzcvqc">;
 
   let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
     defm VLDR_FPCXTNS : vfp_vstrldr_sysreg<0b1,0b1110, "fpcxtns">;
     defm VLDR_FPCXTS : vfp_vstrldr_sysreg<0b1,0b1111, "fpcxts">;
   }
 }
+
+defm VSTR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b0,0b0010, "fpscr_nzcvqc",
+                                            (outs), (ins cl_FPSCR_NZCV:$fpscr)>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+  defm VLDR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b1,0b0010, "fpscr_nzcvqc",
+                                              (outs cl_FPSCR_NZCV:$fpscr), (ins)>;
+}

llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp

Lines changed: 7 additions & 0 deletions
@@ -6686,6 +6686,13 @@ static unsigned FixedRegForVSTRVLDR_SYSREG(unsigned Opcode) {
   case ARM::VLDR_P0_pre:
   case ARM::VLDR_P0_post:
     return ARM::P0;
+  case ARM::VSTR_FPSCR_NZCVQC_off:
+  case ARM::VSTR_FPSCR_NZCVQC_pre:
+  case ARM::VSTR_FPSCR_NZCVQC_post:
+  case ARM::VLDR_FPSCR_NZCVQC_off:
+  case ARM::VLDR_FPSCR_NZCVQC_pre:
+  case ARM::VLDR_FPSCR_NZCVQC_post:
+    return ARM::FPSCR;
   default:
     return 0;
   }
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple thumbv8.1m.main-arm-none-eabihf -mattr=+mve | FileCheck %s
+
+declare void @use_int32x4_t(<4 x i32>)
+
+; A 256-bit addition, with the two halves of the result passed to function
+; calls to spill the carry bit out of FPSCR.
+define void @add_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high) {
+; CHECK-LABEL: add_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    vadci.i32 q0, q0, q2
+; CHECK-NEXT:    vmov q4, q3
+; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    bl use_int32x4_t
+; CHECK-NEXT:    vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vadc.i32 q0, q5, q4
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop.w {r7, lr}
+; CHECK-NEXT:    b use_int32x4_t
+entry:
+  %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+  %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
+  %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
+  tail call void @use_int32x4_t(<4 x i32> %result_low)
+  %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry)
+  %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0
+  tail call void @use_int32x4_t(<4 x i32> %result_high)
+  ret void
+}
+
+; A 256-bit subtraction, with the two halves of the result passed to function
+; calls to spill the carry bit out of FPSCR.
+define void @sub_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high) {
+; CHECK-LABEL: sub_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    vsbci.i32 q0, q0, q2
+; CHECK-NEXT:    vmov q4, q3
+; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    bl use_int32x4_t
+; CHECK-NEXT:    vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vsbc.i32 q0, q5, q4
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop.w {r7, lr}
+; CHECK-NEXT:    b use_int32x4_t
+entry:
+  %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+  %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
+  %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
+  tail call void @use_int32x4_t(<4 x i32> %result_low)
+  %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry)
+  %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0
+  tail call void @use_int32x4_t(<4 x i32> %result_high)
+  ret void
+}
+
+; The carry-out of the first VADC intrinsic call is used by two other VADCs,
+; both of which will modify FPSCR, so it must be spilled and reloaded.
+; Missed optimisation: the first VLDR isn't needed, because the carry bit is
+; already in FPSCR.
+define <4 x i32> @multiple_uses_of_carry_bit(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high, <4 x i32> %a_high_2, <4 x i32> %b_high_2) {
+; CHECK-LABEL: multiple_uses_of_carry_bit:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    vadci.i32 q0, q0, q2
+; CHECK-NEXT:    add r0, sp, #24
+; CHECK-NEXT:    vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vadc.i32 q1, q1, q3
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    add r0, sp, #8
+; CHECK-NEXT:    vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vadc.i32 q1, q2, q1
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+  %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
+  %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
+  %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry)
+  %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0
+  %checksum_1 = xor <4 x i32> %result_low, %result_high
+  %adc_high_2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high_2, <4 x i32> %b_high_2, i32 %carry)
+  %result_high_2 = extractvalue { <4 x i32>, i32 } %adc_high_2, 0
+  %checksum_2 = xor <4 x i32> %checksum_1, %result_high_2
+  ret <4 x i32> %checksum_2
+}
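The last test corresponds to source along these lines, again sketched with the ACLE intrinsics (the veorq_u32 calls only keep all three results live; names are illustrative). Because the carry from the first step feeds two later vadcq steps, each of which also clobbers FPSCR, the carry must be reloaded before the second use; the first reload is the missed optimisation noted in the commit message:

    #include <arm_mve.h>

    uint32x4_t multiple_uses_of_carry_bit(uint32x4_t a_low, uint32x4_t a_high,
                                          uint32x4_t b_low, uint32x4_t b_high,
                                          uint32x4_t a_high_2, uint32x4_t b_high_2) {
      unsigned carry, carry2;
      uint32x4_t low = vadciq_u32(a_low, b_low, &carry);
      carry2 = carry; /* the same carry value is consumed twice below */
      uint32x4_t high = vadcq_u32(a_high, b_high, &carry);
      uint32x4_t high_2 = vadcq_u32(a_high_2, b_high_2, &carry2);
      return veorq_u32(veorq_u32(low, high), high_2);
    }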
