Commit 9f02950

[ARM] Allow spilling FPSCR for MVE adc/sbc intrinsics (#115174)
The MVE VADC and VSBC instructions read and write a carry bit in FPSCR, which is exposed through the intrinsics. This makes it possible to write code in which the FPSCR carry is live across a function call, or in which the same carry value is used twice, so it needs to be possible to spill and reload it.

There is a missed optimisation in one of the test cases, where we reload the FPSCR from the stack even though its value is still live; I've not found a simple way to prevent the register allocator from doing this.
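For context, here is a minimal C sketch of the pattern this change supports, using the ACLE MVE intrinsics vadciq_u32/vadcq_u32 from arm_mve.h (use_int32x4_t is a hypothetical external function, mirroring the tests below). The carry produced by the first step is live across the call, so the compiler must be able to spill and reload FPSCR:

    #include <arm_mve.h>

    void use_int32x4_t(uint32x4_t v);

    void add_256(uint32x4_t a_low, uint32x4_t a_high,
                 uint32x4_t b_low, uint32x4_t b_high) {
      unsigned carry;
      /* vadciq starts a new chain: carry-in is 0, carry-out goes to *carry. */
      uint32x4_t low = vadciq_u32(a_low, b_low, &carry);
      use_int32x4_t(low); /* the carry bit in FPSCR is live across this call */
      /* vadcq consumes and updates the carry from the previous step. */
      uint32x4_t high = vadcq_u32(a_high, b_high, &carry);
      use_int32x4_t(high);
    }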
1 parent dd98ae3

4 files changed: +137 −4 lines
llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp

Lines changed: 15 additions & 0 deletions
@@ -1163,6 +1163,13 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
         .addImm(0)
         .addMemOperand(MMO)
         .add(predOps(ARMCC::AL));
+  } else if (ARM::cl_FPSCR_NZCVRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DebugLoc(), get(ARM::VSTR_FPSCR_NZCVQC_off))
+        .addReg(SrcReg, getKillRegState(isKill))
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addMemOperand(MMO)
+        .add(predOps(ARMCC::AL));
   } else
     llvm_unreachable("Unknown reg class!");
   break;
@@ -1326,6 +1333,7 @@ Register ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
   case ARM::VSTRD:
   case ARM::VSTRS:
   case ARM::VSTR_P0_off:
+  case ARM::VSTR_FPSCR_NZCVQC_off:
   case ARM::MVE_VSTRWU32:
     if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
         MI.getOperand(2).getImm() == 0) {
@@ -1417,6 +1425,12 @@ void ARMBaseInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
         .addImm(0)
         .addMemOperand(MMO)
         .add(predOps(ARMCC::AL));
+  } else if (ARM::cl_FPSCR_NZCVRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(ARM::VLDR_FPSCR_NZCVQC_off), DestReg)
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addMemOperand(MMO)
+        .add(predOps(ARMCC::AL));
   } else
     llvm_unreachable("Unknown reg class!");
   break;
@@ -1577,6 +1591,7 @@ Register ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
   case ARM::VLDRD:
   case ARM::VLDRS:
   case ARM::VLDR_P0_off:
+  case ARM::VLDR_FPSCR_NZCVQC_off:
   case ARM::MVE_VLDRWU32:
     if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
         MI.getOperand(2).getImm() == 0) {

llvm/lib/Target/ARM/ARMInstrVFP.td

Lines changed: 9 additions & 4 deletions
@@ -2894,9 +2894,8 @@ multiclass vfp_vstrldr_sysreg<bit opc, bits<4> SysReg, string sysreg,
   }
 }
 
-let Defs = [FPSCR] in {
+let Uses = [FPSCR] in {
   defm VSTR_FPSCR : vfp_vstrldr_sysreg<0b0,0b0001, "fpscr">;
-  defm VSTR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b0,0b0010, "fpscr_nzcvqc">;
 
   let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
     defm VSTR_FPCXTNS : vfp_vstrldr_sysreg<0b0,0b1110, "fpcxtns">;
@@ -2918,12 +2917,18 @@ let Predicates = [HasV8_1MMainline, HasMVEInt] in {
                  (outs VCCR:$P0), (ins)>;
 }
 
-let Uses = [FPSCR] in {
+let Defs = [FPSCR] in {
   defm VLDR_FPSCR : vfp_vstrldr_sysreg<0b1,0b0001, "fpscr">;
-  defm VLDR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b1,0b0010, "fpscr_nzcvqc">;
 
   let Predicates = [HasV8_1MMainline, Has8MSecExt] in {
     defm VLDR_FPCXTNS : vfp_vstrldr_sysreg<0b1,0b1110, "fpcxtns">;
     defm VLDR_FPCXTS : vfp_vstrldr_sysreg<0b1,0b1111, "fpcxts">;
   }
 }
+
+defm VSTR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b0,0b0010, "fpscr_nzcvqc",
+                                            (outs), (ins cl_FPSCR_NZCV:$fpscr)>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+  defm VLDR_FPSCR_NZCVQC : vfp_vstrldr_sysreg<0b1,0b0010, "fpscr_nzcvqc",
+                                              (outs cl_FPSCR_NZCV:$fpscr), (ins)>;
+}

llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp

Lines changed: 7 additions & 0 deletions
@@ -6686,6 +6686,13 @@ static unsigned FixedRegForVSTRVLDR_SYSREG(unsigned Opcode) {
   case ARM::VLDR_P0_pre:
   case ARM::VLDR_P0_post:
     return ARM::P0;
+  case ARM::VSTR_FPSCR_NZCVQC_off:
+  case ARM::VSTR_FPSCR_NZCVQC_pre:
+  case ARM::VSTR_FPSCR_NZCVQC_post:
+  case ARM::VLDR_FPSCR_NZCVQC_off:
+  case ARM::VLDR_FPSCR_NZCVQC_pre:
+  case ARM::VLDR_FPSCR_NZCVQC_post:
+    return ARM::FPSCR;
   default:
     return 0;
   }
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple thumbv8.1m.main-arm-none-eabihf -mattr=+mve | FileCheck %s
+
+declare void @use_int32x4_t(<4 x i32>)
+
+; A 256-bit addition, with the two halves of the result passed to function
+; calls to spill the carry bit out of FPSCR.
+define void @add_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high) {
+; CHECK-LABEL: add_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    vadci.i32 q0, q0, q2
+; CHECK-NEXT:    vmov q4, q3
+; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    bl use_int32x4_t
+; CHECK-NEXT:    vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vadc.i32 q0, q5, q4
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop.w {r7, lr}
+; CHECK-NEXT:    b use_int32x4_t
+entry:
+  %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+  %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
+  %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
+  tail call void @use_int32x4_t(<4 x i32> %result_low)
+  %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry)
+  %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0
+  tail call void @use_int32x4_t(<4 x i32> %result_high)
+  ret void
+}
+
+; A 256-bit subtraction, with the two halves of the result passed to function
+; calls to spill the carry bit out of FPSCR.
+define void @sub_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high) {
+; CHECK-LABEL: sub_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    vsbci.i32 q0, q0, q2
+; CHECK-NEXT:    vmov q4, q3
+; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    bl use_int32x4_t
+; CHECK-NEXT:    vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vsbc.i32 q0, q5, q4
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    pop.w {r7, lr}
+; CHECK-NEXT:    b use_int32x4_t
+entry:
+  %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+  %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
+  %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
+  tail call void @use_int32x4_t(<4 x i32> %result_low)
+  %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry)
+  %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0
+  tail call void @use_int32x4_t(<4 x i32> %result_high)
+  ret void
+}
+
+; The carry-out of the first VADC intrinsic call is used by two other VADCs,
+; both of which will modify FPSCR, so it must be spilled and reloaded.
+; Missed optimisation: the first VLDR isn't needed, because the carry bit is
+; already in FPSCR.
+define <4 x i32> @multiple_uses_of_carry_bit(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high, <4 x i32> %a_high_2, <4 x i32> %b_high_2) {
+; CHECK-LABEL: multiple_uses_of_carry_bit:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    vadci.i32 q0, q0, q2
+; CHECK-NEXT:    add r0, sp, #24
+; CHECK-NEXT:    vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vadc.i32 q1, q1, q3
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    add r0, sp, #8
+; CHECK-NEXT:    vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vadc.i32 q1, q2, q1
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+  %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
+  %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
+  %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry)
+  %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0
+  %checksum_1 = xor <4 x i32> %result_low, %result_high
+  %adc_high_2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high_2, <4 x i32> %b_high_2, i32 %carry)
+  %result_high_2 = extractvalue { <4 x i32>, i32 } %adc_high_2, 0
+  %checksum_2 = xor <4 x i32> %checksum_1, %result_high_2
+  ret <4 x i32> %checksum_2
+}
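The last test corresponds to source along these lines, again sketched with the ACLE intrinsics (the veorq_u32 calls only keep all three results live; names are illustrative). Because the carry from the first step feeds two later vadcq steps, each of which also clobbers FPSCR, the carry must be reloaded before the second use; the first reload is the missed optimisation noted in the commit message:

    #include <arm_mve.h>

    uint32x4_t multiple_uses_of_carry_bit(uint32x4_t a_low, uint32x4_t a_high,
                                          uint32x4_t b_low, uint32x4_t b_high,
                                          uint32x4_t a_high_2, uint32x4_t b_high_2) {
      unsigned carry, carry2;
      uint32x4_t low = vadciq_u32(a_low, b_low, &carry);
      carry2 = carry; /* the same carry value is consumed twice below */
      uint32x4_t high = vadcq_u32(a_high, b_high, &carry);
      uint32x4_t high_2 = vadcq_u32(a_high_2, b_high_2, &carry2);
      return veorq_u32(veorq_u32(low, high), high_2);
    }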
