Skip to content

Commit b577d4d

Browse files
committed
Relax restrictions for COPY and refactor
The restrictions on COPY nodes were too strict: I could not construct any test or case where a COPY on its own would cause a problem. This change also refactors the code slightly so that no analysis is performed on COPY nodes when the algorithm is not actively trying to match an 'smstart/smstop' sequence.
1 parent ebdf9f7 commit b577d4d

File tree

3 files changed

+68
-55
lines changed

3 files changed

+68
-55
lines changed

llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp

Lines changed: 66 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
#include "llvm/ADT/SmallVector.h"
1818
#include "llvm/CodeGen/MachineBasicBlock.h"
1919
#include "llvm/CodeGen/MachineFunctionPass.h"
20+
#include "llvm/CodeGen/MachineRegisterInfo.h"
21+
#include "llvm/CodeGen/TargetRegisterInfo.h"
2022

2123
using namespace llvm;
2224

@@ -108,8 +110,30 @@ static bool ChangesStreamingMode(const MachineInstr *MI) {
108110
MI->getOperand(0).getImm() == AArch64SVCR::SVCRSMZA;
109111
}
110112

113+
static bool isSVERegOp(const TargetRegisterInfo &TRI,
114+
const MachineRegisterInfo &MRI,
115+
const MachineOperand &MO) {
116+
if (!MO.isReg())
117+
return false;
118+
119+
Register R = MO.getReg();
120+
if (R.isPhysical())
121+
return llvm::any_of(TRI.subregs_inclusive(R), [](const MCPhysReg &SR) {
122+
return AArch64::ZPRRegClass.contains(SR) ||
123+
AArch64::PPRRegClass.contains(SR);
124+
});
125+
126+
const TargetRegisterClass *RC = MRI.getRegClass(R);
127+
return TRI.getCommonSubClass(&AArch64::ZPRRegClass, RC) ||
128+
TRI.getCommonSubClass(&AArch64::PPRRegClass, RC);
129+
}
130+
111131
bool SMEPeepholeOpt::optimizeStartStopPairs(MachineBasicBlock &MBB,
112132
bool &HasRemainingSMChange) const {
133+
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
134+
const TargetRegisterInfo &TRI =
135+
*MBB.getParent()->getSubtarget().getRegisterInfo();
136+
113137
SmallVector<MachineInstr *, 4> ToBeRemoved;
114138

115139
bool Changed = false;
@@ -129,33 +153,6 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(MachineBasicBlock &MBB,
129153
// tracking.
130154
for (MachineInstr &MI : make_early_inc_range(MBB)) {
131155
switch (MI.getOpcode()) {
132-
default:
133-
Reset();
134-
break;
135-
case AArch64::COPY: {
136-
// Permit copies of 32 and 64-bit registers.
137-
if (!MI.getOperand(1).isReg()) {
138-
Reset();
139-
break;
140-
}
141-
Register Reg = MI.getOperand(1).getReg();
142-
if (!AArch64::GPR32RegClass.contains(Reg) &&
143-
!AArch64::GPR64RegClass.contains(Reg))
144-
Reset();
145-
break;
146-
}
147-
case AArch64::ADJCALLSTACKDOWN:
148-
case AArch64::ADJCALLSTACKUP:
149-
case AArch64::ANDXri:
150-
case AArch64::ADDXri:
151-
// We permit these as they don't generate SVE/NEON instructions.
152-
break;
153-
case AArch64::VGRestorePseudo:
154-
case AArch64::VGSavePseudo:
155-
// When the smstart/smstop are removed, we should also remove
156-
// the pseudos that save/restore the VG value for CFI info.
157-
ToBeRemoved.push_back(&MI);
158-
break;
159156
case AArch64::MSRpstatesvcrImm1:
160157
case AArch64::MSRpstatePseudo: {
161158
if (!Prev)
@@ -174,8 +171,50 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(MachineBasicBlock &MBB,
174171
Reset();
175172
Prev = &MI;
176173
}
174+
continue;
175+
}
176+
default:
177+
if (!Prev)
178+
// Avoid doing expensive checks when Prev is nullptr.
179+
continue;
177180
break;
178181
}
182+
183+
// Test if the instructions in between the start/stop sequence are agnostic
184+
// of streaming mode. If not, the algorithm should reset.
185+
switch (MI.getOpcode()) {
186+
default:
187+
Reset();
188+
break;
189+
case AArch64::COALESCER_BARRIER_FPR16:
190+
case AArch64::COALESCER_BARRIER_FPR32:
191+
case AArch64::COALESCER_BARRIER_FPR64:
192+
case AArch64::COALESCER_BARRIER_FPR128:
193+
case AArch64::COPY:
194+
// These instructions should be safe when executed on their own, but
195+
// the code remains conservative when SVE registers are used. There may
196+
// exist subtle cases where executing a COPY in a different mode results
197+
// in different behaviour, even if we can't yet come up with any
198+
// concrete example/test-case.
199+
if (isSVERegOp(TRI, MRI, MI.getOperand(0)) ||
200+
isSVERegOp(TRI, MRI, MI.getOperand(1)))
201+
Reset();
202+
break;
203+
case AArch64::ADJCALLSTACKDOWN:
204+
case AArch64::ADJCALLSTACKUP:
205+
case AArch64::ANDXri:
206+
case AArch64::ADDXri:
207+
// We permit these as they don't generate SVE/NEON instructions.
208+
break;
209+
case AArch64::VGRestorePseudo:
210+
case AArch64::VGSavePseudo:
211+
// When the smstart/smstop are removed, we should also remove
212+
// the pseudos that save/restore the VG value for CFI info.
213+
ToBeRemoved.push_back(&MI);
214+
break;
215+
case AArch64::MSRpstatesvcrImm1:
216+
case AArch64::MSRpstatePseudo:
217+
llvm_unreachable("Should have been handled");
179218
}
180219
}
181220

llvm/test/CodeGen/AArch64/sme-peephole-opts.ll

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -229,10 +229,6 @@ define float @test6(float %f) nounwind "aarch64_pstate_sm_enabled" {
229229
; CHECK-NEXT: smstop sm
230230
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
231231
; CHECK-NEXT: bl callee_farg_fret
232-
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
233-
; CHECK-NEXT: smstart sm
234-
; CHECK-NEXT: smstop sm
235-
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
236232
; CHECK-NEXT: bl callee_farg_fret
237233
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
238234
; CHECK-NEXT: smstart sm

llvm/test/CodeGen/AArch64/sme-streaming-body.ll

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -256,31 +256,9 @@ declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible"
256256
define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" {
257257
; CHECK-LABEL: call_to_intrinsic_without_chain:
258258
; CHECK: // %bb.0: // %entry
259-
; CHECK-NEXT: sub sp, sp, #112
260-
; CHECK-NEXT: rdsvl x9, #1
261-
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
262-
; CHECK-NEXT: lsr x9, x9, #3
263-
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
264-
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
265-
; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
266-
; CHECK-NEXT: cntd x9
267-
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
268-
; CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill
269-
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
270-
; CHECK-NEXT: smstart sm
271-
; CHECK-NEXT: smstop sm
272-
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
259+
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
273260
; CHECK-NEXT: bl cos
274-
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
275-
; CHECK-NEXT: smstart sm
276-
; CHECK-NEXT: smstop sm
277-
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
278-
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
279-
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
280-
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
281-
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
282-
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
283-
; CHECK-NEXT: add sp, sp, #112
261+
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
284262
; CHECK-NEXT: ret
285263
entry:
286264
%0 = call fast double @llvm.cos.f64(double %x)

0 commit comments

Comments (0)