Skip to content

Commit 0e6bf52

Browse files
committed
[AArch64][SME] Create new pass to remove COALESCER_BARRIER early.
The purpose of the COALESCER_BARRIER pseudo node is to prevent the register coalescer from coalescing certain COPY instructions around smstart/smstop instructions, so that we spill only the (required) FPR register rather than the encompassing ZPR register. The pseudos are removed in the AArch64ExpandPseudo pass. However, because the node itself is a _use_ of a register, this occasionally leads to redundant spills/fills, because the register allocator thinks the virtual register is actually used before an smstart/smstop instruction, causing it to be filled, at which point it requires immediate spilling again to ensure it stays live over the smstart/smstop instruction. We can avoid that by removing the pseudo nodes right after coalescing, but before register allocation.
1 parent 8c0f52e commit 0e6bf52

11 files changed

+121
-27
lines changed

llvm/lib/Target/AArch64/AArch64.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ FunctionPass *createFalkorMarkStridedAccessesPass();
5454
FunctionPass *createAArch64PointerAuthPass();
5555
FunctionPass *createAArch64BranchTargetsPass();
5656
FunctionPass *createAArch64MIPeepholeOptPass();
57+
FunctionPass *createAArch64PostCoalescerPass();
5758

5859
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
5960

@@ -93,6 +94,7 @@ void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
9394
void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
9495
void initializeAArch64MIPeepholeOptPass(PassRegistry &);
9596
void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
97+
void initializeAArch64PostCoalescerPass(PassRegistry &);
9698
void initializeAArch64PostLegalizerCombinerPass(PassRegistry &);
9799
void initializeAArch64PostLegalizerLoweringPass(PassRegistry &);
98100
void initializeAArch64PostSelectOptimizePass(PassRegistry &);
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
//===- AArch64PostCoalescerPass.cpp - AArch64 Post Coalescer pass ---------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//===----------------------------------------------------------------------===//
9+
10+
#include "AArch64InstrInfo.h"
11+
#include "AArch64MachineFunctionInfo.h"
12+
#include "llvm/InitializePasses.h"
13+
#include "llvm/CodeGen/LiveIntervals.h"
14+
#include "llvm/CodeGen/MachineRegisterInfo.h"
15+
16+
using namespace llvm;
17+
18+
#define DEBUG_TYPE "aarch64-post-coalescer-pass"
19+
20+
namespace {
21+
22+
// Machine function pass that strips the COALESCER_BARRIER_FPR* pseudo
// instructions from a function (see runOnMachineFunction) and keeps
// LiveIntervals up to date while doing so.
struct AArch64PostCoalescer : public MachineFunctionPass {
23+
// Pass identification; its address is what identifies the pass.
static char ID;
24+
25+
// Registers the pass with the global PassRegistry on construction.
AArch64PostCoalescer() : MachineFunctionPass(ID) {
26+
initializeAArch64PostCoalescerPass(*PassRegistry::getPassRegistry());
27+
}
28+
29+
// Both pointers are (re)assigned by runOnMachineFunction for the function
// currently being processed; they are not initialised before that.
LiveIntervals *LIS;
30+
MachineRegisterInfo *MRI;
31+
32+
// Removes the barrier pseudos; returns true if any instruction was erased.
bool runOnMachineFunction(MachineFunction &MF) override;
33+
34+
// Human-readable pass name.
StringRef getPassName() const override {
35+
return "AArch64 Post Coalescer pass";
36+
}
37+
38+
// Requires LiveIntervals and declares every analysis as preserved.
// NOTE(review): the pass erases instructions and patches LiveIntervals by
// hand; confirm that no other analysis is invalidated by that.
void getAnalysisUsage(AnalysisUsage &AU) const override {
39+
AU.setPreservesAll();
40+
AU.addRequired<LiveIntervals>();
41+
MachineFunctionPass::getAnalysisUsage(AU);
42+
}
43+
};
44+
45+
char AArch64PostCoalescer::ID = 0;
46+
47+
} // end anonymous namespace
48+
49+
// Pass registration: registers AArch64PostCoalescer under the argument name
// "aarch64-post-coalescer-pass" and records its dependency on LiveIntervals.
// The two trailing `false` arguments are presumably the cfg-only and
// is-analysis flags of the INITIALIZE_PASS macros — confirm against
// llvm/PassSupport.h.
INITIALIZE_PASS_BEGIN(AArch64PostCoalescer, "aarch64-post-coalescer-pass",
50+
"AArch64 Post Coalescer Pass", false, false)
51+
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
52+
INITIALIZE_PASS_END(AArch64PostCoalescer, "aarch64-post-coalescer-pass",
53+
"AArch64 Post Coalescer Pass", false, false)
54+
55+
// Erases every COALESCER_BARRIER_FPR{16,32,64,128} pseudo in MF. For each
// barrier, all references to its result register (operand 0) are rewritten to
// its source register (operand 1), the instruction is removed, and the live
// interval of the source register is recomputed.
//
// Does nothing (and returns false) when skipFunction() says to skip MF or when
// the function has no streaming-mode changes.
//
// Returns true if at least one barrier was removed.
bool AArch64PostCoalescer::runOnMachineFunction(MachineFunction &MF) {
56+
if (skipFunction(MF.getFunction()))
57+
return false;
58+
59+
// Only functions with streaming-mode changes are processed; all others are
// left untouched.
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
60+
if (!FuncInfo->hasStreamingModeChanges())
61+
return false;
62+
63+
MRI = &MF.getRegInfo();
64+
LIS = &getAnalysis<LiveIntervals>();
65+
bool Changed = false;
66+
67+
for (MachineBasicBlock &MBB : MF) {
68+
// MI is erased inside the loop body, hence the early-increment range.
for (MachineInstr &MI : make_early_inc_range(MBB)) {
69+
switch (MI.getOpcode()) {
70+
default:
71+
break;
72+
case AArch64::COALESCER_BARRIER_FPR16:
73+
case AArch64::COALESCER_BARRIER_FPR32:
74+
case AArch64::COALESCER_BARRIER_FPR64:
75+
case AArch64::COALESCER_BARRIER_FPR128: {
76+
// Operand 1 is the barrier's input, operand 0 its result.
Register Src = MI.getOperand(1).getReg();
77+
Register Dst = MI.getOperand(0).getReg();
78+
// Make every reference to the barrier's result refer to its input
// instead, so the barrier itself becomes redundant.
if (Src != Dst)
79+
MRI->replaceRegWith(Dst, Src);
80+
81+
// MI must be erased from the basic block before recalculating the live
82+
// interval.
83+
LIS->RemoveMachineInstrFromMaps(MI);
84+
MI.eraseFromParent();
85+
86+
// Throw away the now-stale interval of Src and rebuild it from the
// updated instructions.
// NOTE(review): assumes Src is a virtual register
// (createAndComputeVirtRegInterval) — TODO confirm.
// NOTE(review): the interval of Dst is not removed here; presumably
// harmless because Dst has no remaining references — verify.
LIS->removeInterval(Src);
87+
LIS->createAndComputeVirtRegInterval(Src);
88+
89+
Changed = true;
90+
break;
91+
}
92+
}
93+
}
94+
}
95+
96+
return Changed;
97+
}
98+
99+
// Factory for the AArch64 post-coalescer pass. Returns a newly allocated pass
// object; ownership presumably transfers to the pass manager it is added to —
// verify against the caller.
FunctionPass *llvm::createAArch64PostCoalescerPass() {
100+
return new AArch64PostCoalescer();
101+
}

llvm/lib/Target/AArch64/AArch64TargetMachine.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
235235
initializeAArch64O0PreLegalizerCombinerPass(*PR);
236236
initializeAArch64PreLegalizerCombinerPass(*PR);
237237
initializeAArch64PointerAuthPass(*PR);
238+
initializeAArch64PostCoalescerPass(*PR);
238239
initializeAArch64PostLegalizerCombinerPass(*PR);
239240
initializeAArch64PostLegalizerLoweringPass(*PR);
240241
initializeAArch64PostSelectOptimizePass(*PR);
@@ -539,6 +540,7 @@ class AArch64PassConfig : public TargetPassConfig {
539540
void addPreEmitPass() override;
540541
void addPostBBSections() override;
541542
void addPreEmitPass2() override;
543+
bool addRegAssignAndRewriteOptimized() override;
542544

543545
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
544546
};
@@ -876,6 +878,11 @@ void AArch64PassConfig::addPreEmitPass2() {
876878
addPass(createUnpackMachineBundles(nullptr));
877879
}
878880

881+
// Adds the AArch64 post-coalescer pass to the pipeline and then defers to the
// generic TargetPassConfig implementation, so the pass is scheduled directly
// ahead of the passes that implementation adds. The return value is whatever
// the base implementation returns.
bool AArch64PassConfig::addRegAssignAndRewriteOptimized() {
882+
addPass(createAArch64PostCoalescerPass());
883+
return TargetPassConfig::addRegAssignAndRewriteOptimized();
884+
}
885+
879886
MachineFunctionInfo *AArch64TargetMachine::createMachineFunctionInfo(
880887
BumpPtrAllocator &Allocator, const Function &F,
881888
const TargetSubtargetInfo *STI) const {

llvm/lib/Target/AArch64/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ add_llvm_target(AArch64CodeGen
7373
AArch64MIPeepholeOpt.cpp
7474
AArch64MCInstLower.cpp
7575
AArch64PointerAuth.cpp
76+
AArch64PostCoalescerPass.cpp
7677
AArch64PromoteConstant.cpp
7778
AArch64PBQPRegAlloc.cpp
7879
AArch64RegisterInfo.cpp

llvm/test/CodeGen/AArch64/O3-pipeline.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@
167167
; CHECK-NEXT: Register Coalescer
168168
; CHECK-NEXT: Rename Disconnected Subregister Components
169169
; CHECK-NEXT: Machine Instruction Scheduler
170+
; CHECK-NEXT: AArch64 Post Coalescer pass
170171
; CHECK-NEXT: Machine Block Frequency Analysis
171172
; CHECK-NEXT: Debug Variable Analysis
172173
; CHECK-NEXT: Live Stack Slot Analysis

llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -331,9 +331,9 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
331331
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
332332
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
333333
; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
334-
; CHECK-COMMON-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill
334+
; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
335335
; CHECK-COMMON-NEXT: smstop sm
336-
; CHECK-COMMON-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
336+
; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
337337
; CHECK-COMMON-NEXT: bl __addtf3
338338
; CHECK-COMMON-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
339339
; CHECK-COMMON-NEXT: smstart sm
@@ -392,9 +392,9 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
392392
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
393393
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
394394
; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
395-
; CHECK-COMMON-NEXT: stp s1, s0, [sp, #8] // 8-byte Folded Spill
395+
; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
396396
; CHECK-COMMON-NEXT: smstop sm
397-
; CHECK-COMMON-NEXT: ldp s1, s0, [sp, #8] // 8-byte Folded Reload
397+
; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload
398398
; CHECK-COMMON-NEXT: bl fmodf
399399
; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
400400
; CHECK-COMMON-NEXT: smstart sm
@@ -422,9 +422,7 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
422422
; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
423423
; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
424424
; CHECK-COMMON-NEXT: bl __arm_sme_state
425-
; CHECK-COMMON-NEXT: ldp s2, s0, [sp, #8] // 8-byte Folded Reload
426425
; CHECK-COMMON-NEXT: and x19, x0, #0x1
427-
; CHECK-COMMON-NEXT: stp s2, s0, [sp, #8] // 8-byte Folded Spill
428426
; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_2
429427
; CHECK-COMMON-NEXT: // %bb.1:
430428
; CHECK-COMMON-NEXT: smstop sm

llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,7 +1085,6 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 {
10851085
; CHECK-NEXT: smstart sm
10861086
; CHECK-NEXT: ptrue p0.s
10871087
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
1088-
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
10891088
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
10901089
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
10911090
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
@@ -1116,7 +1115,6 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 {
11161115
; CHECK-NEXT: smstart sm
11171116
; CHECK-NEXT: ptrue p0.d
11181117
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
1119-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
11201118
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
11211119
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
11221120
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
@@ -1151,7 +1149,6 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
11511149
; CHECK-NEXT: smstart sm
11521150
; CHECK-NEXT: ptrue p0.b
11531151
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
1154-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
11551152
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
11561153
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
11571154
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
@@ -1183,7 +1180,6 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
11831180
; CHECK-NEXT: smstart sm
11841181
; CHECK-NEXT: ptrue p0.h
11851182
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
1186-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
11871183
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
11881184
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
11891185
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
@@ -1215,7 +1211,6 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
12151211
; CHECK-NEXT: smstart sm
12161212
; CHECK-NEXT: ptrue p0.s
12171213
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
1218-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
12191214
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
12201215
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
12211216
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
@@ -1247,7 +1242,6 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
12471242
; CHECK-NEXT: smstart sm
12481243
; CHECK-NEXT: ptrue p0.d
12491244
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
1250-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
12511245
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
12521246
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
12531247
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
@@ -1311,7 +1305,6 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
13111305
; CHECK-NEXT: smstart sm
13121306
; CHECK-NEXT: ptrue p0.s
13131307
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
1314-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
13151308
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
13161309
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
13171310
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
@@ -1343,7 +1336,6 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
13431336
; CHECK-NEXT: smstart sm
13441337
; CHECK-NEXT: ptrue p0.d
13451338
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
1346-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
13471339
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
13481340
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
13491341
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload

llvm/test/CodeGen/AArch64/sme-streaming-body.ll

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -247,15 +247,11 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
247247
; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
248248
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
249249
; CHECK-NEXT: smstart sm
250-
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
251-
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
252250
; CHECK-NEXT: smstop sm
253251
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
254252
; CHECK-NEXT: bl cos
255253
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
256254
; CHECK-NEXT: smstart sm
257-
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
258-
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
259255
; CHECK-NEXT: smstop sm
260256
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
261257
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload

llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -469,11 +469,7 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
469469
; CHECK-NEXT: mov x9, x0
470470
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
471471
; CHECK-NEXT: bl __arm_sme_state
472-
; CHECK-NEXT: ldp s4, s0, [sp, #8] // 8-byte Folded Reload
473472
; CHECK-NEXT: and x19, x0, #0x1
474-
; CHECK-NEXT: stp s4, s0, [sp, #8] // 8-byte Folded Spill
475-
; CHECK-NEXT: ldp d4, d0, [sp, #16] // 16-byte Folded Reload
476-
; CHECK-NEXT: stp d4, d0, [sp, #16] // 16-byte Folded Spill
477473
; CHECK-NEXT: tbz w19, #0, .LBB10_2
478474
; CHECK-NEXT: // %bb.1: // %entry
479475
; CHECK-NEXT: smstop sm

llvm/test/CodeGen/AArch64/sme-streaming-interface.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -405,11 +405,11 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
405405
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
406406
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
407407
; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
408-
; CHECK-NEXT: stp s1, s0, [sp, #24] // 8-byte Folded Spill
409-
; CHECK-NEXT: stp d3, d2, [sp, #8] // 16-byte Folded Spill
408+
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
409+
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
410410
; CHECK-NEXT: smstop sm
411-
; CHECK-NEXT: ldp s1, s0, [sp, #24] // 8-byte Folded Reload
412-
; CHECK-NEXT: ldp d3, d2, [sp, #8] // 16-byte Folded Reload
411+
; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload
412+
; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload
413413
; CHECK-NEXT: bl bar
414414
; CHECK-NEXT: smstart sm
415415
; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload

llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
2222
; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill
2323
; CHECK-NEXT: sub sp, sp, #16
2424
; CHECK-NEXT: addvl sp, sp, #-1
25+
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
2526
; CHECK-NEXT: //APP
2627
; CHECK-NEXT: //NO_APP
27-
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
2828
; CHECK-NEXT: smstop sm
2929
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
3030
; CHECK-NEXT: bl use_f

0 commit comments

Comments
 (0)