Skip to content

Commit 0ddf38c

Browse files
author
Krzysztof Parzyszek
committed
[Hexagon] Improve stack address base reuse for HVX spills
The offset in HVX loads/stores is only 4 bits long, so often an extra register is needed to hold the address. Minimize the number of such registers by "standardizing" the base addresses and reusing preexisting base registers when replacing frame indices.
1 parent 8494122 commit 0ddf38c

File tree

2 files changed

+302
-12
lines changed

2 files changed

+302
-12
lines changed

llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp

Lines changed: 90 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
#include "llvm/ADT/BitVector.h"
2020
#include "llvm/ADT/STLExtras.h"
2121
#include "llvm/CodeGen/LiveIntervals.h"
22+
#include "llvm/CodeGen/LiveRegUnits.h"
2223
#include "llvm/CodeGen/MachineFrameInfo.h"
2324
#include "llvm/CodeGen/MachineFunction.h"
2425
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -30,6 +31,7 @@
3031
#include "llvm/IR/Function.h"
3132
#include "llvm/IR/Type.h"
3233
#include "llvm/MC/MachineLocation.h"
34+
#include "llvm/Support/CommandLine.h"
3335
#include "llvm/Support/Debug.h"
3436
#include "llvm/Support/ErrorHandling.h"
3537
#include "llvm/Support/raw_ostream.h"
@@ -41,6 +43,10 @@
4143

4244
using namespace llvm;
4345

46+
static cl::opt<unsigned> FrameIndexSearchLimit(
47+
"hexagon-frame-index-search-limit", cl::init(32), cl::Hidden,
48+
cl::desc("Limit on instruction search in frame index elimination"));
49+
4450
HexagonRegisterInfo::HexagonRegisterInfo(unsigned HwMode)
4551
: HexagonGenRegisterInfo(Hexagon::R31, 0/*DwarfFlavor*/, 0/*EHFlavor*/,
4652
0/*PC*/, HwMode) {}
@@ -133,7 +139,7 @@ const uint32_t *HexagonRegisterInfo::getCallPreservedMask(
133139

134140

135141
BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
136-
const {
142+
const {
137143
BitVector Reserved(getNumRegs());
138144
Reserved.set(Hexagon::R29);
139145
Reserved.set(Hexagon::R30);
@@ -188,7 +194,6 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
188194
return Reserved;
189195
}
190196

191-
192197
void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
193198
int SPAdj, unsigned FIOp,
194199
RegScavenger *RS) const {
@@ -210,7 +215,6 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
210215
int Offset = HFI.getFrameIndexReference(MF, FI, BP).getFixed();
211216
// Add the offset from the instruction.
212217
int RealOffset = Offset + MI.getOperand(FIOp+1).getImm();
213-
bool IsKill = false;
214218

215219
unsigned Opc = MI.getOpcode();
216220
switch (Opc) {
@@ -228,18 +232,92 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
228232
if (!HII.isValidOffset(Opc, RealOffset, this)) {
229233
// If the offset is not valid, calculate the address in a temporary
230234
// register and use it with the remaining offset (InstOffset, 0 by default).
235+
int InstOffset = 0;
236+
// The actual base register (BP) is typically shared between many
237+
// instructions where frame indices are being replaced. In scalar
238+
// instructions the offset range is large, and the need for an extra
239+
// add instruction is infrequent. Vector loads/stores, however, have
240+
// a much smaller offset range: [-8, 7], or #s4. In those cases it
241+
// makes sense to "standardize" the immediate in the "addi" instruction
242+
// so that multiple loads/stores could be based on it.
243+
bool IsPair = false;
244+
switch (MI.getOpcode()) {
245+
// All of these instructions have the same format: base+#s4.
246+
case Hexagon::PS_vloadrw_ai:
247+
case Hexagon::PS_vloadrw_nt_ai:
248+
case Hexagon::PS_vstorerw_ai:
249+
case Hexagon::PS_vstorerw_nt_ai:
250+
IsPair = true;
251+
LLVM_FALLTHROUGH;
252+
case Hexagon::PS_vloadrv_ai:
253+
case Hexagon::PS_vloadrv_nt_ai:
254+
case Hexagon::PS_vstorerv_ai:
255+
case Hexagon::PS_vstorerv_nt_ai:
256+
case Hexagon::V6_vL32b_ai:
257+
case Hexagon::V6_vS32b_ai: {
258+
unsigned HwLen = HST.getVectorLength();
259+
if (RealOffset % HwLen == 0) {
260+
int VecOffset = RealOffset / HwLen;
261+
// Rewrite the offset as "base + [-8, 7]".
262+
VecOffset += 8;
263+
// Pairs are expanded into two instructions: make sure that both
264+
// can use the same base (i.e. VecOffset and VecOffset+1 round down to
265+
// the same multiple of 16).
266+
if (!IsPair || (VecOffset + 1) % 16 != 0) {
267+
RealOffset = (VecOffset & -16) * HwLen;
268+
InstOffset = (VecOffset % 16 - 8) * HwLen;
269+
}
270+
}
271+
}
272+
}
273+
274+
// Search backwards in the block for "Reg = A2_addi BP, RealOffset".
275+
// This will give us a chance to avoid creating a new register.
276+
Register ReuseBP;
277+
unsigned SearchCount = 0, SearchLimit = FrameIndexSearchLimit;
278+
bool PassedCall = false;
279+
LiveRegUnits Defs(*this), Uses(*this);
280+
281+
for (auto I = std::next(II.getReverse()), E = MB.rend(); I != E; ++I) {
282+
if (SearchCount == SearchLimit)
283+
break;
284+
++SearchCount;
285+
const MachineInstr &BI = *I;
286+
LiveRegUnits::accumulateUsedDefed(BI, Defs, Uses, this);
287+
PassedCall |= BI.isCall();
288+
289+
if (BI.getOpcode() != Hexagon::A2_addi)
290+
continue;
291+
if (BI.getOperand(1).getReg() != BP)
292+
continue;
293+
const auto &Op2 = BI.getOperand(2);
294+
if (!Op2.isImm() || Op2.getImm() != RealOffset)
295+
continue;
296+
297+
Register R = BI.getOperand(0).getReg();
298+
if (R.isPhysical()) {
299+
if (Defs.available(R))
300+
ReuseBP = R;
301+
} else if (R.isVirtual()) {
302+
if (!PassedCall)
303+
ReuseBP = R;
304+
}
305+
break;
306+
}
307+
231308
auto &MRI = MF.getRegInfo();
232-
Register TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
233-
const DebugLoc &DL = MI.getDebugLoc();
234-
BuildMI(MB, II, DL, HII.get(Hexagon::A2_addi), TmpR)
235-
.addReg(BP)
236-
.addImm(RealOffset);
237-
BP = TmpR;
238-
RealOffset = 0;
239-
IsKill = true;
309+
if (!ReuseBP) {
310+
ReuseBP = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
311+
const DebugLoc &DL = MI.getDebugLoc();
312+
BuildMI(MB, II, DL, HII.get(Hexagon::A2_addi), ReuseBP)
313+
.addReg(BP)
314+
.addImm(RealOffset);
315+
}
316+
BP = ReuseBP;
317+
RealOffset = InstOffset;
240318
}
241319

242-
MI.getOperand(FIOp).ChangeToRegister(BP, false, false, IsKill);
320+
MI.getOperand(FIOp).ChangeToRegister(BP, false, false, false);
243321
MI.getOperand(FIOp+1).ChangeToImmediate(RealOffset);
244322
}
245323

Lines changed: 212 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,212 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -march=hexagon < %s | FileCheck %s
3+
target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
4+
target triple = "hexagon"
5+
6+
@g0 = external dso_local global <64 x i32>, align 128
7+
@g1 = external hidden unnamed_addr constant [110 x i8], align 1
8+
@g2 = external hidden unnamed_addr constant [102 x i8], align 1
9+
@g3 = external hidden unnamed_addr constant [110 x i8], align 1
10+
11+
declare dso_local void @f0() #0
12+
13+
declare dso_local void @f1(i8*, ...) #0
14+
15+
; Function Attrs: nounwind readnone
16+
declare <32 x i32> @llvm.hexagon.V6.vandqrt.128B(<128 x i1>, i32) #1
17+
18+
; Function Attrs: nounwind readnone
19+
declare <128 x i1> @llvm.hexagon.V6.vandvrt.128B(<32 x i32>, i32) #1
20+
21+
; Function Attrs: nounwind readnone
22+
declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32) #1
23+
24+
; Function Attrs: nounwind readnone
25+
declare <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32>, i32, i32 immarg) #1
26+
27+
; Function Attrs: nounwind readnone
28+
declare <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32>, <32 x i32>) #1
29+
30+
define dso_local void @f2() #0 {
31+
; CHECK-LABEL: f2:
32+
; CHECK: // %bb.0: // %b0
33+
; CHECK-NEXT: {
34+
; CHECK-NEXT: r1:0 = combine(#2,##16843009)
35+
; CHECK-NEXT: allocframe(r29,#1536):raw
36+
; CHECK-NEXT: }
37+
; CHECK-NEXT: {
38+
; CHECK-NEXT: v1 = vsplat(r1)
39+
; CHECK-NEXT: r17:16 = combine(#-1,#1)
40+
; CHECK-NEXT: r29 = and(r29,#-256)
41+
; CHECK-NEXT: memd(r30+#-8) = r17:16
42+
; CHECK-NEXT: } // 8-byte Folded Spill
43+
; CHECK-NEXT: {
44+
; CHECK-NEXT: v0 = vsplat(r16)
45+
; CHECK-NEXT: r2 = add(r29,#2048)
46+
; CHECK-NEXT: memd(r30+#-16) = r19:18
47+
; CHECK-NEXT: } // 8-byte Folded Spill
48+
; CHECK-NEXT: {
49+
; CHECK-NEXT: q0 = vand(v0,r0)
50+
; CHECK-NEXT: r18 = ##-2147483648
51+
; CHECK-NEXT: vmem(r2+#-7) = v0
52+
; CHECK-NEXT: } // 128-byte Folded Spill
53+
; CHECK-NEXT: {
54+
; CHECK-NEXT: v0 = vand(q0,r17)
55+
; CHECK-NEXT: r0 = ##g1
56+
; CHECK-NEXT: memd(r30+#-24) = r21:20
57+
; CHECK-NEXT: } // 8-byte Folded Spill
58+
; CHECK-NEXT: {
59+
; CHECK-NEXT: r19 = ##g0+128
60+
; CHECK-NEXT: vmem(r2+#-6) = v0
61+
; CHECK-NEXT: }
62+
; CHECK-NEXT: {
63+
; CHECK-NEXT: v3:2.h = vadd(v0.ub,v1.ub)
64+
; CHECK-NEXT: r20 = ##g0
65+
; CHECK-NEXT: vmem(r29+#5) = v1
66+
; CHECK-NEXT: } // 128-byte Folded Spill
67+
; CHECK-NEXT: {
68+
; CHECK-NEXT: vmem(r29+#6) = v2
69+
; CHECK-NEXT: } // 256-byte Folded Spill
70+
; CHECK-NEXT: {
71+
; CHECK-NEXT: v31:30.uw = vrmpy(v3:2.ub,r18.ub,#0)
72+
; CHECK-NEXT: vmem(r29+#7) = v3
73+
; CHECK-NEXT: } // 256-byte Folded Spill
74+
; CHECK-NEXT: {
75+
; CHECK-NEXT: vmem(r19+#0) = v31
76+
; CHECK-NEXT: }
77+
; CHECK-NEXT: {
78+
; CHECK-NEXT: call f1
79+
; CHECK-NEXT: vmem(r20+#0) = v30
80+
; CHECK-NEXT: }
81+
; CHECK-NEXT: {
82+
; CHECK-NEXT: r0 = add(r29,#2048)
83+
; CHECK-NEXT: }
84+
; CHECK-NEXT: {
85+
; CHECK-NEXT: v0 = vmem(r0+#-7)
86+
; CHECK-NEXT: } // 128-byte Folded Reload
87+
; CHECK-NEXT: {
88+
; CHECK-NEXT: v1:0.h = vadd(v0.ub,v0.ub)
89+
; CHECK-NEXT: r0 = ##g2
90+
; CHECK-NEXT: vmem(r29+#2) = v0.new
91+
; CHECK-NEXT: } // 256-byte Folded Spill
92+
; CHECK-NEXT: {
93+
; CHECK-NEXT: vmem(r29+#3) = v1
94+
; CHECK-NEXT: } // 256-byte Folded Spill
95+
; CHECK-NEXT: {
96+
; CHECK-NEXT: v1:0.uw = vrmpy(v1:0.ub,r17.ub,#0)
97+
; CHECK-NEXT: vmem(r19+#0) = v1.new
98+
; CHECK-NEXT: }
99+
; CHECK-NEXT: {
100+
; CHECK-NEXT: call f1
101+
; CHECK-NEXT: vmem(r20+#0) = v0
102+
; CHECK-NEXT: }
103+
; CHECK-NEXT: {
104+
; CHECK-NEXT: r0 = ##2147483647
105+
; CHECK-NEXT: v0 = vmem(r29+#2)
106+
; CHECK-NEXT: } // 256-byte Folded Reload
107+
; CHECK-NEXT: {
108+
; CHECK-NEXT: v1 = vmem(r29+#3)
109+
; CHECK-NEXT: } // 256-byte Folded Reload
110+
; CHECK-NEXT: {
111+
; CHECK-NEXT: v1:0.uw = vrmpy(v1:0.ub,r0.ub,#1)
112+
; CHECK-NEXT: r0 = ##g3
113+
; CHECK-NEXT: vmem(r19+#0) = v1.new
114+
; CHECK-NEXT: }
115+
; CHECK-NEXT: {
116+
; CHECK-NEXT: call f1
117+
; CHECK-NEXT: vmem(r20+#0) = v0
118+
; CHECK-NEXT: }
119+
; CHECK-NEXT: {
120+
; CHECK-NEXT: v0 = vmem(r29+#6)
121+
; CHECK-NEXT: } // 256-byte Folded Reload
122+
; CHECK-NEXT: {
123+
; CHECK-NEXT: v1 = vmem(r29+#7)
124+
; CHECK-NEXT: } // 256-byte Folded Reload
125+
; CHECK-NEXT: {
126+
; CHECK-NEXT: v1:0.uw = vrmpy(v1:0.ub,r18.ub,#1)
127+
; CHECK-NEXT: vmem(r19+#0) = v1.new
128+
; CHECK-NEXT: }
129+
; CHECK-NEXT: {
130+
; CHECK-NEXT: call f0
131+
; CHECK-NEXT: vmem(r20+#0) = v0
132+
; CHECK-NEXT: }
133+
; CHECK-NEXT: {
134+
; CHECK-NEXT: r0 = #0
135+
; CHECK-NEXT: v0 = vmem(r29+#6)
136+
; CHECK-NEXT: } // 256-byte Folded Reload
137+
; CHECK-NEXT: {
138+
; CHECK-NEXT: v1 = vmem(r29+#7)
139+
; CHECK-NEXT: } // 256-byte Folded Reload
140+
; CHECK-NEXT: {
141+
; CHECK-NEXT: v1:0.uw = vrmpy(v1:0.ub,r0.ub,#1)
142+
; CHECK-NEXT: vmem(r19+#0) = v1.new
143+
; CHECK-NEXT: }
144+
; CHECK-NEXT: {
145+
; CHECK-NEXT: call f0
146+
; CHECK-NEXT: vmem(r20+#0) = v0
147+
; CHECK-NEXT: }
148+
; CHECK-NEXT: {
149+
; CHECK-NEXT: r0 = add(r29,#2048)
150+
; CHECK-NEXT: v1 = vmem(r29+#5)
151+
; CHECK-NEXT: } // 128-byte Folded Reload
152+
; CHECK-NEXT: {
153+
; CHECK-NEXT: v0 = vmem(r0+#-7)
154+
; CHECK-NEXT: } // 128-byte Folded Reload
155+
; CHECK-NEXT: {
156+
; CHECK-NEXT: v1:0.h = vadd(v0.ub,v1.ub)
157+
; CHECK-NEXT: }
158+
; CHECK-NEXT: {
159+
; CHECK-NEXT: v1:0.uw = vrmpy(v1:0.ub,r16.ub,#1)
160+
; CHECK-NEXT: r17:16 = memd(r30+#-8)
161+
; CHECK-NEXT: vmem(r19+#0) = v1.new
162+
; CHECK-NEXT: } // 8-byte Folded Reload
163+
; CHECK-NEXT: {
164+
; CHECK-NEXT: r19:18 = memd(r30+#-16)
165+
; CHECK-NEXT: vmem(r20+#0) = v0
166+
; CHECK-NEXT: } // 8-byte Folded Reload
167+
; CHECK-NEXT: {
168+
; CHECK-NEXT: r21:20 = memd(r30+#-24)
169+
; CHECK-NEXT: r31:30 = dealloc_return(r30):raw
170+
; CHECK-NEXT: } // 8-byte Folded Reload
171+
b0:
172+
%v0 = alloca <32 x i32>, align 128
173+
%v1 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1)
174+
%v2 = call <128 x i1> @llvm.hexagon.V6.vandvrt.128B(<32 x i32> %v1, i32 16843009)
175+
%v3 = call <32 x i32> @llvm.hexagon.V6.vandqrt.128B(<128 x i1> %v2, i32 -1)
176+
store <32 x i32> %v3, <32 x i32>* %v0, align 128
177+
%v4 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 2)
178+
%v5 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> undef, <32 x i32> %v4)
179+
%v6 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v5, i32 -2147483648, i32 0)
180+
store <64 x i32> %v6, <64 x i32>* @g0, align 128
181+
call void (i8*, ...) @f1(i8* getelementptr inbounds ([110 x i8], [110 x i8]* @g1, i32 0, i32 0)) #2
182+
%v7 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1)
183+
%v8 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> %v7, <32 x i32> undef)
184+
%v9 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v8, i32 -1, i32 0)
185+
store <64 x i32> %v9, <64 x i32>* @g0, align 128
186+
call void (i8*, ...) @f1(i8* getelementptr inbounds ([102 x i8], [102 x i8]* @g2, i32 0, i32 0)) #2
187+
%v10 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1)
188+
%v11 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> %v10, <32 x i32> undef)
189+
%v12 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v11, i32 2147483647, i32 1)
190+
store <64 x i32> %v12, <64 x i32>* @g0, align 128
191+
call void (i8*, ...) @f1(i8* getelementptr inbounds ([110 x i8], [110 x i8]* @g3, i32 0, i32 0)) #2
192+
%v13 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 2)
193+
%v14 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> undef, <32 x i32> %v13)
194+
%v15 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v14, i32 -2147483648, i32 1)
195+
store <64 x i32> %v15, <64 x i32>* @g0, align 128
196+
call void @f0() #2
197+
%v16 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 2)
198+
%v17 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> undef, <32 x i32> %v16)
199+
%v18 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v17, i32 0, i32 1)
200+
store <64 x i32> %v18, <64 x i32>* @g0, align 128
201+
call void @f0() #2
202+
%v19 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1)
203+
%v20 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 2)
204+
%v21 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> %v19, <32 x i32> %v20)
205+
%v22 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v21, i32 1, i32 1)
206+
store <64 x i32> %v22, <64 x i32>* @g0, align 128
207+
ret void
208+
}
209+
210+
attributes #0 = { nounwind "use-soft-float"="false" "target-cpu"="hexagonv66" "target-features"="+hvxv66,+hvx-length128b" }
211+
attributes #1 = { nounwind readnone }
212+
attributes #2 = { nounwind optsize }

0 commit comments

Comments
 (0)