Skip to content

Commit f83ab2b

Browse files
[ARM] Improve generation of thumb stack accesses
Currently, when a stack access is out of range of an sp-relative ldr or str, we jump straight to generating the offset with a literal pool load or mov32 pseudo-instruction. This patch improves that in two ways:

* If the offset is within range of an sp-relative add plus an ldr, then use that.
* When we use the mov32 pseudo-instruction, if putting part of the offset into the ldr will simplify the expansion of the mov32, then do so.

Differential Revision: https://reviews.llvm.org/D156875
1 parent a749b32 commit f83ab2b

File tree

6 files changed

+190
-35
lines changed

6 files changed

+190
-35
lines changed

llvm/lib/Target/ARM/ThumbRegisterInfo.cpp

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,19 @@ static void emitThumbRegPlusImmInReg(
128128
const ARMBaseRegisterInfo &MRI, unsigned MIFlags = MachineInstr::NoFlags) {
129129
MachineFunction &MF = *MBB.getParent();
130130
const ARMSubtarget &ST = MF.getSubtarget<ARMSubtarget>();
131+
132+
// Use a single sp-relative add if the immediate is small enough.
133+
if (BaseReg == ARM::SP &&
134+
(DestReg.isVirtual() || isARMLowRegister(DestReg)) && NumBytes >= 0 &&
135+
NumBytes <= 1020 && (NumBytes % 4) == 0) {
136+
BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), DestReg)
137+
.addReg(ARM::SP)
138+
.addImm(NumBytes / 4)
139+
.add(predOps(ARMCC::AL))
140+
.setMIFlags(MIFlags);
141+
return;
142+
}
143+
131144
bool isHigh = !isARMLowRegister(DestReg) ||
132145
(BaseReg != 0 && !isARMLowRegister(BaseReg));
133146
bool isSub = false;
@@ -422,19 +435,33 @@ bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II,
422435
return true;
423436
}
424437

438+
// The offset doesn't fit, but we may be able to put some of the offset into
439+
// the ldr to simplify the generation of the rest of it.
425440
NumBits = 5;
426441
Mask = (1 << NumBits) - 1;
427-
428-
// If this is a thumb spill / restore, we will be using a constpool load to
429-
// materialize the offset.
430-
if (Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
431-
ImmOp.ChangeToImmediate(0);
432-
} else {
433-
// Otherwise, it didn't fit. Pull in what we can to simplify the immed.
434-
ImmedOffset = ImmedOffset & Mask;
435-
ImmOp.ChangeToImmediate(ImmedOffset);
436-
Offset &= ~(Mask * Scale);
442+
InstrOffs = 0;
443+
auto &ST = MF.getSubtarget<ARMSubtarget>();
444+
// If using the maximum ldr offset will put the rest into the range of a
445+
// single sp-relative add then do so.
446+
if (FrameReg == ARM::SP && Offset - (Mask * Scale) <= 1020) {
447+
InstrOffs = Mask;
448+
} else if (ST.genExecuteOnly()) {
449+
// With execute-only the offset is generated either with movw+movt or an
450+
// add+lsl sequence. If subtracting an offset will make the top half zero
451+
// then that saves a movt or lsl+add. Otherwise if we don't have movw then
452+
// we may be able to subtract a value such that it makes the bottom byte
453+
// zero, saving an add.
454+
unsigned BottomBits = (Offset / Scale) & Mask;
455+
bool CanMakeBottomByteZero = ((Offset - BottomBits * Scale) & 0xff) == 0;
456+
bool TopHalfZero = (Offset & 0xffff0000) == 0;
457+
bool CanMakeTopHalfZero = ((Offset - Mask * Scale) & 0xffff0000) == 0;
458+
if (!TopHalfZero && CanMakeTopHalfZero)
459+
InstrOffs = Mask;
460+
else if (!ST.useMovt() && CanMakeBottomByteZero)
461+
InstrOffs = BottomBits;
437462
}
463+
ImmOp.ChangeToImmediate(InstrOffs);
464+
Offset -= InstrOffs * Scale;
438465
}
439466

440467
return Offset == 0;
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
; RUN: llc -mtriple=thumbv8m.base-eabi -mattr=+execute-only %s -o - | FileCheck --check-prefixes=CHECK,CHECK-MOVW %s
2+
; RUN: llc -mtriple=thumbv6m-eabi -mattr=+execute-only %s -o - | FileCheck --check-prefixes=CHECK,CHECK-NOMOVW %s
3+
4+
; Largest offset that fits into sp-relative ldr
5+
; CHECK-LABEL: ldr_range_end:
6+
; CHECK: ldr {{r[0-9]+}}, [sp, #1020]
7+
define i32 @ldr_range_end() {
8+
entry:
9+
%var = alloca i32, align 4
10+
%arr = alloca [1020 x i8], align 4
11+
%0 = load i32, ptr %var, align 4
12+
ret i32 %0
13+
}
14+
15+
; Smallest offset that fits into add+ldr
16+
; CHECK-LABEL: add_ldr_range_start:
17+
; CHECK: add [[REG:r[0-9]+]], sp, #900
18+
; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124]
19+
define i32 @add_ldr_range_start() {
20+
entry:
21+
%var = alloca i32, align 4
22+
%arr = alloca [1024 x i8], align 4
23+
%0 = load i32, ptr %var, align 4
24+
ret i32 %0
25+
}
26+
27+
; Largest offset that fits into add+ldr
28+
; CHECK-LABEL: add_ldr_range_end:
29+
; CHECK: add [[REG:r[0-9]+]], sp, #1020
30+
; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124]
31+
define i32 @add_ldr_range_end() {
32+
entry:
33+
%var = alloca i32, align 4
34+
%arr = alloca [1144 x i8], align 4
35+
%0 = load i32, ptr %var, align 4
36+
ret i32 %0
37+
}
38+
39+
; Smallest offset where we start using mov32. If we don't have movw then using
40+
; an ldr offset means we save an add.
41+
; CHECK-LABEL: mov32_range_start:
42+
; CHECK-MOVW: movw [[REG:r[0-9]+]], #1148
43+
; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #4
44+
; CHECK-NOMOVW-NEXT: lsls [[REG]], [[REG]], #8
45+
; CHECK-NEXT: add [[REG]], sp
46+
; CHECK-MOVW-NEXT: ldr {{r[0-9]+}}, [[[REG]]]
47+
; CHECK-NOMOVW-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124]
48+
define i32 @mov32_range_start() {
49+
entry:
50+
%var = alloca i32, align 4
51+
%arr = alloca [1148 x i8], align 4
52+
%0 = load i32, ptr %var, align 4
53+
ret i32 %0
54+
}
55+
56+
; Here using an ldr offset doesn't save an add so we shouldn't do it.
57+
; CHECK-LABEL: mov32_range_next:
58+
; CHECK-MOVW: movw [[REG:r[0-9]+]], #1152
59+
; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #4
60+
; CHECK-NOMOVW-NEXT: lsls [[REG]], [[REG]], #8
61+
; CHECK-NOMOVW-NEXT: adds [[REG]], #128
62+
; CHECK-NEXT: add [[REG]], sp
63+
; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]]]
64+
define i32 @mov32_range_next() {
65+
entry:
66+
%var = alloca i32, align 4
67+
%arr = alloca [1152 x i8], align 4
68+
%0 = load i32, ptr %var, align 4
69+
ret i32 %0
70+
}
71+
72+
; Smallest offset where using an ldr offset prevents needing a movt or lsl+add
73+
; CHECK-LABEL: can_clear_top_byte_start:
74+
; CHECK: add sp, {{r[0-9]+}}
75+
; CHECK-MOVW: movw [[REG:r[0-9]+]], #65412
76+
; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #255
77+
; CHECK-NOMOVW-NEXT: lsls [[REG:r[0-9]+]], [[REG:r[0-9]+]], #8
78+
; CHECK-NOMOVW-NEXT: adds [[REG:r[0-9]+]], #132
79+
; CHECK-NEXT: add [[REG]], sp
80+
; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124]
81+
define i32 @can_clear_top_byte_start() {
82+
entry:
83+
%var = alloca i32, align 4
84+
%arr = alloca [65536 x i8], align 4
85+
%0 = load i32, ptr %var, align 4
86+
ret i32 %0
87+
}
88+
89+
; Largest offset where using an ldr offset prevents needing a movt or lsl+add
90+
; CHECK-LABEL: can_clear_top_byte_end:
91+
; CHECK: add sp, {{r[0-9]+}}
92+
; CHECK-MOVW: movw [[REG:r[0-9]+]], #65532
93+
; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #255
94+
; CHECK-NOMOVW-NEXT: lsls [[REG:r[0-9]+]], [[REG:r[0-9]+]], #8
95+
; CHECK-NOMOVW-NEXT: adds [[REG:r[0-9]+]], #252
96+
; CHECK-NEXT: add [[REG]], sp
97+
; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124]
98+
define i32 @can_clear_top_byte_end() {
99+
entry:
100+
%var = alloca i32, align 4
101+
%arr = alloca [65656 x i8], align 4
102+
%0 = load i32, ptr %var, align 4
103+
ret i32 %0
104+
}
105+
106+
; Smallest offset where using an ldr offset doesn't clear the top byte, though
107+
; we can use an ldr offset if not using movt to save an add of the low byte.
108+
; CHECK-LABEL: cant_clear_top_byte_start:
109+
; CHECK: add sp, {{r[0-9]+}}
110+
; CHECK-MOVW: movw [[REG:r[0-9]+]], #124
111+
; CHECK-MOVW-NEXT: movt [[REG:r[0-9]+]], #1
112+
; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #1
113+
; CHECK-NOMOVW-NEXT: lsls [[REG:r[0-9]+]], [[REG:r[0-9]+]], #16
114+
; CHECK-NEXT: add [[REG]], sp
115+
; CHECK-MOVW-NEXT: ldr {{r[0-9]+}}, [[[REG]]]
116+
; CHECK-NOMOVW-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124]
117+
define i32 @cant_clear_top_byte_start() {
118+
entry:
119+
%var = alloca i32, align 4
120+
%arr = alloca [65660 x i8], align 4
121+
%0 = load i32, ptr %var, align 4
122+
ret i32 %0
123+
}
124+
125+
; An ldr offset doesn't help for anything, so we shouldn't do it.
126+
; CHECK-LABEL: cant_clear_top_byte_next:
127+
; CHECK: add sp, {{r[0-9]+}}
128+
; CHECK-MOVW: movw [[REG:r[0-9]+]], #128
129+
; CHECK-MOVW: movt [[REG:r[0-9]+]], #1
130+
; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #1
131+
; CHECK-NOMOVW-NEXT: lsls [[REG:r[0-9]+]], [[REG:r[0-9]+]], #16
132+
; CHECK-NOMOVW-NEXT: adds [[REG:r[0-9]+]], #128
133+
; CHECK-NEXT: add [[REG]], sp
134+
; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]]]
135+
define i32 @cant_clear_top_byte_next() {
136+
entry:
137+
%var = alloca i32, align 4
138+
%arr = alloca [65664 x i8], align 4
139+
%0 = load i32, ptr %var, align 4
140+
ret i32 %0
141+
}

llvm/test/CodeGen/ARM/large-stack.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ define i32 @test3() {
4444
;; are we choosing correct store/tSTRspi pattern for execute-only
4545
; CHECK: movs [[REG:r[0-9]+]], #0x30
4646
; CHECK-NEXT: lsls [[REG]], [[REG]], #0x18
47-
; CHECK-NEXT: adds [[REG]], #0x8
47+
; CHECK-NEXT: add [[REG]], sp
48+
; CHECK-NEXT: str {{r[0-9]+}}, [[[REG]], #0x8]
4849
%tmp1 = load i32, ptr %tmp
4950
ret i32 %tmp1
5051
}

llvm/test/CodeGen/Thumb/emergency-spill-slot.ll

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -176,18 +176,13 @@ define void @arg_emergency_spill(i32 %n, i32 %n2, i32 %n3, i32 %n4, ptr byval([2
176176
; CHECK-NEXT: @APP
177177
; CHECK-NEXT: @NO_APP
178178
; CHECK-NEXT: str r0, [sp]
179-
; CHECK-NEXT: ldr r0, .LCPI3_0
180-
; CHECK-NEXT: add r0, sp
181-
; CHECK-NEXT: str r5, [r0]
179+
; CHECK-NEXT: add r0, sp, #904
180+
; CHECK-NEXT: str r5, [r0, #124]
182181
; CHECK-NEXT: ldr r0, [sp]
183182
; CHECK-NEXT: @APP
184183
; CHECK-NEXT: @NO_APP
185184
; CHECK-NEXT: add sp, #4
186185
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
187-
; CHECK-NEXT: .p2align 2
188-
; CHECK-NEXT: @ %bb.1:
189-
; CHECK-NEXT: .LCPI3_0:
190-
; CHECK-NEXT: .long 1028 @ 0x404
191186
entry:
192187
%asm1 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},={r7},0,1,2,3,4,5,6,7"(ptr %p, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef)
193188
%asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 0

llvm/test/CodeGen/Thumb/stack-access.ll

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,19 +110,18 @@ define void @test8() {
110110
store i32 1, ptr %arr2, align 4
111111

112112
; %arr2 is in range, but this element of it is not
113-
; CHECK-DAG: ldr [[RA:r[0-9]+]], .LCPI7_2
114-
; CHECK-DAG: add [[RA]], sp
115-
; CHECK-DAG: str [[REG]], [{{r[0-9]+}}]
113+
; CHECK-DAG: add [[RA:r[0-9]+]], sp, #900
114+
; CHECK-DAG: str [[REG]], [{{r[0-9]+}}, #124]
116115
%arr2idx2 = getelementptr inbounds [224 x i32], ptr %arr2, i32 0, i32 32
117116
store i32 1, ptr %arr2idx2, align 4
118117

119118
; %arr3 is not in range
120-
; CHECK-DAG: ldr [[RB:r[0-9]+]], .LCPI7_3
119+
; CHECK-DAG: ldr [[RB:r[0-9]+]], .LCPI7_2
121120
; CHECK-DAG: add [[RB]], sp
122121
; CHECK-DAG: str [[REG]], [{{r[0-9]+}}]
123122
store i32 1, ptr %arr3, align 4
124123

125-
; CHECK-DAG: ldr [[RC:r[0-9]+]], .LCPI7_4
124+
; CHECK-DAG: ldr [[RC:r[0-9]+]], .LCPI7_3
126125
; CHECK-DAG: add [[RC]], sp
127126
; CHECK-DAG: str [[REG]], [{{r[0-9]+}}]
128127
%arr3idx2 = getelementptr inbounds [224 x i32], ptr %arr3, i32 0, i32 32

llvm/test/CodeGen/Thumb/stack_guard_remat.ll

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,38 +3,30 @@
33
; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=dynamic-no-pic -no-integrated-as | FileCheck %s -check-prefix=NO-PIC -check-prefix=DYNAMIC-NO-PIC
44

55
;PIC: foo2
6-
;PIC: ldr [[SAVED_GUARD:r[0-9]+]], [[GUARD_STACK_OFFSET:LCPI[0-9_]+]]
7-
;PIC-NEXT: add [[SAVED_GUARD]], sp
8-
;PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]]]
6+
;PIC: add [[SAVED_GUARD:r[0-9]+]], sp, #904
7+
;PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]], #124]
98
;PIC-NEXT: ldr [[ORIGINAL_GUARD:r[0-9]+]], [[ORIGINAL_GUARD_LABEL:LCPI[0-9_]+]]
109
;PIC-NEXT: [[LABEL1:LPC[0-9_]+]]:
1110
;PIC-NEXT: add [[ORIGINAL_GUARD]], pc
1211
;PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]]
1312
;PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]]
1413
;PIC-NEXT: cmp [[ORIGINAL_GUARD]], [[SAVED_GUARD]]
1514

16-
;PIC: [[GUARD_STACK_OFFSET]]:
17-
;PIC-NEXT: .long 1028
1815
;PIC: [[ORIGINAL_GUARD_LABEL]]:
1916
;PIC-NEXT: .long L___stack_chk_guard$non_lazy_ptr-([[LABEL1]]+4)
2017

2118
;NO-PIC: foo2
22-
;NO-PIC: ldr [[SAVED_GUARD:r[0-9]+]], [[GUARD_STACK_OFFSET:LCPI[0-9_]+]]
23-
;NO-PIC-NEXT: add [[SAVED_GUARD]], sp
24-
;NO-PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]]]
19+
;NO-PIC: add [[SAVED_GUARD:r[0-9]+]], sp, #904
20+
;NO-PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]], #124]
2521
;NO-PIC-NEXT: ldr [[ORIGINAL_GUARD:r[0-9]+]], [[ORIGINAL_GUARD_LABEL:LCPI[0-9_]+]]
2622
;NO-PIC-NOT: LPC
2723
;NO-PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]]
2824
;DYNAMIC-NO-PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]]
2925
;NO-PIC-NEXT: cmp [[ORIGINAL_GUARD]], [[SAVED_GUARD]]
3026

31-
;STATIC: [[GUARD_STACK_OFFSET]]:
32-
;STATIC-NEXT: .long 1028
3327
;STATIC: [[ORIGINAL_GUARD_LABEL]]:
3428
;STATIC-NEXT: .long ___stack_chk_guard
3529

36-
;DYNAMIC-NO-PIC: [[GUARD_STACK_OFFSET]]:
37-
;DYNAMIC-NO-PIC-NEXT: .long 1028
3830
;DYNAMIC-NO-PIC: [[ORIGINAL_GUARD_LABEL]]:
3931
;DYNAMIC-NO-PIC-NEXT: .long L___stack_chk_guard$non_lazy_ptr
4032

0 commit comments

Comments
 (0)