Skip to content

Commit cc8b901

Browse files
committed
[lld] Support thumb PLTs
We are using PLTs for cortex-m33 which only supports thumb.
1 parent 7564566 commit cc8b901

File tree

4 files changed

+257
-53
lines changed

4 files changed

+257
-53
lines changed

lld/ELF/Arch/ARM.cpp

Lines changed: 123 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -231,36 +231,71 @@ static void writePltHeaderLong(uint8_t *buf) {
231231
// The default PLT header requires the .got.plt to be within 128 Mb of the
232232
// .plt in the positive direction.
233233
void ARM::writePltHeader(uint8_t *buf) const {
234-
// Use a similar sequence to that in writePlt(), the difference is the calling
235-
// conventions mean we use lr instead of ip. The PLT entry is responsible for
236-
// saving lr on the stack, the dynamic loader is responsible for reloading
237-
// it.
238-
const uint32_t pltData[] = {
239-
0xe52de004, // L1: str lr, [sp,#-4]!
240-
0xe28fe600, // add lr, pc, #0x0NN00000 &(.got.plt - L1 - 4)
241-
0xe28eea00, // add lr, lr, #0x000NN000 &(.got.plt - L1 - 4)
242-
0xe5bef000, // ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4)
243-
};
244-
245-
uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4;
246-
if (!llvm::isUInt<27>(offset)) {
247-
// We cannot encode the Offset, use the long form.
248-
writePltHeaderLong(buf);
249-
return;
234+
if (!config->armThumbPLTs) {
235+
// Use a similar sequence to that in writePlt(), the difference is the calling
236+
// conventions mean we use lr instead of ip. The PLT entry is responsible for
237+
// saving lr on the stack, the dynamic loader is responsible for reloading
238+
// it.
239+
const uint32_t pltData[] = {
240+
0xe52de004, // L1: str lr, [sp,#-4]!
241+
0xe28fe600, // add lr, pc, #0x0NN00000 &(.got.plt - L1 - 4)
242+
0xe28eea00, // add lr, lr, #0x000NN000 &(.got.plt - L1 - 4)
243+
0xe5bef000, // ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4)
244+
};
245+
246+
uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4;
247+
if (!llvm::isUInt<27>(offset)) {
248+
// We cannot encode the Offset, use the long form.
249+
writePltHeaderLong(buf);
250+
return;
251+
}
252+
write32(buf + 0, pltData[0]);
253+
write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff));
254+
write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff));
255+
write32(buf + 12, pltData[3] | (offset & 0xfff));
256+
memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
257+
memcpy(buf + 20, trapInstr.data(), 4);
258+
memcpy(buf + 24, trapInstr.data(), 4);
259+
memcpy(buf + 28, trapInstr.data(), 4);
260+
} else {
261+
// The instruction sequence for thumb:
262+
//
263+
// 0: b500 push {lr}
264+
// 2: f8df e008 ldr.w lr, [pc, #0x8] @ 0xc <func+0xc>
265+
// 6: 44fe add lr, pc
266+
// 8: f85e ff08 ldr pc, [lr, #8]!
267+
// c: .word .got.plt - .plt - 16
268+
//
269+
// At 0x8, we want to jump to .got.plt, the -16 accounts for 8 bytes from
270+
// `pc` in the add instruction and 8 bytes for the `lr` adjustment.
271+
//
272+
uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 16;
273+
assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset");
274+
write16(buf + 0, 0xb500);
275+
// Split into two halves to support endianness correctly.
276+
write16(buf + 2, 0xf8df);
277+
write16(buf + 4, 0xe008);
278+
write16(buf + 6, 0x44fe);
279+
// Split into two halves to support endianness correctly.
280+
write16(buf + 8, 0xf85e);
281+
write16(buf + 10, 0xff08);
282+
write32(buf + 12, offset);
283+
284+
memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
285+
memcpy(buf + 20, trapInstr.data(), 4);
286+
memcpy(buf + 24, trapInstr.data(), 4);
287+
memcpy(buf + 28, trapInstr.data(), 4);
250288
}
251-
write32(buf + 0, pltData[0]);
252-
write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff));
253-
write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff));
254-
write32(buf + 12, pltData[3] | (offset & 0xfff));
255-
memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
256-
memcpy(buf + 20, trapInstr.data(), 4);
257-
memcpy(buf + 24, trapInstr.data(), 4);
258-
memcpy(buf + 28, trapInstr.data(), 4);
259289
}
260290

261291
void ARM::addPltHeaderSymbols(InputSection &isec) const {
262-
addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec);
263-
addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec);
292+
if (!config->armThumbPLTs) {
293+
addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec);
294+
addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec);
295+
} else {
296+
addSyntheticLocal("$t", STT_NOTYPE, 0, 0, isec);
297+
addSyntheticLocal("$d", STT_NOTYPE, 12, 0, isec);
298+
}
264299
}
265300

266301
// Long form PLT entries that do not have any restrictions on the displacement
@@ -279,32 +314,65 @@ static void writePltLong(uint8_t *buf, uint64_t gotPltEntryAddr,
279314
// .plt in the positive direction.
280315
void ARM::writePlt(uint8_t *buf, const Symbol &sym,
281316
uint64_t pltEntryAddr) const {
282-
// The PLT entry is similar to the example given in Appendix A of ELF for
283-
// the Arm Architecture. Instead of using the Group Relocations to find the
284-
// optimal rotation for the 8-bit immediate used in the add instructions we
285-
// hard code the most compact rotations for simplicity. This saves a load
286-
// instruction over the long plt sequences.
287-
const uint32_t pltData[] = {
288-
0xe28fc600, // L1: add ip, pc, #0x0NN00000 Offset(&(.got.plt) - L1 - 8
289-
0xe28cca00, // add ip, ip, #0x000NN000 Offset(&(.got.plt) - L1 - 8
290-
0xe5bcf000, // ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8
291-
};
292317

293-
uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8;
294-
if (!llvm::isUInt<27>(offset)) {
295-
// We cannot encode the Offset, use the long form.
296-
writePltLong(buf, sym.getGotPltVA(), pltEntryAddr);
297-
return;
318+
if (!config->armThumbPLTs) {
319+
uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8;
320+
321+
// The PLT entry is similar to the example given in Appendix A of ELF for
322+
// the Arm Architecture. Instead of using the Group Relocations to find the
323+
// optimal rotation for the 8-bit immediate used in the add instructions we
324+
// hard code the most compact rotations for simplicity. This saves a load
325+
// instruction over the long plt sequences.
326+
const uint32_t pltData[] = {
327+
0xe28fc600, // L1: add ip, pc, #0x0NN00000 Offset(&(.got.plt) - L1 - 8
328+
0xe28cca00, // add ip, ip, #0x000NN000 Offset(&(.got.plt) - L1 - 8
329+
0xe5bcf000, // ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8
330+
};
331+
if (!llvm::isUInt<27>(offset)) {
332+
// We cannot encode the Offset, use the long form.
333+
writePltLong(buf, sym.getGotPltVA(), pltEntryAddr);
334+
return;
335+
}
336+
write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff));
337+
write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff));
338+
write32(buf + 8, pltData[2] | (offset & 0xfff));
339+
memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary
340+
} else {
341+
uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 12;
342+
assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset");
343+
344+
// A PLT entry will be:
345+
//
346+
// movw ip, #<lower 16 bits>
347+
// movt ip, #<upper 16 bits>
348+
// add ip, pc
349+
// L1: ldr.w pc, [ip]
350+
// b L1
351+
//
352+
// where ip = r12 = 0xc
353+
354+
// movw ip, #<lower 16 bits>
355+
write16(buf + 2, 0x0c00); // use `ip`
356+
relocateNoSym(buf, R_ARM_THM_MOVW_ABS_NC, offset);
357+
358+
// movt ip, #<upper 16 bits>
359+
write16(buf + 6, 0x0c00); // use `ip`
360+
relocateNoSym(buf + 4, R_ARM_THM_MOVT_ABS, offset);
361+
362+
write16(buf + 8, 0x44fc); // add ip, pc
363+
write16(buf + 10, 0xf8dc); // ldr.w pc, [ip] (bottom half)
364+
write16(buf + 12, 0xf000); // ldr.w pc, [ip] (upper half)
365+
write16(buf + 14, 0xe7fc); // Branch to previous instruction
298366
}
299-
write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff));
300-
write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff));
301-
write32(buf + 8, pltData[2] | (offset & 0xfff));
302-
memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary
303367
}
304368

305369
void ARM::addPltSymbols(InputSection &isec, uint64_t off) const {
306-
addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec);
307-
addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec);
370+
if (!config->armThumbPLTs) {
371+
addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec);
372+
addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec);
373+
} else {
374+
addSyntheticLocal("$t", STT_NOTYPE, off, 0, isec);
375+
}
308376
}
309377

310378
bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
@@ -325,6 +393,8 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
325393
case R_ARM_JUMP24:
326394
// Source is ARM, all PLT entries are ARM so no interworking required.
327395
// Otherwise we need to interwork if STT_FUNC Symbol has bit 0 set (Thumb).
396+
assert(!config->armThumbPLTs &&
397+
"If the source is ARM, we should not need Thumb PLTs");
328398
if (s.isFunc() && expr == R_PC && (s.getVA() & 1))
329399
return true;
330400
[[fallthrough]];
@@ -335,9 +405,9 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
335405
}
336406
case R_ARM_THM_JUMP19:
337407
case R_ARM_THM_JUMP24:
338-
// Source is Thumb, all PLT entries are ARM so interworking is required.
408+
// Source is Thumb, when all PLT entries are ARM interworking is required.
339409
// Otherwise we need to interwork if STT_FUNC Symbol has bit 0 clear (ARM).
340-
if (expr == R_PLT_PC || (s.isFunc() && (s.getVA() & 1) == 0))
410+
if ((expr == R_PLT_PC && !config->armThumbPLTs) || (s.isFunc() && (s.getVA() & 1) == 0))
341411
return true;
342412
[[fallthrough]];
343413
case R_ARM_THM_CALL: {
@@ -547,7 +617,6 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
547617
// STT_FUNC we choose whether to write a BL or BLX depending on the
548618
// value of bit 0 of Val. With bit 0 == 1 denoting Thumb. If the symbol is
549619
// not of type STT_FUNC then we must preserve the original instruction.
550-
// PLT entries are always ARM state so we know we don't need to interwork.
551620
assert(rel.sym); // R_ARM_CALL is always reached via relocate().
552621
bool bit0Thumb = val & 1;
553622
bool isBlx = (read32(loc) & 0xfe000000) == 0xfa000000;
@@ -606,12 +675,13 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
606675
// PLT entries are ARM state unless Thumb PLTs are in use; only ARM PLT entries require interworking.
607676
assert(rel.sym); // R_ARM_THM_CALL is always reached via relocate().
608677
bool bit0Thumb = val & 1;
678+
bool useThumb = bit0Thumb || config->armThumbPLTs;
609679
bool isBlx = (read16(loc + 2) & 0x1000) == 0;
610680
// lld 10.0 and before always used bit0Thumb when deciding to write a BLX
611-
// even when type not STT_FUNC. PLT entries generated by LLD are always ARM.
612-
if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == bit0Thumb)
681+
// even when type not STT_FUNC.
682+
if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == useThumb)
613683
stateChangeWarning(loc, rel.type, *rel.sym);
614-
if (rel.sym->isFunc() || rel.sym->isInPlt() ? !bit0Thumb : isBlx) {
684+
if ((rel.sym->isFunc() || rel.sym->isInPlt()) ? !useThumb : isBlx) {
615685
// We are writing a BLX. Ensure BLX destination is 4-byte aligned. As
616686
// the BLX instruction may only be two byte aligned. This must be done
617687
// before overflow check.

lld/ELF/Config.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ struct Config {
212212
bool allowMultipleDefinition;
213213
bool fatLTOObjects;
214214
bool androidPackDynRelocs = false;
215+
bool armThumbPLTs = false;
215216
bool armHasBlx = false;
216217
bool armHasMovtMovw = false;
217218
bool armJ1J2BranchEncoding = false;

lld/ELF/InputFiles.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,18 @@ static void updateSupportedARMFeatures(const ARMAttributeParser &attributes) {
194194
if (arch >= ARMBuildAttrs::CPUArch::v8_M_Base &&
195195
profile == ARMBuildAttrs::MicroControllerProfile)
196196
config->armCMSESupport = true;
197+
198+
// The thumb PLT entries require Thumb2 which can be used on multiple archs.
199+
// For now, let's limit it to ones where ARM isn't available and we know we have
200+
// Thumb2.
201+
std::optional<unsigned> armISA =
202+
attributes.getAttributeValue(ARMBuildAttrs::ARM_ISA_use);
203+
std::optional<unsigned> thumb =
204+
attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use);
205+
bool noArmISA = !armISA || *armISA == ARMBuildAttrs::Not_Allowed;
206+
bool hasThumb2 = thumb && *thumb >= ARMBuildAttrs::AllowThumb32;
207+
if (noArmISA && hasThumb2)
208+
config->armThumbPLTs = true;
197209
}
198210

199211
InputFile::InputFile(Kind k, MemoryBufferRef m)

lld/test/ELF/armv8-thumb-plt-reloc.s

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// REQUIRES: arm
2+
// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1
3+
// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %s -o %t2
4+
// RUN: ld.lld %t1 %t2 -o %t
5+
// RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s
6+
// RUN: ld.lld -shared %t1 %t2 -o %t.so
7+
// RUN: llvm-objdump --no-print-imm-hex -d %t.so | FileCheck --check-prefix=DSO %s
8+
// RUN: llvm-readobj -S -r %t.so | FileCheck -check-prefix=DSOREL %s
9+
10+
// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1.be
11+
// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumbeb --mcpu=cortex-m33 %s -o %t2.be
12+
// RUN: ld.lld %t1.be %t2.be -o %t.be
13+
// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s
14+
// RUN: ld.lld -shared %t1.be %t2.be -o %t.so.be
15+
// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s
16+
// RUN: llvm-readobj -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s
17+
18+
// RUN: ld.lld --be8 %t1.be %t2.be -o %t.be
19+
// RUN: llvm-objdump --no-print-imm-hex -d %t.be | FileCheck %s
20+
// RUN: ld.lld --be8 -shared %t1.be %t2.be -o %t.so.be
21+
// RUN: llvm-objdump --no-print-imm-hex -d %t.so.be | FileCheck --check-prefix=DSO %s
22+
// RUN: llvm-readobj -S -r %t.so.be | FileCheck -check-prefix=DSOREL %s
23+
24+
// Test PLT entry generation
25+
.text
26+
.align 2
27+
.globl _start
28+
.type _start,%function
29+
_start:
30+
// FIXME, interworking is only supported for BL via BLX at the moment, when
31+
// interworking thunks are available for b.w and b<cond>.w this can be altered
32+
// to test the different forms of interworking.
33+
bl func1
34+
bl func2
35+
bl func3
36+
37+
// Executable, expect no PLT
38+
// CHECK: Disassembly of section .text:
39+
// CHECK-EMPTY:
40+
// CHECK-NEXT: <func1>:
41+
// CHECK-NEXT: 200b4: 4770 bx lr
42+
// CHECK: <func2>:
43+
// CHECK-NEXT: 200b6: 4770 bx lr
44+
// CHECK: <func3>:
45+
// CHECK-NEXT: 200b8: 4770 bx lr
46+
// CHECK-NEXT: 200ba: d4d4
47+
// CHECK: <_start>:
48+
// CHECK-NEXT: 200bc: f7ff fffa bl 0x200b4 <func1>
49+
// CHECK-NEXT: 200c0: f7ff fff9 bl 0x200b6 <func2>
50+
// CHECK-NEXT: 200c4: f7ff fff8 bl 0x200b8 <func3>
51+
52+
// Expect PLT entries as symbols can be preempted
53+
// Both .text and .plt are Thumb here, so a single llvm-objdump pass
54+
// disassembles both sections.
55+
// DSO: Disassembly of section .text:
56+
// DSO-EMPTY:
57+
// DSO-NEXT: <func1>:
58+
// DSO-NEXT: 10214: 4770 bx lr
59+
// DSO: <func2>:
60+
// DSO-NEXT: 10216: 4770 bx lr
61+
// DSO: <func3>:
62+
// DSO-NEXT: 10218: 4770 bx lr
63+
// DSO-NEXT: 1021a: d4d4
64+
// DSO: <_start>:
65+
// 0x10250 = PLT func1
66+
// DSO-NEXT: 1021c: f000 f818 bl 0x10250
67+
// 0x10260 = PLT func2
68+
// DSO-NEXT: 10220: f000 f81e bl 0x10260
69+
// 0x10270 = PLT func3
70+
// DSO-NEXT: 10224: f000 f824 bl 0x10270
71+
// DSO: Disassembly of section .plt:
72+
// DSO-EMPTY:
73+
// DSO-NEXT: <.plt>:
74+
// DSO-NEXT: 10230: b500 push {lr}
75+
// DSO-NEXT: 10232: f8df e008 ldr.w lr, [pc, #8]
76+
// DSO-NEXT: 10236: 44fe add lr, pc
77+
// DSO-NEXT: 10238: f85e ff08 ldr pc, [lr, #8]!
78+
// 0x20098 = .got.plt (0x302D8) - pc (0x10238 = .plt + 8) - 8
79+
// DSO-NEXT: 1023c: {{.*}} .word 0x00020098
80+
// DSO-NEXT: 10240: d4 d4 d4 d4 .word 0xd4d4d4d4
81+
// DSO-NEXT: 10244: d4 d4 d4 d4 .word 0xd4d4d4d4
82+
// DSO-NEXT: 10248: d4 d4 d4 d4 .word 0xd4d4d4d4
83+
// DSO-NEXT: 1024c: d4 d4 d4 d4 .word 0xd4d4d4d4
84+
85+
// 136 + 2 << 16 + 0x1025c = 0x302e4 = got entry 1
86+
// DSO-NEXT: 10250: f240 0c88 movw r12, #136
87+
// DSO-NEXT: 10254: f2c0 0c02 movt r12, #2
88+
// DSO-NEXT: 10258: 44fc add r12, pc
89+
// DSO-NEXT: 1025a: f8dc f000 ldr.w pc, [r12]
90+
// DSO-NEXT: 1025e: e7fc b 0x1025a
91+
// 124 + 2 << 16 + 0x1026c = 0x302e8 = got entry 2
92+
// DSO-NEXT: 10260: f240 0c7c movw r12, #124
93+
// DSO-NEXT: 10264: f2c0 0c02 movt r12, #2
94+
// DSO-NEXT: 10268: 44fc add r12, pc
95+
// DSO-NEXT: 1026a: f8dc f000 ldr.w pc, [r12]
96+
// DSO-NEXT: 1026e: e7fc b 0x1026a
97+
// 112 + 2 << 16 + 0x1027c = 0x302ec = got entry 3
98+
// DSO-NEXT: 10270: f240 0c70 movw r12, #112
99+
// DSO-NEXT: 10274: f2c0 0c02 movt r12, #2
100+
// DSO-NEXT: 10278: 44fc add r12, pc
101+
// DSO-NEXT: 1027a: f8dc f000 ldr.w pc, [r12]
102+
// DSO-NEXT: 1027e: e7fc b 0x1027a
103+
104+
// DSOREL: Name: .got.plt
105+
// DSOREL-NEXT: Type: SHT_PROGBITS
106+
// DSOREL-NEXT: Flags [
107+
// DSOREL-NEXT: SHF_ALLOC
108+
// DSOREL-NEXT: SHF_WRITE
109+
// DSOREL-NEXT: ]
110+
// DSOREL-NEXT: Address: 0x302D8
111+
// DSOREL-NEXT: Offset:
112+
// DSOREL-NEXT: Size: 24
113+
// DSOREL-NEXT: Link:
114+
// DSOREL-NEXT: Info:
115+
// DSOREL-NEXT: AddressAlignment: 4
116+
// DSOREL-NEXT: EntrySize:
117+
// DSOREL: Relocations [
118+
// DSOREL-NEXT: Section (5) .rel.plt {
119+
// DSOREL-NEXT: 0x302E4 R_ARM_JUMP_SLOT func1
120+
// DSOREL-NEXT: 0x302E8 R_ARM_JUMP_SLOT func2
121+
// DSOREL-NEXT: 0x302EC R_ARM_JUMP_SLOT func3

0 commit comments

Comments
 (0)