[lld] Support thumb PLTs

PiJoules · PiJoules · commit d6c9b782da04 · 2024-03-22T17:03:13.000-07:00
We are using PLTs for cortex-m33 which only supports thumb.
diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp
@@ -231,36 +231,67 @@ static void writePltHeaderLong(uint8_t *buf) {
 // The default PLT header requires the .got.plt to be within 128 Mb of the
 // .plt in the positive direction.
 void ARM::writePltHeader(uint8_t *buf) const {
-  // Use a similar sequence to that in writePlt(), the difference is the calling
-  // conventions mean we use lr instead of ip. The PLT entry is responsible for
-  // saving lr on the stack, the dynamic loader is responsible for reloading
-  // it.
-  const uint32_t pltData[] = {
-      0xe52de004, // L1: str lr, [sp,#-4]!
-      0xe28fe600, //     add lr, pc,  #0x0NN00000 &(.got.plt - L1 - 4)
-      0xe28eea00, //     add lr, lr,  #0x000NN000 &(.got.plt - L1 - 4)
-      0xe5bef000, //     ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4)
-  };
-
-  uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4;
-  if (!llvm::isUInt<27>(offset)) {
-    // We cannot encode the Offset, use the long form.
-    writePltHeaderLong(buf);
-    return;
+  if (!config->armThumbPLTs) {
+    // Use a similar sequence to that in writePlt(), the difference is the calling
+    // conventions mean we use lr instead of ip. The PLT entry is responsible for
+    // saving lr on the stack, the dynamic loader is responsible for reloading
+    // it.
+    const uint32_t pltData[] = {
+        0xe52de004, // L1: str lr, [sp,#-4]!
+        0xe28fe600, //     add lr, pc,  #0x0NN00000 &(.got.plt - L1 - 4)
+        0xe28eea00, //     add lr, lr,  #0x000NN000 &(.got.plt - L1 - 4)
+        0xe5bef000, //     ldr pc, [lr, #0x00000NNN] &(.got.plt -L1 - 4)
+    };
+
+    uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 4;
+    if (!llvm::isUInt<27>(offset)) {
+      // We cannot encode the Offset, use the long form.
+      writePltHeaderLong(buf);
+      return;
+    }
+    write32(buf + 0, pltData[0]);
+    write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff));
+    write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff));
+    write32(buf + 12, pltData[3] | (offset & 0xfff));
+    memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
+    memcpy(buf + 20, trapInstr.data(), 4);
+    memcpy(buf + 24, trapInstr.data(), 4);
+    memcpy(buf + 28, trapInstr.data(), 4);
+  } else {
+    // The instruction sequence for thumb:
+    //
+    // 0: b500          push    {lr}
+    // 2: f8df e008     ldr.w   lr, [pc, #0x8]          @ 0xe <func+0xe>
+    // 6: 44fe          add     lr, pc
+    // 8: f85e ff08     ldr     pc, [lr, #8]!
+    // e:               .word   .got.plt - .plt - 16
+    //
+    // At 0x8, we want to jump to .got.plt, the -16 accounts for 8 bytes from
+    // `pc` in the add instruction and 8 bytes for the `lr` adjustment.
+    //
+    uint64_t offset = in.gotPlt->getVA() - in.plt->getVA() - 16;
+    assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset");
+    write16(buf + 0, 0xb500);
+    write32(buf + 2, 0xe008f8df);
+    write16(buf + 6, 0x44fe);
+    write32(buf + 8, 0xff08f85e);
+    write32(buf + 12, offset);
+
+    memcpy(buf + 16, trapInstr.data(), 4);  // Pad to 32-byte boundary
+    memcpy(buf + 20, trapInstr.data(), 4);
+    memcpy(buf + 24, trapInstr.data(), 4);
+    memcpy(buf + 28, trapInstr.data(), 4);
   }
-  write32(buf + 0, pltData[0]);
-  write32(buf + 4, pltData[1] | ((offset >> 20) & 0xff));
-  write32(buf + 8, pltData[2] | ((offset >> 12) & 0xff));
-  write32(buf + 12, pltData[3] | (offset & 0xfff));
-  memcpy(buf + 16, trapInstr.data(), 4); // Pad to 32-byte boundary
-  memcpy(buf + 20, trapInstr.data(), 4);
-  memcpy(buf + 24, trapInstr.data(), 4);
-  memcpy(buf + 28, trapInstr.data(), 4);
 }
 
 void ARM::addPltHeaderSymbols(InputSection &isec) const {
-  addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec);
-  addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec);
+  if (!config->armThumbPLTs) {
+    addSyntheticLocal("$a", STT_NOTYPE, 0, 0, isec);
+    addSyntheticLocal("$d", STT_NOTYPE, 16, 0, isec);
+  } else {
+    addSyntheticLocal("$t", STT_NOTYPE, 0, 0, isec);
+    addSyntheticLocal("$d", STT_NOTYPE, 12, 0, isec);
+  }
 }
 
 // Long form PLT entries that do not have any restrictions on the displacement
@@ -279,32 +310,97 @@ static void writePltLong(uint8_t *buf, uint64_t gotPltEntryAddr,
 // .plt in the positive direction.
 void ARM::writePlt(uint8_t *buf, const Symbol &sym,
                    uint64_t pltEntryAddr) const {
-  // The PLT entry is similar to the example given in Appendix A of ELF for
-  // the Arm Architecture. Instead of using the Group Relocations to find the
-  // optimal rotation for the 8-bit immediate used in the add instructions we
-  // hard code the most compact rotations for simplicity. This saves a load
-  // instruction over the long plt sequences.
-  const uint32_t pltData[] = {
-      0xe28fc600, // L1: add ip, pc,  #0x0NN00000  Offset(&(.got.plt) - L1 - 8
-      0xe28cca00, //     add ip, ip,  #0x000NN000  Offset(&(.got.plt) - L1 - 8
-      0xe5bcf000, //     ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8
-  };
 
-  uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8;
-  if (!llvm::isUInt<27>(offset)) {
-    // We cannot encode the Offset, use the long form.
-    writePltLong(buf, sym.getGotPltVA(), pltEntryAddr);
-    return;
+  if (!config->armThumbPLTs) {
+    uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 8;
+
+    // The PLT entry is similar to the example given in Appendix A of ELF for
+    // the Arm Architecture. Instead of using the Group Relocations to find the
+    // optimal rotation for the 8-bit immediate used in the add instructions we
+    // hard code the most compact rotations for simplicity. This saves a load
+    // instruction over the long plt sequences.
+    const uint32_t pltData[] = {
+        0xe28fc600, // L1: add ip, pc,  #0x0NN00000  Offset(&(.got.plt) - L1 - 8
+        0xe28cca00, //     add ip, ip,  #0x000NN000  Offset(&(.got.plt) - L1 - 8
+        0xe5bcf000, //     ldr pc, [ip, #0x00000NNN] Offset(&(.got.plt) - L1 - 8
+    };
+    if (!llvm::isUInt<27>(offset)) {
+      // We cannot encode the Offset, use the long form.
+      writePltLong(buf, sym.getGotPltVA(), pltEntryAddr);
+      return;
+    }
+    write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff));
+    write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff));
+    write32(buf + 8, pltData[2] | (offset & 0xfff));
+    memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary
+  } else {
+    uint64_t offset = sym.getGotPltVA() - pltEntryAddr - 12;
+    assert(llvm::isUInt<32>(offset) && "This should always fit into a 32-bit offset");
+
+    // A PLT entry will be:
+    //
+    //       movw ip, #<lower 16 bits>
+    //       movt ip, #<upper 16 bits>
+    //       add ip, pc
+    //   L1: ldr.w pc, [ip]
+    //       b L1
+    //
+    // where ip = r12 = 0xc
+    //
+    constexpr uint32_t pltData[] = {
+        0x0c00f240, //     movw ip, <offset lower 16>
+        0x0c00f2c0, //     movt ip, <offset higher 16>
+    };
+
+    // movw encoding:
+    //
+    //   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+    //   1  1  1  1  0  i  1 0 0 1 0 0 imm4
+    //
+    //   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+    //   0  imm3     Rd0       imm8
+    //
+    //   imm16 = imm4:i:imm3:imm8, i = bit 11
+    //
+    uint16_t offset_lower = offset & 0xffff;
+    uint32_t movwImm8 = offset_lower & 0xff;
+    uint32_t movwImm3 = (offset_lower >> 8) & 0x7;
+    uint32_t movwI = (offset_lower >> 11) & 0x1;
+    uint32_t movwImm4 = (offset_lower >> 12) & 0xf;
+    uint32_t movwBits = (movwI << 10) | (movwImm4 << 0) | (movwImm3 << 28) | (movwImm8 << 16);
+    write32(buf + 0, pltData[0] | movwBits);
+
+    // movt encoding:
+    //
+    //   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+    //   1  1  1  1  0  i  1 0 1 1 0 0 imm4
+    //
+    //   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+    //   0  imm3     Rd0       imm8
+    //
+    //   imm16 = imm4:i:imm3:imm8, i = bit 11
+    //
+    uint16_t offset_upper = static_cast<uint16_t>(offset >> 16);
+    uint32_t movtImm8 = offset_upper & 0xff;
+    uint32_t movtImm3 = (offset_upper >> 8) & 0x7;
+    uint32_t movtI = (offset_upper >> 11) & 0x1;
+    uint32_t movtImm4 = (offset_upper >> 12) & 0xf;
+    uint32_t movtBits = (movtI << 10) | (movtImm4 << 0) | (movtImm3 << 28) | (movtImm8 << 16);
+    write32(buf + 4, pltData[1] | movtBits);
+
+    write16(buf + 8, 0x44fc);       // add ip, pc
+    write32(buf + 10, 0xf000f8dc);  // ldr.w   pc, [ip]
+    write16(buf + 14, 0xe7fc);      // Branch to previous instruction
   }
-  write32(buf + 0, pltData[0] | ((offset >> 20) & 0xff));
-  write32(buf + 4, pltData[1] | ((offset >> 12) & 0xff));
-  write32(buf + 8, pltData[2] | (offset & 0xfff));
-  memcpy(buf + 12, trapInstr.data(), 4); // Pad to 16-byte boundary
 }
 
 void ARM::addPltSymbols(InputSection &isec, uint64_t off) const {
-  addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec);
-  addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec);
+  if (!config->armThumbPLTs) {
+    addSyntheticLocal("$a", STT_NOTYPE, off, 0, isec);
+    addSyntheticLocal("$d", STT_NOTYPE, off + 12, 0, isec);
+  } else {
+    addSyntheticLocal("$t", STT_NOTYPE, off, 0, isec);
+  }
 }
 
 bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
@@ -337,7 +433,7 @@ bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file,
   case R_ARM_THM_JUMP24:
     // Source is Thumb, all PLT entries are ARM so interworking is required.
     // Otherwise we need to interwork if STT_FUNC Symbol has bit 0 clear (ARM).
-    if (expr == R_PLT_PC || (s.isFunc() && (s.getVA() & 1) == 0))
+    if ((expr == R_PLT_PC && !config->armThumbPLTs) || (s.isFunc() && (s.getVA() & 1) == 0))
       return true;
     [[fallthrough]];
   case R_ARM_THM_CALL: {
@@ -547,7 +643,6 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
     // STT_FUNC we choose whether to write a BL or BLX depending on the
     // value of bit 0 of Val. With bit 0 == 1 denoting Thumb. If the symbol is
     // not of type STT_FUNC then we must preserve the original instruction.
-    // PLT entries are always ARM state so we know we don't need to interwork.
     assert(rel.sym); // R_ARM_CALL is always reached via relocate().
     bool bit0Thumb = val & 1;
     bool isBlx = (read32(loc) & 0xfe000000) == 0xfa000000;
@@ -606,12 +701,13 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
     // PLT entries are always ARM state so we know we need to interwork.
     assert(rel.sym); // R_ARM_THM_CALL is always reached via relocate().
     bool bit0Thumb = val & 1;
+    bool useThumb = bit0Thumb || config->armThumbPLTs;
     bool isBlx = (read16(loc + 2) & 0x1000) == 0;
     // lld 10.0 and before always used bit0Thumb when deciding to write a BLX
-    // even when type not STT_FUNC. PLT entries generated by LLD are always ARM.
-    if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == bit0Thumb)
+    // even when type not STT_FUNC.
+    if (!rel.sym->isFunc() && !rel.sym->isInPlt() && isBlx == useThumb)
       stateChangeWarning(loc, rel.type, *rel.sym);
-    if (rel.sym->isFunc() || rel.sym->isInPlt() ? !bit0Thumb : isBlx) {
+    if ((rel.sym->isFunc() || rel.sym->isInPlt()) ? !useThumb : isBlx) {
       // We are writing a BLX. Ensure BLX destination is 4-byte aligned. As
       // the BLX instruction may only be two byte aligned. This must be done
       // before overflow check.
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
@@ -212,6 +212,7 @@ struct Config {
   bool allowMultipleDefinition;
   bool fatLTOObjects;
   bool androidPackDynRelocs = false;
+  bool armThumbPLTs = false;
   bool armHasBlx = false;
   bool armHasMovtMovw = false;
   bool armJ1J2BranchEncoding = false;
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
@@ -194,6 +194,18 @@ static void updateSupportedARMFeatures(const ARMAttributeParser &attributes) {
   if (arch >= ARMBuildAttrs::CPUArch::v8_M_Base &&
       profile == ARMBuildAttrs::MicroControllerProfile)
     config->armCMSESupport = true;
+
+  // The thumb PLT entries require Thumb2 which can be used on multiple archs.
+  // For now, let's limit it to ones where ARM isn't available and we know have
+  // Thumb2.
+  std::optional<unsigned> armISA =
+      attributes.getAttributeValue(ARMBuildAttrs::ARM_ISA_use);
+  std::optional<unsigned> thumb =
+      attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use);
+  bool noArmISA = !armISA || *armISA == ARMBuildAttrs::Not_Allowed;
+  bool hasThumb2 = thumb && *thumb >= ARMBuildAttrs::AllowThumb32;
+  if (noArmISA && hasThumb2)
+    config->armThumbPLTs = true;
 }
 
 InputFile::InputFile(Kind k, MemoryBufferRef m)
diff --git a/lld/test/ELF/armv8-thumb-plt-reloc.s b/lld/test/ELF/armv8-thumb-plt-reloc.s
@@ -0,0 +1,121 @@
+// REQUIRES: arm
+// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1
+// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %s -o %t2
+// RUN: ld.lld %t1 %t2 -o %t
+// RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s
+// RUN: ld.lld -shared %t1 %t2 -o %t.so
+// RUN: llvm-objdump --no-print-imm-hex -d %t.so | FileCheck --check-prefix=DSO %s
+// RUN: llvm-readobj -S -r %t.so | FileCheck -check-prefix=DSOREL %s
+
+// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %p/Inputs/arm-plt-reloc.s -o %t1
+// RUN: llvm-mc -filetype=obj -arm-add-build-attributes --arch=thumb --mcpu=cortex-m33 %s -o %t2
+// RUN: ld.lld %t1 %t2 -o %t
+// RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s
+// RUN: ld.lld -shared %t1 %t2 -o %t.so
+// RUN: llvm-objdump --no-print-imm-hex -d %t.so | FileCheck --check-prefix=DSO %s
+// RUN: llvm-readobj -S -r %t.so | FileCheck -check-prefix=DSOREL %s
+
+// RUN: ld.lld --be8 %t1 %t2 -o %t
+// RUN: llvm-objdump --no-print-imm-hex -d %t | FileCheck %s
+// RUN: ld.lld --be8 -shared %t1 %t2 -o %t.so
+// RUN: llvm-objdump --no-print-imm-hex -d %t.so | FileCheck --check-prefix=DSO %s
+// RUN: llvm-readobj -S -r %t.so | FileCheck -check-prefix=DSOREL %s
+
+// Test PLT entry generation
+ .text
+ .align 2
+ .globl _start
+ .type  _start,%function
+_start:
+// FIXME, interworking is only supported for BL via BLX at the moment, when
+// interworking thunks are available for b.w and b<cond>.w this can be altered
+// to test the different forms of interworking.
+ bl func1
+ bl func2
+ bl func3
+
+// Executable, expect no PLT
+// CHECK: Disassembly of section .text:
+// CHECK-EMPTY:
+// CHECK-NEXT: <func1>:
+// CHECK-NEXT:   200b4: 4770    bx      lr
+// CHECK: <func2>:
+// CHECK-NEXT:   200b6: 4770    bx      lr
+// CHECK: <func3>:
+// CHECK-NEXT:   200b8: 4770    bx      lr
+// CHECK-NEXT:   200ba: d4d4 
+// CHECK: <_start>:
+// CHECK-NEXT:   200bc: f7ff fffa       bl      0x200b4 <func1>
+// CHECK-NEXT:   200c0: f7ff fff9       bl      0x200b6 <func2>
+// CHECK-NEXT:   200c4: f7ff fff8       bl      0x200b8 <func3>
+
+// Expect PLT entries as symbols can be preempted
+// .text is Thumb and .plt is ARM, llvm-objdump can currently only disassemble
+// as ARM or Thumb. Work around by disassembling twice.
+// DSO: Disassembly of section .text:
+// DSO-EMPTY:
+// DSO-NEXT: <func1>:
+// DSO-NEXT:     10214:     4770    bx      lr
+// DSO: <func2>:
+// DSO-NEXT:     10216:     4770    bx      lr
+// DSO: <func3>:
+// DSO-NEXT:     10218:     4770    bx      lr
+// DSO-NEXT:     1021a:     d4d4 
+// DSO: <_start>:
+// 0x10250 = PLT func1
+// DSO-NEXT:     1021c:     f000 f818       bl     0x10250
+// 0x10260 = PLT func2
+// DSO-NEXT:     10220:     f000 f81e       bl     0x10260
+// 0x10270 = PLT func3
+// DSO-NEXT:     10224:     f000 f824       bl     0x10270
+// DSO: Disassembly of section .plt:
+// DSO-EMPTY:
+// DSO-NEXT: <.plt>:
+// DSO-NEXT:     10230: b500          push    {lr}
+// DSO-NEXT:     10232: f8df e008     ldr.w   lr, [pc, #8]
+// DSO-NEXT:     10236: 44fe          add     lr, pc
+// DSO-NEXT:     10238: f85e ff08     ldr     pc, [lr, #8]!
+// 0x20098 = .got.plt (0x302D8) - pc (0x10238 = .plt + 8) - 8
+// DSO-NEXT:     1023c: 98 00 02 00   .word   0x00020098
+// DSO-NEXT:     10240: d4 d4 d4 d4   .word   0xd4d4d4d4
+// DSO-NEXT:     10244: d4 d4 d4 d4   .word   0xd4d4d4d4
+// DSO-NEXT:     10248: d4 d4 d4 d4   .word   0xd4d4d4d4
+// DSO-NEXT:     1024c: d4 d4 d4 d4   .word   0xd4d4d4d4
+
+// 136 + 2 << 16 + 0x1025c = 0x302e4 = got entry 1
+// DSO-NEXT:     10250:       f240 0c88     movw    r12, #136
+// DSO-NEXT:     10254:       f2c0 0c02     movt    r12, #2
+// DSO-NEXT:     10258:       44fc          add     r12, pc
+// DSO-NEXT:     1025a:       f8dc f000     ldr.w   pc, [r12]
+// DSO-NEXT:     1025e:       e7fc          b       0x1025a
+// 124 + 2 << 16 + 0x1026c = 0x302e8 = got entry 2
+// DSO-NEXT:     10260:       f240 0c7c     movw    r12, #124
+// DSO-NEXT:     10264:       f2c0 0c02     movt    r12, #2
+// DSO-NEXT:     10268:       44fc          add     r12, pc
+// DSO-NEXT:     1026a:       f8dc f000     ldr.w   pc, [r12]
+// DSO-NEXT:     1026e:       e7fc          b       0x1026a
+// 112 + 2 << 16 + 0x1027c = 0x302ec = got entry 3
+// DSO-NEXT:     10270:       f240 0c70     movw    r12, #112
+// DSO-NEXT:     10274:       f2c0 0c02     movt    r12, #2
+// DSO-NEXT:     10278:       44fc          add     r12, pc
+// DSO-NEXT:     1027a:       f8dc f000     ldr.w   pc, [r12]
+// DSO-NEXT:     1027e:       e7fc          b       0x1027a
+
+// DSOREL:    Name: .got.plt
+// DSOREL-NEXT:    Type: SHT_PROGBITS
+// DSOREL-NEXT:    Flags [
+// DSOREL-NEXT:      SHF_ALLOC
+// DSOREL-NEXT:      SHF_WRITE
+// DSOREL-NEXT:    ]
+// DSOREL-NEXT:    Address: 0x302D8
+// DSOREL-NEXT:    Offset:
+// DSOREL-NEXT:    Size: 24
+// DSOREL-NEXT:    Link:
+// DSOREL-NEXT:    Info:
+// DSOREL-NEXT:    AddressAlignment: 4
+// DSOREL-NEXT:    EntrySize:
+// DSOREL:  Relocations [
+// DSOREL-NEXT:  Section (5) .rel.plt {
+// DSOREL-NEXT:    0x302E4 R_ARM_JUMP_SLOT func1
+// DSOREL-NEXT:    0x302E8 R_ARM_JUMP_SLOT func2
+// DSOREL-NEXT:    0x302EC R_ARM_JUMP_SLOT func3