llvm · ostannard · Aug 9, 2024 · Aug 8, 2024 · Aug 8, 2024 · Aug 8, 2024
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3024,18 +3024,27 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
 
   assert(Subtarget->supportsTailCall());
 
-  // Indirect tail calls cannot be optimized for Thumb1 if the args
-  // to the call take up r0-r3. The reason is that there are no legal registers
-  // left to hold the pointer to the function to be called.
-  // Similarly, if the function uses return address sign and authentication,
-  // r12 is needed to hold the PAC and is not available to hold the callee
-  // address.
-  if (Outs.size() >= 4 &&
-      (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) {
-    if (Subtarget->isThumb1Only())
-      return false;
-    // Conservatively assume the function spills LR.
-    if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true))
+  // Indirect tail-calls require a register to hold the target address. That
+  // register must be:
+  // * Allocatable (i.e. r0-r7 if the target is Thumb1).
+  // * Not callee-saved, so must be one of r0-r3 or r12.
+  // * Not used to hold an argument to the tail-called function, which might be
+  //   in r0-r3.
+  // * Not used to hold the return address authentication code, which is in r12
+  //   if enabled.
+  // Sometimes, no register matches all of these conditions, so we can't do a
+  // tail-call.
+  if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
+    SmallSet<MCPhysReg, 5> AddressRegisters;
+    for (Register R : {ARM::R0, ARM::R1, ARM::R2, ARM::R3})
+      AddressRegisters.insert(R);
+    if (!(Subtarget->isThumb1Only() ||
+          MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
+      AddressRegisters.insert(ARM::R12);
+    for (const CCValAssign &AL : ArgLocs)
+      if (AL.isRegLoc())
+        AddressRegisters.erase(AL.getLocReg());
+    if (AddressRegisters.empty())
       return false;
   }
 

diff --git a/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll b/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll
@@ -0,0 +1,111 @@
+; RUN: llc %s -o - -mtriple=thumbv8m.main -mattr=+vfp4 | FileCheck %s
+
+;; No outgoing arguments, plenty of free registers to hold the target address.
+define void @test0(ptr %fptr) {
+; CHECK-LABEL: test0:
+; CHECK: bx {{r0|r1|r2|r3|r12}}
+entry:
+  tail call void %fptr()
+  ret void
+}
+
+;; Four integer outgoing arguments, which use up r0-r3.
+define void @test1(ptr %fptr) {
+; CHECK-LABEL: test1:
+; CHECK: bx r12
+entry:
+  tail call void %fptr(i32 0, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;; Four integer outgoing arguments, which use up r0-r3, and sign-return-address
+;; uses r12, so we can never tail-call this.
+define void @test2(ptr %fptr) "sign-return-address"="all" {
+; CHECK-LABEL: test2:
+; CHECK: blx
+  entry:
+  tail call void %fptr(i32 0, i32 0, i32 0, i32 0)
+  ret void
+}
+
+;; An i32 and an i64 argument, which uses r0, r2 and r3 for arguments, leaving
+;; r1 free for the address.
+define void @test3(ptr %fptr) {
+; CHECK-LABEL: test3:
+; CHECK: bx {{r1|r12}}
+entry:
+  tail call void %fptr(i32 0, i64 0)
+  ret void
+}
+
+;; Four float arguments, using the soft-float calling convention, which uses
+;; r0-r3.
+define void @test4(ptr %fptr) {
+; CHECK-LABEL: test4:
+; CHECK: bx r12
+entry:
+  tail call arm_aapcscc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0)
+  ret void
+}
+
+;; Four float arguments, using the soft-float calling convention, which uses
+;; r0-r3, and sign-return-address uses r12. Currently fails with "ran out of
+;; registers during register allocation".
+define void @test5(ptr %fptr) "sign-return-address"="all" {
+; CHECK-LABEL: test5:
+; CHECK: blx
+entry:
+  tail call arm_aapcscc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0)
+  ret void
+}
+
+;; Four float arguments, using the hard-float calling convention, which uses
+;; s0-s3, leaving the all of the integer registers free for the address.
+define void @test6(ptr %fptr) {
+; CHECK-LABEL: test6:
+; CHECK: bx {{r0|r1|r2|r3|r12}}
+entry:
+  tail call arm_aapcs_vfpcc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0)
+  ret void
+}
+
+;; Four float arguments, using the hard-float calling convention, which uses
+;; s0-s3, leaving r0-r3 free for the address, with r12 used for
+;; sign-return-address.
+define void @test7(ptr %fptr) "sign-return-address"="all" {
+; CHECK-LABEL: test7:
+; CHECK: bx {{r0|r1|r2|r3}}
+entry:
+  tail call arm_aapcs_vfpcc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0)
+  ret void
+}
+
+;; Two double arguments, using the soft-float calling convention, which uses
+;; r0-r3.
+define void @test8(ptr %fptr) {
+; CHECK-LABEL: test8:
+; CHECK: bx r12
+entry:
+  tail call arm_aapcscc void %fptr(double 0.0, double 0.0)
+  ret void
+}
+
+;; Two double arguments, using the soft-float calling convention, which uses
+;; r0-r3, and sign-return-address uses r12, so we can't tail-call this.
+define void @test9(ptr %fptr) "sign-return-address"="all" {
+; CHECK-LABEL: test9:
+; CHECK: blx
+entry:
+  tail call arm_aapcscc void %fptr(double 0.0, double 0.0)
+  ret void
+}
+
+;; Four integer arguments (one on the stack), but dut to alignment r1 is left
+;; empty, so can be used for the tail-call.
+define void @test10(ptr %fptr, i64 %b, i32 %c) "sign-return-address"="all" {
+; CHECK-LABEL: test10:
+; CHECK: bx r1
+entry:
+  tail call void %fptr(i32 0, i64 %b, i32 %c)
+  ret void
+}