[ARM] Shrink post-indexed LDR and STR to LDM/STM

James Molloy · James Molloy · commit 87f50aafbc14 · 2016-06-07T12:13:34.000Z
A Thumb-2 post-indexed LDR instruction such as: ldr.w r0, [r1], #4 Can be rewritten as: ldm.n r1!, {r0} LDMs can be more expensive than LDRs on some cores, so this has been enabled only in minsize mode. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@272002 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -116,12 +116,14 @@ namespace {
   { ARM::t2LDRHs, ARM::tLDRHr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
   { ARM::t2LDRSBs,ARM::tLDRSB,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
   { ARM::t2LDRSHs,ARM::tLDRSH,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDR_POST,ARM::tLDMIA_UPD,0,         0,   0,   1,   0,  0,0, 0,1,0 },
   { ARM::t2STRi12,ARM::tSTRi,   ARM::tSTRspi,  5,   8,   1,   0,  0,0, 0,1,0 },
   { ARM::t2STRs,  ARM::tSTRr,   0,             0,   0,   1,   0,  0,0, 0,1,0 },
   { ARM::t2STRBi12,ARM::tSTRBi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
   { ARM::t2STRBs, ARM::tSTRBr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
   { ARM::t2STRHi12,ARM::tSTRHi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
   { ARM::t2STRHs, ARM::tSTRHr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STR_POST,ARM::tSTMIA_UPD,0,         0,   0,   1,   0,  0,0, 0,1,0 },
 
   { ARM::t2LDMIA, ARM::tLDMIA,  0,             0,   0,   1,   1,  1,1, 0,1,0 },
   { ARM::t2LDMIA_RET,0,         ARM::tPOP_RET, 0,   0,   1,   1,  1,1, 0,1,0 },
@@ -423,6 +425,46 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
     HasShift = true;
     OpNum = 4;
     break;
+  case ARM::t2LDR_POST:
+  case ARM::t2STR_POST: {
+    if (!MBB.getParent()->getFunction()->optForMinSize())
+      return false;
+
+    // We're creating a completely different type of load/store - LDM from LDR.
+    // For this reason we can't reuse the logic at the end of this function; we
+    // have to implement the MI building here.
+    bool IsStore = Entry.WideOpc == ARM::t2STR_POST;
+    unsigned Rt = MI->getOperand(IsStore ? 1 : 0).getReg();
+    unsigned Rn = MI->getOperand(IsStore ? 0 : 1).getReg();
+    unsigned Offset = MI->getOperand(3).getImm();
+    unsigned PredImm = MI->getOperand(4).getImm();
+    unsigned PredReg = MI->getOperand(5).getReg();
+    assert(isARMLowRegister(Rt));
+    assert(isARMLowRegister(Rn));
+
+    if (Offset != 4)
+      return false;
+
+    // Add the 16-bit load / store instruction.
+    DebugLoc dl = MI->getDebugLoc();
+    auto MIB = BuildMI(MBB, MI, dl, TII->get(Entry.NarrowOpc1))
+                   .addReg(Rn, RegState::Define)
+                   .addReg(Rn)
+                   .addImm(PredImm)
+                   .addReg(PredReg)
+                   .addReg(Rt, IsStore ? 0 : RegState::Define);
+
+    // Transfer memoperands.
+    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+    // Transfer MI flags.
+    MIB.setMIFlags(MI->getFlags());
+
+    // Kill the old instruction.
+    MI->eraseFromParent();
+    ++NumLdSts;
+    return true;
+  }
   case ARM::t2LDMIA: {
     unsigned BaseReg = MI->getOperand(0).getReg();
     assert(isARMLowRegister(BaseReg));
diff --git a/test/CodeGen/ARM/t2-shrink-ldrpost.ll b/test/CodeGen/ARM/t2-shrink-ldrpost.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7m--linux-gnu"
+
+; CHECK-LABEL: f:
+; CHECK: ldm	r{{[0-9]}}!, {r[[x:[0-9]]]}
+; CHECK: add.w	r[[x]], r[[x]], #3
+; CHECK: stm	r{{[0-9]}}!, {r[[x]]}
+define void @f(i32 %n, i32* nocapture %a, i32* nocapture readonly %b) optsize minsize {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %.lr.ph, %0
+  %i.04 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ]
+  %.03 = phi i32* [ %2, %.lr.ph ], [ %b, %0 ]
+  %.012 = phi i32* [ %5, %.lr.ph ], [ %a, %0 ]
+  %2 = getelementptr inbounds i32, i32* %.03, i32 1
+  %3 = load i32, i32* %.03, align 4
+  %4 = add nsw i32 %3, 3
+  %5 = getelementptr inbounds i32, i32* %.012, i32 1
+  store i32 %4, i32* %.012, align 4
+  %6 = add nsw i32 %i.04, 1
+  %exitcond = icmp eq i32 %6, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+; CHECK-LABEL: f_nominsize:
+; CHECK-NOT: ldm
+define void @f_nominsize(i32 %n, i32* nocapture %a, i32* nocapture readonly %b) optsize {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %.lr.ph, %0
+  %i.04 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ]
+  %.03 = phi i32* [ %2, %.lr.ph ], [ %b, %0 ]
+  %.012 = phi i32* [ %5, %.lr.ph ], [ %a, %0 ]
+  %2 = getelementptr inbounds i32, i32* %.03, i32 1
+  %3 = load i32, i32* %.03, align 4
+  %4 = add nsw i32 %3, 3
+  %5 = getelementptr inbounds i32, i32* %.012, i32 1
+  store i32 %4, i32* %.012, align 4
+  %6 = add nsw i32 %i.04, 1
+  %exitcond = icmp eq i32 %6, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}