[ARM] Prefer MUL to MULS on some implementations (#112540)

VladiKrapp-Arm · web-flow · commit ea796e5237af · 2024-10-17T13:53:22.000+01:00
MULS adversely affects performance on many implementations. Where this
is the case, we prefer not to shrink MUL to MULS.
diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td
@@ -398,6 +398,13 @@ def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
                                                "AvoidCPSRPartialUpdate", "true",
                                  "Avoid CPSR partial update for OOO execution">;
 
+/// FeatureAvoidMULS - If true, codegen would avoid using the MULS instruction,
+/// preferring the Thumb2 MUL, which doesn't set flags.
+def FeatureAvoidMULS : SubtargetFeature<"avoid-muls",
+                                        "AvoidMULS", "true",
+                                 "Avoid MULS instructions for M class cores">;
+
+
 /// Disable +1 predication cost for instructions updating CPSR.
 /// Enabled for Cortex-A57.
 /// True if disable +1 predication cost for instructions updating CPSR. Enabled for Cortex-A57.
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td
@@ -360,6 +360,7 @@ def : ProcessorModel<"cortex-m33", CortexM4Model,       [ARMv8mMainline,
                                                          FeatureHasSlowFPVFMx,
                                                          FeatureUseMISched,
                                                          FeatureHasNoBranchPredictor,
+                                                         FeatureAvoidMULS,
                                                          FeatureFixCMSE_CVE_2021_35465]>;
 
 def : ProcessorModel<"star-mc1", CortexM4Model,         [ARMv8mMainline,
diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -755,6 +755,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
   Register Reg1 = MI->getOperand(1).getReg();
   // t2MUL is "special". The tied source operand is second, not first.
   if (MI->getOpcode() == ARM::t2MUL) {
+    // MULS can be slower than MUL on some cores; keep t2MUL unless minimizing size.
+    if (!MinimizeSize && STI->avoidMULS())
+      return false;
     Register Reg2 = MI->getOperand(2).getReg();
     // Early exit if the regs aren't all low regs.
     if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1)
diff --git a/llvm/test/CodeGen/Thumb2/avoidmuls.mir b/llvm/test/CodeGen/Thumb2/avoidmuls.mir
@@ -1,67 +1,20 @@
-# RUN: llc -run-pass=thumb2-reduce-size %s -o - | FileCheck %s
+# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m33 -run-pass=thumb2-reduce-size %s -o - | FileCheck %s --check-prefix=MUL
+# RUN: llc -mtriple=thumbv7m-none-eabi --run-pass=thumb2-reduce-size %s -o - | FileCheck %s --check-prefix=MULS
 
---- |
-  target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-  target triple = "thumbv8m.main-arm-none-eabi"
-
-  ; Function Attrs: norecurse nounwind readnone
-  define i32 @test(i32 %x, i32 %y) local_unnamed_addr #0 {
-  entry:
-    %cmp6 = icmp sgt i32 %y, 0
-    br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
-
-  for.body.preheader:                               ; preds = %entry
-    br label %for.body
-
-  for.cond.cleanup:                                 ; preds = %for.body, %entry
-    %sum.0.lcssa = phi i32 [ 1, %entry ], [ %mul, %for.body ]
-    ret i32 %sum.0.lcssa
-
-  for.body:                                         ; preds = %for.body, %for.body.preheader
-    %lsr.iv1 = phi i32 [ %lsr.iv.next2, %for.body ], [ %x, %for.body.preheader ]
-    %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %y, %for.body.preheader ]
-    %sum.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
-    %mul = mul nsw i32 %lsr.iv1, %sum.07
-    %lsr.iv.next = add i32 %lsr.iv, -1
-    %lsr.iv.next2 = add i32 %lsr.iv1, 1
-    %exitcond = icmp eq i32 %lsr.iv.next, 0
-    br i1 %exitcond, label %for.cond.cleanup, label %for.body
-  }
-
-  attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m33" "target-features"="-d32,+dsp,+fp-armv8,-fp64,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-...
 ---
 name:            test
-tracksRegLiveness: true
-liveins:
-  - { reg: '$r0', virtual-reg: '' }
-  - { reg: '$r1', virtual-reg: '' }
 body:             |
-  bb.0.entry:
-    successors: %bb.1.for.body, %bb.2.for.cond.cleanup
-    liveins: $r0, $r1
-
+  bb.0:
     $r2 = tMOVr $r0, 14, _
     $r0 = t2MOVi 1, 14, _, _
-    t2CMPri $r1, 1, 14, _, implicit-def $cpsr
-    t2Bcc %bb.2.for.cond.cleanup, 11, killed $cpsr
-
-  bb.1.for.body:
-    successors: %bb.2.for.cond.cleanup, %bb.1.for.body
-    liveins: $r0, $r1, $r2
-
     $r0 = t2MUL $r2, killed $r0, 14, _
-    $r2 = t2ADDri killed $r2, 1, 14, _, _
-    $r1 = t2SUBri killed $r1, 1, 14, _, def $cpsr
-    t2Bcc %bb.1.for.body, 1, killed $cpsr
-
-  bb.2.for.cond.cleanup:
-    liveins: $r0
-
     tBX_RET 14, _, implicit $r0
 
 ...
-# CHECK-LABEL: test
-# CHECK: tMUL
-# CHECK-NOT: t2MUL
+# MUL-LABEL: test
+# MUL: t2MUL
+# MUL-NOT: tMUL
+
+# MULS-LABEL: test
+# MULS: tMUL
+# MULS-NOT: t2MUL