Skip to content

Commit ea796e5

Browse files
[ARM] Prefer MUL to MULS on some implementations (#112540)
MULS adversely affects performance on many implementations. Where this is the case, we prefer not to shrink MUL to MULS.
1 parent 388d7f1 commit ea796e5

File tree

4 files changed

+21
-57
lines changed

4 files changed

+21
-57
lines changed

llvm/lib/Target/ARM/ARMFeatures.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,13 @@ def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
398398
"AvoidCPSRPartialUpdate", "true",
399399
"Avoid CPSR partial update for OOO execution">;
400400

401+
/// FeatureAvoidMULS - If true, codegen would avoid using the MULS instruction,
402+
/// preferring the Thumb2 MUL, which doesn't set flags.
403+
def FeatureAvoidMULS : SubtargetFeature<"avoid-muls",
404+
"AvoidMULS", "true",
405+
"Avoid MULS instructions for M class cores">;
406+
407+
401408
/// Disable +1 predication cost for instructions updating CPSR.
402409
/// Enabled for Cortex-A57.
403410
/// True if the +1 predication cost is disabled for instructions updating CPSR. Enabled for Cortex-A57.

llvm/lib/Target/ARM/ARMProcessors.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@ def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline,
360360
FeatureHasSlowFPVFMx,
361361
FeatureUseMISched,
362362
FeatureHasNoBranchPredictor,
363+
FeatureAvoidMULS,
363364
FeatureFixCMSE_CVE_2021_35465]>;
364365

365366
def : ProcessorModel<"star-mc1", CortexM4Model, [ARMv8mMainline,

llvm/lib/Target/ARM/Thumb2SizeReduction.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -755,6 +755,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
755755
Register Reg1 = MI->getOperand(1).getReg();
756756
// t2MUL is "special". The tied source operand is second, not first.
757757
if (MI->getOpcode() == ARM::t2MUL) {
758+
// MULS can be slower than MUL on some cores; skip the reduction unless optimizing for minimum size.
759+
if (!MinimizeSize && STI->avoidMULS())
760+
return false;
758761
Register Reg2 = MI->getOperand(2).getReg();
759762
// Early exit if the regs aren't all low regs.
760763
if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1)
Lines changed: 10 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,67 +1,20 @@
1-
# RUN: llc -run-pass=thumb2-reduce-size %s -o - | FileCheck %s
1+
# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m33 -run-pass=thumb2-reduce-size %s -o - | FileCheck %s --check-prefix=MUL
2+
# RUN: llc -mtriple=thumbv7m-none-eabi --run-pass=thumb2-reduce-size %s -o - | FileCheck %s --check-prefix=MULS
23

3-
--- |
4-
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
5-
target triple = "thumbv8m.main-arm-none-eabi"
6-
7-
; Function Attrs: norecurse nounwind readnone
8-
define i32 @test(i32 %x, i32 %y) local_unnamed_addr #0 {
9-
entry:
10-
%cmp6 = icmp sgt i32 %y, 0
11-
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
12-
13-
for.body.preheader: ; preds = %entry
14-
br label %for.body
15-
16-
for.cond.cleanup: ; preds = %for.body, %entry
17-
%sum.0.lcssa = phi i32 [ 1, %entry ], [ %mul, %for.body ]
18-
ret i32 %sum.0.lcssa
19-
20-
for.body: ; preds = %for.body, %for.body.preheader
21-
%lsr.iv1 = phi i32 [ %lsr.iv.next2, %for.body ], [ %x, %for.body.preheader ]
22-
%lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %y, %for.body.preheader ]
23-
%sum.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
24-
%mul = mul nsw i32 %lsr.iv1, %sum.07
25-
%lsr.iv.next = add i32 %lsr.iv, -1
26-
%lsr.iv.next2 = add i32 %lsr.iv1, 1
27-
%exitcond = icmp eq i32 %lsr.iv.next, 0
28-
br i1 %exitcond, label %for.cond.cleanup, label %for.body
29-
}
30-
31-
attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m33" "target-features"="-d32,+dsp,+fp-armv8,-fp64,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" }
32-
33-
...
344
---
355
name: test
36-
tracksRegLiveness: true
37-
liveins:
38-
- { reg: '$r0', virtual-reg: '' }
39-
- { reg: '$r1', virtual-reg: '' }
406
body: |
41-
bb.0.entry:
42-
successors: %bb.1.for.body, %bb.2.for.cond.cleanup
43-
liveins: $r0, $r1
44-
7+
bb.0:
458
$r2 = tMOVr $r0, 14, _
469
$r0 = t2MOVi 1, 14, _, _
47-
t2CMPri $r1, 1, 14, _, implicit-def $cpsr
48-
t2Bcc %bb.2.for.cond.cleanup, 11, killed $cpsr
49-
50-
bb.1.for.body:
51-
successors: %bb.2.for.cond.cleanup, %bb.1.for.body
52-
liveins: $r0, $r1, $r2
53-
5410
$r0 = t2MUL $r2, killed $r0, 14, _
55-
$r2 = t2ADDri killed $r2, 1, 14, _, _
56-
$r1 = t2SUBri killed $r1, 1, 14, _, def $cpsr
57-
t2Bcc %bb.1.for.body, 1, killed $cpsr
58-
59-
bb.2.for.cond.cleanup:
60-
liveins: $r0
61-
6211
tBX_RET 14, _, implicit $r0
6312
6413
...
65-
# CHECK-LABEL: test
66-
# CHECK: tMUL
67-
# CHECK-NOT: t2MUL
14+
# MUL-LABEL: test
15+
# MUL: t2MUL
16+
# MUL-NOT: tMUL
17+
18+
# MULS-LABEL: test
19+
# MULS: tMUL
20+
# MULS-NOT: t2MUL

0 commit comments

Comments
 (0)