
Commit c83f23d

[AArch64] Fix heuristics for folding "lsl" into load/store ops. (#86894)
The existing heuristics assumed that every core behaves like an Apple A7, where any extend/shift costs an extra micro-op... but in reality, nothing else behaves like that. On some older Cortex designs, shifts by 1 or 4 cost extra, but all other shifts/extensions are free. On all other cores, as far as I can tell, all shifts/extensions for integer loads are free (i.e. the same cost as an unshifted load).

To reflect this, this patch:

- Enables aggressive folding of shifts into loads by default.
- Removes the old AddrLSLFast feature, since it applies to everything except the A7 (and even if you are explicitly targeting the A7, we want to assume extensions are free, because the code will almost always run on a newer core).
- Adds a new feature, AddrLSLSlow14, that applies specifically to the Cortex cores where shifts by 1 or 4 cost extra.

I didn't add support for AddrLSLSlow14 on the GlobalISel side because it would require a bunch of refactoring to work correctly. Someone can pick this up as a followup.
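To illustrate what is being folded (a hypothetical example, not part of the commit; it mirrors the gep3 test updated below), consider accesses whose address contains a scaled index, i.e. an LSL inside the address:

// Hypothetical C++ illustration: both accesses index by i*8, an LSL #3 inside
// each address. With the new default, llc is expected to keep the shift folded
// into the addressing mode, e.g. "ldr x0, [x0, x1, lsl #3]" and
// "str x1, [x8, x1, lsl #3]", instead of materializing a separate
// "lsl x9, x1, #3" that feeds both memory operations.
long load_then_store(long *p, long i) {
  long v = p[i];
  p[i] = i;
  return v;
}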
1 parent 53fe94a commit c83f23d

14 files changed: +119, -177 lines

llvm/lib/Target/AArch64/AArch64.td

Lines changed: 24 additions & 29 deletions
Large diffs are not rendered by default.

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 17 additions & 12 deletions
@@ -462,7 +462,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
                          SDValue &Offset, SDValue &SignExtend,
                          SDValue &DoShift);
   bool isWorthFoldingALU(SDValue V, bool LSL = false) const;
-  bool isWorthFoldingAddr(SDValue V) const;
+  bool isWorthFoldingAddr(SDValue V, unsigned Size) const;
   bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
                          SDValue &Offset, SDValue &SignExtend);
 
@@ -674,17 +674,22 @@ static bool isWorthFoldingSHL(SDValue V) {
 
 /// Determine whether it is worth to fold V into an extended register addressing
 /// mode.
-bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V) const {
+bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V, unsigned Size) const {
   // Trivial if we are optimizing for code size or if there is only
   // one use of the value.
   if (CurDAG->shouldOptForSize() || V.hasOneUse())
     return true;
-  // If a subtarget has a fastpath LSL we can fold a logical shift into
-  // the addressing mode and save a cycle.
-  if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::SHL &&
-      isWorthFoldingSHL(V))
+
+  // If a subtarget has a slow shift, folding a shift into multiple loads
+  // costs additional micro-ops.
+  if (Subtarget->hasAddrLSLSlow14() && (Size == 2 || Size == 16))
+    return false;
+
+  // Check whether we're going to emit the address arithmetic anyway because
+  // it's used by a non-address operation.
+  if (V.getOpcode() == ISD::SHL && isWorthFoldingSHL(V))
     return true;
-  if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::ADD) {
+  if (V.getOpcode() == ISD::ADD) {
     const SDValue LHS = V.getOperand(0);
     const SDValue RHS = V.getOperand(1);
     if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
@@ -1203,7 +1208,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
   if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
     return false;
 
-  return isWorthFoldingAddr(N);
+  return isWorthFoldingAddr(N, Size);
 }
 
 bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
@@ -1231,7 +1236,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
   }
 
   // Remember if it is worth folding N when it produces extended register.
-  bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
+  bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N, Size);
 
   // Try to match a shifted extend on the RHS.
   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
@@ -1261,7 +1266,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
     Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
                                            MVT::i32);
-    if (isWorthFoldingAddr(LHS))
+    if (isWorthFoldingAddr(LHS, Size))
       return true;
   }
 
@@ -1273,7 +1278,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
     Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
                                            MVT::i32);
-    if (isWorthFoldingAddr(RHS))
+    if (isWorthFoldingAddr(RHS, Size))
      return true;
   }
 
@@ -1343,7 +1348,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
   }
 
   // Remember if it is worth folding N when it produces extended register.
-  bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
+  bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N, Size);
 
   // Try to match a shifted extend on the RHS.
   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
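For readability, here is the rewritten helper assembled from the hunks above; this is a sketch of the post-patch function with the unchanged tail elided, not a verbatim copy of the file. Size is the memory access size in bytes, so on AddrLSLSlow14 cores the halfword (LSL #1) and 128-bit (LSL #4) cases are the ones that decline the fold:

bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V, unsigned Size) const {
  // Trivial if we are optimizing for code size or if there is only
  // one use of the value.
  if (CurDAG->shouldOptForSize() || V.hasOneUse())
    return true;

  // If a subtarget has a slow shift, folding a shift into multiple loads
  // costs additional micro-ops.
  if (Subtarget->hasAddrLSLSlow14() && (Size == 2 || Size == 16))
    return false;

  // Check whether we're going to emit the address arithmetic anyway because
  // it's used by a non-address operation.
  if (V.getOpcode() == ISD::SHL && isWorthFoldingSHL(V))
    return true;
  if (V.getOpcode() == ISD::ADD) {
    const SDValue LHS = V.getOperand(0);
    const SDValue RHS = V.getOperand(1);
    if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
      return true;
    // ... (the remainder of the function is unchanged by this patch)
  }
  // ...
}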

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 1 addition & 1 deletion
@@ -2993,7 +2993,7 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
       return false;
     Shift = AArch64_AM::getShiftValue(Shift);
     if (!OptSize) {
-      if ((Shift != 2 && Shift != 3) || !Subtarget.hasAddrLSLFast())
+      if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
         return false;
       if (avoidSlowSTRQ(MemI))
         return false;
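Spelled out as standalone predicates (a minimal sketch, not the upstream code), the condition flips from an allow-list to a deny-list for the shift amount when not optimizing for size:

// Before: fold only LSL #2 / LSL #3, and only on cores that advertised the
// (now removed) addr-lsl-fast feature.
static bool worthFoldingOld(unsigned Shift, bool AddrLSLFast) {
  return (Shift == 2 || Shift == 3) && AddrLSLFast;
}

// After: folding is assumed free everywhere, and is rejected only on
// addr-lsl-slow-14 cores when the shift is not 2 or 3 (the slow LSL #1 /
// LSL #4 cases).
static bool worthFoldingNew(unsigned Shift, bool AddrLSLSlow14) {
  return Shift == 2 || Shift == 3 || !AddrLSLSlow14;
}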

llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp

Lines changed: 2 additions & 4 deletions
@@ -6907,10 +6907,8 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
       MI.getParent()->getParent()->getFunction().hasOptSize())
     return true;
 
-  // It's better to avoid folding and recomputing shifts when we don't have a
-  // fastpath.
-  if (!STI.hasAddrLSLFast())
-    return false;
+  // FIXME: Consider checking HasAddrLSLSlow14 and HasALULSLFast as
+  // appropriate.
 
   // We have a fastpath, so folding a shift in and potentially computing it
   // many times may be beneficial. Check if this is only used in memory ops.
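One possible shape for the followup mentioned in the FIXME, assuming the memory access size could be plumbed into this helper (it currently is not, which is the refactoring the commit message says was deliberately left undone):

// Hypothetical sketch only: AccessSizeInBytes is not available in the current
// signature of isWorthFoldingIntoExtendedReg and would need to be threaded
// through from the memory operation being matched.
if (STI.hasAddrLSLSlow14() &&
    (AccessSizeInBytes == 2 || AccessSizeInBytes == 16))
  return false;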

llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir

Lines changed: 5 additions & 7 deletions
@@ -15,7 +15,7 @@
   define void @mul_wrong_pow_2(ptr %addr) { ret void }
   define void @more_than_one_use_shl_1(ptr %addr) { ret void }
   define void @more_than_one_use_shl_2(ptr %addr) { ret void }
-  define void @more_than_one_use_shl_lsl_fast(ptr %addr) #1 { ret void }
+  define void @more_than_one_use_shl_lsl_fast(ptr %addr) { ret void }
   define void @more_than_one_use_shl_lsl_slow(ptr %addr) { ret void }
   define void @more_than_one_use_shl_minsize(ptr %addr) #0 { ret void }
   define void @ldrwrox(ptr %addr) { ret void }
@@ -24,7 +24,6 @@
   define void @ldbbrox(ptr %addr) { ret void }
   define void @ldrqrox(ptr %addr) { ret void }
   attributes #0 = { optsize }
-  attributes #1 = { "target-features"="+addr-lsl-fast" }
 ...
 
 ---
@@ -478,11 +477,10 @@ body: |
     ; CHECK: liveins: $x0, $x1, $x2
     ; CHECK-NEXT: {{ $}}
    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
-    ; CHECK-NEXT: [[ADDXrs:%[0-9]+]]:gpr64common = ADDXrs [[COPY1]], [[COPY]], 3
-    ; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr)
-    ; CHECK-NEXT: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr)
-    ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[LDRXui1]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+    ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr)
+    ; CHECK-NEXT: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr)
+    ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]]
     ; CHECK-NEXT: $x2 = COPY [[ADDXrr]]
     ; CHECK-NEXT: RET_ReallyLR implicit $x2
     %0:gpr(s64) = COPY $x0

llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll

Lines changed: 36 additions & 76 deletions
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK0
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-slow-14 | FileCheck %s --check-prefixes=CHECK,CHECK0
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK3
 
 %struct.a = type [256 x i16]
 %struct.b = type [256 x i32]
@@ -49,36 +49,20 @@ define i16 @halfword(ptr %ctx, i32 %xor72) nounwind {
 }
 
 define i32 @word(ptr %ctx, i32 %xor72) nounwind {
-; CHECK0-LABEL: word:
-; CHECK0:       // %bb.0:
-; CHECK0-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK0-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK0-NEXT:    ubfx x8, x1, #9, #8
-; CHECK0-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    mov x19, x0
-; CHECK0-NEXT:    lsl x21, x8, #2
-; CHECK0-NEXT:    ldr w20, [x0, x21]
-; CHECK0-NEXT:    bl foo
-; CHECK0-NEXT:    mov w0, w20
-; CHECK0-NEXT:    str w20, [x19, x21]
-; CHECK0-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK0-NEXT:    ret
-;
-; CHECK3-LABEL: word:
-; CHECK3:       // %bb.0:
-; CHECK3-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK3-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK3-NEXT:    ubfx x21, x1, #9, #8
-; CHECK3-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK3-NEXT:    mov x19, x0
-; CHECK3-NEXT:    ldr w20, [x0, x21, lsl #2]
-; CHECK3-NEXT:    bl foo
-; CHECK3-NEXT:    mov w0, w20
-; CHECK3-NEXT:    str w20, [x19, x21, lsl #2]
-; CHECK3-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK3-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK3-NEXT:    ret
+; CHECK-LABEL: word:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ubfx x21, x1, #9, #8
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    ldr w20, [x0, x21, lsl #2]
+; CHECK-NEXT:    bl foo
+; CHECK-NEXT:    mov w0, w20
+; CHECK-NEXT:    str w20, [x19, x21, lsl #2]
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
   %shr81 = lshr i32 %xor72, 9
   %conv82 = zext i32 %shr81 to i64
   %idxprom83 = and i64 %conv82, 255
@@ -90,36 +74,20 @@ define i32 @word(ptr %ctx, i32 %xor72) nounwind {
 }
 
 define i64 @doubleword(ptr %ctx, i32 %xor72) nounwind {
-; CHECK0-LABEL: doubleword:
-; CHECK0:       // %bb.0:
-; CHECK0-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK0-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK0-NEXT:    ubfx x8, x1, #9, #8
-; CHECK0-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    mov x19, x0
-; CHECK0-NEXT:    lsl x21, x8, #3
-; CHECK0-NEXT:    ldr x20, [x0, x21]
-; CHECK0-NEXT:    bl foo
-; CHECK0-NEXT:    mov x0, x20
-; CHECK0-NEXT:    str x20, [x19, x21]
-; CHECK0-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK0-NEXT:    ret
-;
-; CHECK3-LABEL: doubleword:
-; CHECK3:       // %bb.0:
-; CHECK3-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK3-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK3-NEXT:    ubfx x21, x1, #9, #8
-; CHECK3-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK3-NEXT:    mov x19, x0
-; CHECK3-NEXT:    ldr x20, [x0, x21, lsl #3]
-; CHECK3-NEXT:    bl foo
-; CHECK3-NEXT:    mov x0, x20
-; CHECK3-NEXT:    str x20, [x19, x21, lsl #3]
-; CHECK3-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK3-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK3-NEXT:    ret
+; CHECK-LABEL: doubleword:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ubfx x21, x1, #9, #8
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    ldr x20, [x0, x21, lsl #3]
+; CHECK-NEXT:    bl foo
+; CHECK-NEXT:    mov x0, x20
+; CHECK-NEXT:    str x20, [x19, x21, lsl #3]
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
   %shr81 = lshr i32 %xor72, 9
   %conv82 = zext i32 %shr81 to i64
   %idxprom83 = and i64 %conv82, 255
@@ -163,20 +131,12 @@ endbb:
 }
 
 define i64 @gep3(ptr %p, i64 %b) {
-; CHECK0-LABEL: gep3:
-; CHECK0:       // %bb.0:
-; CHECK0-NEXT:    lsl x9, x1, #3
-; CHECK0-NEXT:    mov x8, x0
-; CHECK0-NEXT:    ldr x0, [x0, x9]
-; CHECK0-NEXT:    str x1, [x8, x9]
-; CHECK0-NEXT:    ret
-;
-; CHECK3-LABEL: gep3:
-; CHECK3:       // %bb.0:
-; CHECK3-NEXT:    mov x8, x0
-; CHECK3-NEXT:    ldr x0, [x0, x1, lsl #3]
-; CHECK3-NEXT:    str x1, [x8, x1, lsl #3]
-; CHECK3-NEXT:    ret
+; CHECK-LABEL: gep3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    ldr x0, [x0, x1, lsl #3]
+; CHECK-NEXT:    str x1, [x8, x1, lsl #3]
+; CHECK-NEXT:    ret
   %g = getelementptr inbounds i64, ptr %p, i64 %b
   %l = load i64, ptr %g
   store i64 %b, ptr %g

llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll

Lines changed: 2 additions & 3 deletions
@@ -134,9 +134,8 @@ define void @test8(i64 %a, ptr noalias %src, ptr noalias %dst, i64 %n) {
 ; CHECK-NEXT:    b.hs .LBB7_1
 ; CHECK-NEXT:  // %bb.3: // %if.then
 ; CHECK-NEXT:    // in Loop: Header=BB7_2 Depth=1
-; CHECK-NEXT:    lsl x10, x8, #3
-; CHECK-NEXT:    ldr x11, [x1, x10]
-; CHECK-NEXT:    str x11, [x2, x10]
+; CHECK-NEXT:    ldr x10, [x1, x8, lsl #3]
+; CHECK-NEXT:    str x10, [x2, x8, lsl #3]
 ; CHECK-NEXT:    b .LBB7_1
 ; CHECK-NEXT:  .LBB7_4: // %exit
 ; CHECK-NEXT:    ret

llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ return: ; preds = %if.end23, %if.then3
 }
 
 ; CHECK: @test
-; CHECK-NOT: , uxtw #2]
+; CHECK: , uxtw #2]
 define i32 @test(ptr %array, i8 zeroext %c, i32 %arg) {
 entry:
   %conv = zext i8 %c to i32

llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll

Lines changed: 8 additions & 12 deletions
@@ -201,11 +201,10 @@ define void @fct1_64x1(ptr nocapture %array, i64 %offset) nounwind ssp {
 ; CHECK-LABEL: fct1_64x1:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x8, :got:globalArray64x1
-; CHECK-NEXT:    lsl x9, x1, #3
 ; CHECK-NEXT:    ldr x8, [x8, :got_lo12:globalArray64x1]
-; CHECK-NEXT:    ldr d0, [x0, x9]
+; CHECK-NEXT:    ldr d0, [x0, x1, lsl #3]
 ; CHECK-NEXT:    ldr x8, [x8]
-; CHECK-NEXT:    str d0, [x8, x9]
+; CHECK-NEXT:    str d0, [x8, x1, lsl #3]
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds <1 x i64>, ptr %array, i64 %offset
@@ -238,11 +237,10 @@ define void @fct1_32x2(ptr nocapture %array, i64 %offset) nounwind ssp {
 ; CHECK-LABEL: fct1_32x2:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x8, :got:globalArray32x2
-; CHECK-NEXT:    lsl x9, x1, #3
 ; CHECK-NEXT:    ldr x8, [x8, :got_lo12:globalArray32x2]
-; CHECK-NEXT:    ldr d0, [x0, x9]
+; CHECK-NEXT:    ldr d0, [x0, x1, lsl #3]
 ; CHECK-NEXT:    ldr x8, [x8]
-; CHECK-NEXT:    str d0, [x8, x9]
+; CHECK-NEXT:    str d0, [x8, x1, lsl #3]
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds <2 x i32>, ptr %array, i64 %offset
@@ -275,11 +273,10 @@ define void @fct1_16x4(ptr nocapture %array, i64 %offset) nounwind ssp {
 ; CHECK-LABEL: fct1_16x4:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x8, :got:globalArray16x4
-; CHECK-NEXT:    lsl x9, x1, #3
 ; CHECK-NEXT:    ldr x8, [x8, :got_lo12:globalArray16x4]
-; CHECK-NEXT:    ldr d0, [x0, x9]
+; CHECK-NEXT:    ldr d0, [x0, x1, lsl #3]
 ; CHECK-NEXT:    ldr x8, [x8]
-; CHECK-NEXT:    str d0, [x8, x9]
+; CHECK-NEXT:    str d0, [x8, x1, lsl #3]
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds <4 x i16>, ptr %array, i64 %offset
@@ -312,11 +309,10 @@ define void @fct1_8x8(ptr nocapture %array, i64 %offset) nounwind ssp {
 ; CHECK-LABEL: fct1_8x8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x8, :got:globalArray8x8
-; CHECK-NEXT:    lsl x9, x1, #3
 ; CHECK-NEXT:    ldr x8, [x8, :got_lo12:globalArray8x8]
-; CHECK-NEXT:    ldr d0, [x0, x9]
+; CHECK-NEXT:    ldr d0, [x0, x1, lsl #3]
 ; CHECK-NEXT:    ldr x8, [x8]
-; CHECK-NEXT:    str d0, [x8, x9]
+; CHECK-NEXT:    str d0, [x8, x1, lsl #3]
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds <8 x i8>, ptr %array, i64 %offset

llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll

Lines changed: 5 additions & 6 deletions
@@ -82,13 +82,12 @@ define void @avoid_promotion_2_and(ptr nocapture noundef %arg) {
 ; CHECK-NEXT:    eor w10, w10, w11
 ; CHECK-NEXT:    ldur w11, [x8, #-24]
 ; CHECK-NEXT:    and w10, w10, w14
-; CHECK-NEXT:    ldp x15, x14, [x8, #-16]
-; CHECK-NEXT:    ubfiz x13, x10, #1, #32
+; CHECK-NEXT:    ldp x14, x13, [x8, #-16]
 ; CHECK-NEXT:    str w10, [x8]
-; CHECK-NEXT:    and w10, w11, w12
-; CHECK-NEXT:    ldrh w11, [x14, x13]
-; CHECK-NEXT:    strh w11, [x15, w10, uxtw #1]
-; CHECK-NEXT:    strh w12, [x14, x13]
+; CHECK-NEXT:    and w11, w11, w12
+; CHECK-NEXT:    ldrh w15, [x13, w10, uxtw #1]
+; CHECK-NEXT:    strh w15, [x14, w11, uxtw #1]
+; CHECK-NEXT:    strh w12, [x13, w10, uxtw #1]
 ; CHECK-NEXT:    b LBB1_1
 ; CHECK-NEXT:  LBB1_4: ; %exit
 ; CHECK-NEXT:    ret
