Skip to content

Commit e0da547

Browse files
committed
[AArch64] Optimize when storing symmetry constants
This change looks for stores of 64-bit constants whose upper and lower 32-bit halves are identical ("symmetric" constants). Such a constant is usually materialized by several 'MOV' instructions and at most one 'ORR'. If such a store is found, only the lower 32-bit constant is materialized, and it is written to both halves with a single 'STP' instruction. For example:
  renamable $x8 = MOVZXi 49370, 0
  renamable $x8 = MOVKXi $x8, 320, 16
  renamable $x8 = ORRXrs $x8, $x8, 32
  STRXui killed renamable $x8, killed renamable $x0, 0
becomes
  $w8 = MOVZWi 49370, 0
  $w8 = MOVKWi $w8, 320, 16
  STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
1 parent 08fe3b9 commit e0da547

File tree

3 files changed

+191
-26
lines changed

3 files changed

+191
-26
lines changed

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,14 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
226226
// Find and merge an index ldr/st instruction into a base ld/st instruction.
227227
bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
228228

229+
// Finds and collapses loads of symmetric constant value.
230+
bool tryFoldSymmetryConstantLoad(MachineBasicBlock::iterator &I,
231+
unsigned Limit);
232+
MachineBasicBlock::iterator
233+
doFoldSymmetryConstantLoad(MachineInstr &MI,
234+
SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
235+
int UpperLoadIdx, int Accumulated);
236+
229237
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
230238

231239
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2443,6 +2451,155 @@ AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
24432451
return E;
24442452
}
24452453

2454+
/// Returns true if \p MI is one of the instructions recognized as part of
/// materializing a 32-bit-symmetric 64-bit constant in \p BaseReg:
///   - MOVZXi whose destination is \p BaseReg,
///   - MOVKXi whose destination and source are both \p BaseReg, or
///   - ORRXrs whose destination and both sources are \p BaseReg and whose
///     fourth operand is the immediate 32,
///     e.g. "renamable $x8 = ORRXrs $x8, $x8, 32".
/// Every other opcode is rejected.
static bool isSymmetricLoadCandidate(MachineInstr &MI, Register BaseReg) {
  // True when none of the first Count operands is a register other than
  // BaseReg. Non-register operands are ignored by this check.
  auto LeadingRegsAreBaseReg = [&MI, BaseReg](unsigned Count) {
    for (unsigned Idx = 0; Idx < Count; ++Idx) {
      // Bind by reference: there is no need to copy the operand to inspect it.
      const MachineOperand &Op = MI.getOperand(Idx);
      if (Op.isReg() && Op.getReg() != BaseReg)
        return false;
    }
    return true;
  };

  switch (MI.getOpcode()) {
  case AArch64::MOVZXi:
    return LeadingRegsAreBaseReg(1);
  case AArch64::MOVKXi:
    return LeadingRegsAreBaseReg(2);
  case AArch64::ORRXrs: {
    // The fourth operand of the ORR must be 32, which is what marks the
    // instruction as duplicating the lower 32 bits into the upper 32 bits.
    const MachineOperand &ShiftOp = MI.getOperand(3);
    return LeadingRegsAreBaseReg(3) && ShiftOp.isImm() &&
           ShiftOp.getImm() == 32;
  }
  default:
    return false;
  }
}
2483+
2484+
/// Replaces the store \p MI with an STPWi that writes the 32-bit sub-register
/// of the stored register twice, after erasing leading entries of \p MIs.
///
/// \param MI            The store being rewritten; it is erased.
/// \param MIs           Instructions collected by the caller. The first
///                      \p UpperLoadIdx entries are erased; when
///                      \p UpperLoadIdx is zero only the first entry is erased
///                      (the caller puts the ORR there in that case).
/// \param UpperLoadIdx  Number of leading entries of \p MIs to erase.
/// \param Accumulated   Not used by this function.
/// \returns The iterator obtained from next_nodbg on the original store,
///          taken before anything is erased.
MachineBasicBlock::iterator AArch64LoadStoreOpt::doFoldSymmetryConstantLoad(
    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
    int UpperLoadIdx, int Accumulated) {
  MachineBasicBlock *ParentMBB = MI.getParent();
  MachineBasicBlock::iterator StoreIt = MI.getIterator();
  // Capture the resume point first; the store itself is erased below.
  MachineBasicBlock::iterator ResumeIt = next_nodbg(StoreIt, ParentMBB->end());

  // A zero index means "remove just the first collected instruction";
  // otherwise remove the first UpperLoadIdx collected instructions.
  const int NumToErase = UpperLoadIdx ? UpperLoadIdx : 1;
  for (int Idx = 0; Idx < NumToErase; ++Idx)
    MIs[Idx]->eraseFromParent();

  const MachineOperand &AddrOp = AArch64InstrInfo::getLdStBaseOp(MI);
  const Register ValueRegX = getLdStRegOp(MI).getReg();
  const Register ValueRegW = TRI->getSubReg(ValueRegX, AArch64::sub_32);
  const unsigned ValueRegState = getRegState(MI.getOperand(0));
  const int StoreOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();

  // The new instruction reuses the store's memory operands and flags, and its
  // immediate is twice the original store's immediate.
  BuildMI(*ParentMBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
      .addReg(ValueRegW, ValueRegState)
      .addReg(ValueRegW, ValueRegState)
      .addReg(AddrOp.getReg(), getRegState(AddrOp))
      .addImm(StoreOffset * 2)
      .setMemRefs(MI.memoperands())
      .setMIFlags(MI.getFlags());

  StoreIt->eraseFromParent();
  return ResumeIt;
}
2520+
2521+
/// Try to replace a 64-bit STRXui of a constant whose upper and lower 32-bit
/// halves are identical with an STPWi that stores the lower half twice.
///
/// Starting at the store \p I, this walks backwards (visiting at most \p Limit
/// non-transient instructions) over the MOVZXi / MOVKXi / "ORRXrs Rd, Rd, Rd,
/// 32" chain that materializes the stored register. It reconstructs the
/// constant and, if it is symmetric and non-zero, hands the instructions that
/// only build the upper half to doFoldSymmetryConstantLoad for removal.
///
/// \returns true if the store was rewritten; \p I then holds the iterator
/// returned by doFoldSymmetryConstantLoad. On failure \p I is left unchanged.
bool AArch64LoadStoreOpt::tryFoldSymmetryConstantLoad(
    MachineBasicBlock::iterator &I, unsigned Limit) {
  MachineInstr &MI = *I;
  if (MI.getOpcode() != AArch64::STRXui)
    return false;

  MachineBasicBlock::iterator B = MI.getParent()->begin();
  if (I == B)
    return false;

  // The fold deletes the instructions that define the upper 32 bits of the
  // stored register, so it is only sound when the store is the last reader of
  // the 64-bit value. Requiring the kill flag is conservative.
  const MachineOperand &ValueOp = getLdStRegOp(MI);
  if (!ValueOp.isReg() || !ValueOp.isKill())
    return false;
  const Register ValueReg = ValueOp.getReg();

  // If the stored register is also the address register, changing how it is
  // materialized would change the address as well.
  const MachineOperand &AddrOp = AArch64InstrInfo::getLdStBaseOp(MI);
  if (AddrOp.isReg() && AddrOp.getReg() == ValueReg)
    return false;

  const MachineOperand &OffsetOp = AArch64InstrInfo::getLdStOffsetOp(MI);
  if (!OffsetOp.isImm())
    return false;

  TypeSize Scale(0U, false), Width(0U, false);
  int64_t MinOffset, MaxOffset;
  if (!AArch64InstrInfo::getMemOpInfo(AArch64::STPWi, Scale, Width, MinOffset,
                                      MaxOffset))
    return false;

  // We replace the STRX instruction, which stores 64 bits, with the STPW
  // instruction, which stores two consecutive 32-bit values. The replacement
  // uses twice the original immediate, so that is what must be in range.
  const int64_t PairOffset = OffsetOp.getImm() * 2;
  if (PairOffset < MinOffset || PairOffset > MaxOffset)
    return false;

  constexpr uint64_t ChunkBits = 0xFFFFULL;
  uint64_t Value = 0;       // Constant bits reconstructed so far.
  uint64_t DefinedBits = 0; // Bits already fixed by a later MOVK/MOVZ.
  bool HasORR = false;
  bool SawCandidate = false;
  bool SawMOVZ = false;
  unsigned Count = 0;
  // Instructions to delete: the ORR (if any) and every upper-half MOVK.
  SmallVector<MachineBasicBlock::iterator> ToErase;

  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  MachineBasicBlock::iterator MBBI = I;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &Def = *MBBI;
    if (!Def.isTransient())
      ++Count;

    if (!isSymmetricLoadCandidate(Def, ValueReg)) {
      // Unrelated instructions may sit inside the chain as long as they
      // neither read nor write the stored register.
      LiveRegUnits::accumulateUsedDefed(Def, ModifiedRegUnits, UsedRegUnits,
                                        TRI);
      if (!ModifiedRegUnits.available(ValueReg) ||
          !UsedRegUnits.available(ValueReg))
        return false;
      continue;
    }

    const unsigned Opc = Def.getOpcode();
    if (Opc == AArch64::ORRXrs) {
      // The ORR is only understood as the final definition before the store;
      // a MOVK (or another ORR) after it would change the value again.
      if (SawCandidate)
        return false;
      SawCandidate = true;
      HasORR = true;
      ToErase.push_back(MBBI);
      continue;
    }
    SawCandidate = true;

    const bool IsMOVZ = Opc == AArch64::MOVZXi;
    // MOVZXi is "Rd, imm, shift"; MOVKXi is "Rd, Rn, imm, shift".
    const unsigned ImmIdx = IsMOVZ ? 1 : 2;
    const MachineOperand &ImmOp = Def.getOperand(ImmIdx);
    const MachineOperand &ShiftOp = Def.getOperand(ImmIdx + 1);
    if (!ImmOp.isImm() || !ShiftOp.isImm())
      return false;

    const uint64_t Shift = static_cast<uint64_t>(ShiftOp.getImm());
    if (Shift > 48)
      return false;

    // We walk backwards, so a chunk that is already defined was written by a
    // later instruction, and that later write is the one that survives.
    const uint64_t ChunkMask = ChunkBits << Shift;
    if ((DefinedBits & ChunkMask) == 0) {
      Value |= (static_cast<uint64_t>(ImmOp.getImm()) & ChunkBits) << Shift;
      DefinedBits |= ChunkMask;
    }

    const bool WritesUpperHalf = Shift >= 32;
    if (IsMOVZ) {
      // The MOVZ starts the chain and must survive the fold; if it only
      // provided upper-half bits, the remaining MOVKs would lose their
      // initial definition.
      if (WritesUpperHalf)
        return false;
      // MOVZ defines the whole register (all other bits are zero), so
      // nothing earlier can contribute to the value.
      SawMOVZ = true;
      break;
    }
    if (WritesUpperHalf)
      ToErase.push_back(MBBI);
  } while (MBBI != B && Count < Limit);

  // Without the defining MOVZ the full constant is unknown.
  if (!SawMOVZ)
    return false;

  // "ORR Rd, Rd, Rd, LSL #32" ORs the lower half into the upper half.
  if (HasORR)
    Value |= Value << 32;

  const uint64_t LowerHalf = Value & 0xffffffffULL;
  const uint64_t UpperHalf = Value >> 32;
  if (Value == 0 || LowerHalf != UpperHalf)
    return false;

  // doFoldSymmetryConstantLoad always erases at least one collected
  // instruction, so never call it with an empty list.
  if (ToErase.empty())
    return false;

  // Every entry of ToErase is to be removed, so pass its size as the count.
  I = doFoldSymmetryConstantLoad(MI, ToErase,
                                 static_cast<int>(ToErase.size()),
                                 static_cast<int>(Value));
  return true;
}
2602+
24462603
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
24472604
MachineBasicBlock::iterator &MBBI) {
24482605
MachineInstr &MI = *MBBI;
@@ -2753,6 +2910,27 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
27532910
++MBBI;
27542911
}
27552912

2913+
// We can optimize a `STRXui` that stores a 64-bit constant whose two 32-bit
2914+
// halves are identical: with `STPWi`, the 32-bit value is materialized only
2915+
// once and stored to both halves.
2916+
// Considering :
2917+
// renamable $x8 = MOVZXi 49370, 0
2918+
// renamable $x8 = MOVKXi $x8, 320, 16
2919+
// renamable $x8 = ORRXrs $x8, $x8, 32
2920+
// STRXui killed renamable $x8, killed renamable $x0, 0
2921+
// Transform :
2922+
// $w8 = MOVZWi 49370, 0
2923+
// $w8 = MOVKWi $w8, 320, 16
2924+
// STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
2925+
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2926+
MBBI != E;) {
2927+
if (isMergeableLdStUpdate(*MBBI) &&
2928+
tryFoldSymmetryConstantLoad(MBBI, UpdateLimit))
2929+
Modified = true;
2930+
else
2931+
++MBBI;
2932+
}
2933+
27562934
return Modified;
27572935
}
27582936

llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,7 @@ define void @test_store_0x1234567812345678(ptr %x) {
9999
; CHECK: // %bb.0:
100100
; CHECK-NEXT: mov x8, #22136 // =0x5678
101101
; CHECK-NEXT: movk x8, #4660, lsl #16
102-
; CHECK-NEXT: orr x8, x8, x8, lsl #32
103-
; CHECK-NEXT: str x8, [x0]
102+
; CHECK-NEXT: stp w8, w8, [x0]
104103
; CHECK-NEXT: ret
105104
store i64 u0x1234567812345678, ptr %x
106105
ret void
@@ -111,8 +110,7 @@ define void @test_store_0xff3456ffff3456ff(ptr %x) {
111110
; CHECK: // %bb.0:
112111
; CHECK-NEXT: mov x8, #22271 // =0x56ff
113112
; CHECK-NEXT: movk x8, #65332, lsl #16
114-
; CHECK-NEXT: orr x8, x8, x8, lsl #32
115-
; CHECK-NEXT: str x8, [x0]
113+
; CHECK-NEXT: stp w8, w8, [x0]
116114
; CHECK-NEXT: ret
117115
store i64 u0xff3456ffff3456ff, ptr %x
118116
ret void
@@ -166,8 +164,7 @@ define void @test_store_0x0000555500005555(ptr %x) {
166164
; CHECK-LABEL: test_store_0x0000555500005555:
167165
; CHECK: // %bb.0:
168166
; CHECK-NEXT: mov x8, #21845 // =0x5555
169-
; CHECK-NEXT: movk x8, #21845, lsl #32
170-
; CHECK-NEXT: str x8, [x0]
167+
; CHECK-NEXT: stp w8, w8, [x0]
171168
; CHECK-NEXT: ret
172169
store i64 u0x0000555500005555, ptr %x
173170
ret void
@@ -177,8 +174,7 @@ define void @test_store_0x5555000055550000(ptr %x) {
177174
; CHECK-LABEL: test_store_0x5555000055550000:
178175
; CHECK: // %bb.0:
179176
; CHECK-NEXT: mov x8, #1431633920 // =0x55550000
180-
; CHECK-NEXT: movk x8, #21845, lsl #48
181-
; CHECK-NEXT: str x8, [x0]
177+
; CHECK-NEXT: stp w8, w8, [x0]
182178
; CHECK-NEXT: ret
183179
store i64 u0x5555000055550000, ptr %x
184180
ret void
@@ -234,8 +230,7 @@ define void @test_store_0x1234567812345678_offset_range(ptr %x) {
234230
; CHECK: // %bb.0:
235231
; CHECK-NEXT: mov x8, #22136 // =0x5678
236232
; CHECK-NEXT: movk x8, #4660, lsl #16
237-
; CHECK-NEXT: orr x8, x8, x8, lsl #32
238-
; CHECK-NEXT: str x8, [x0, #32]
233+
; CHECK-NEXT: stp w8, w8, [x0, #32]
239234
; CHECK-NEXT: ret
240235
%g = getelementptr i64, ptr %x, i64 4
241236
store i64 u0x1234567812345678, ptr %g
@@ -247,8 +242,7 @@ define void @test_store_0x1234567812345678_offset_min(ptr %x) {
247242
; CHECK: // %bb.0:
248243
; CHECK-NEXT: mov x8, #22136 // =0x5678
249244
; CHECK-NEXT: movk x8, #4660, lsl #16
250-
; CHECK-NEXT: orr x8, x8, x8, lsl #32
251-
; CHECK-NEXT: str x8, [x0]
245+
; CHECK-NEXT: stp w8, w8, [x0]
252246
; CHECK-NEXT: ret
253247
%g = getelementptr i8, ptr %x, i32 0
254248
store i64 u0x1234567812345678, ptr %g
@@ -260,8 +254,7 @@ define void @test_store_0x1234567812345678_offset_max(ptr %x) {
260254
; CHECK: // %bb.0:
261255
; CHECK-NEXT: mov x8, #22136 // =0x5678
262256
; CHECK-NEXT: movk x8, #4660, lsl #16
263-
; CHECK-NEXT: orr x8, x8, x8, lsl #32
264-
; CHECK-NEXT: str x8, [x0, #248]
257+
; CHECK-NEXT: stp w8, w8, [x0, #248]
265258
; CHECK-NEXT: ret
266259
%g = getelementptr i8, ptr %x, i32 248
267260
store i64 u0x1234567812345678, ptr %g

llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,7 @@ body: |
4444
; CHECK-NEXT: {{ $}}
4545
; CHECK-NEXT: renamable $x8 = MOVZXi 49370, 0
4646
; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 320, 16
47-
; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
48-
; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
47+
; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
4948
; CHECK-NEXT: RET undef $lr
5049
renamable $x8 = MOVi64imm 90284035103834330
5150
STRXui killed renamable $x8, killed renamable $x0, 0
@@ -62,8 +61,7 @@ body: |
6261
; CHECK-NEXT: {{ $}}
6362
; CHECK-NEXT: renamable $x8 = MOVZXi 320, 0
6463
; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 49370, 16
65-
; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
66-
; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
64+
; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
6765
; CHECK-NEXT: RET undef $lr
6866
renamable $x8 = MOVi64imm -4550323095879417536
6967
STRXui killed renamable $x8, killed renamable $x0, 0
@@ -79,8 +77,7 @@ body: |
7977
; CHECK: liveins: $x0
8078
; CHECK-NEXT: {{ $}}
8179
; CHECK-NEXT: renamable $x8 = MOVZXi 21845, 16
82-
; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 21845, 48
83-
; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
80+
; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
8481
; CHECK-NEXT: RET undef $lr
8582
renamable $x8 = MOVZXi 21845, 16
8683
renamable $x8 = MOVKXi $x8, 21845, 48
@@ -98,8 +95,7 @@ body: |
9895
; CHECK-NEXT: {{ $}}
9996
; CHECK-NEXT: renamable $x8 = MOVZXi 22136, 0
10097
; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 4660, 16
101-
; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
102-
; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
98+
; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
10399
; CHECK-NEXT: RET undef $lr
104100
renamable $x8 = MOVZXi 22136, 0
105101
renamable $x8 = MOVKXi $x8, 4660, 16
@@ -118,8 +114,7 @@ body: |
118114
; CHECK-NEXT: {{ $}}
119115
; CHECK-NEXT: renamable $x8 = MOVZXi 22136, 0
120116
; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 4660, 16
121-
; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
122-
; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 31
117+
; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 62
123118
; CHECK-NEXT: RET undef $lr
124119
renamable $x8 = MOVZXi 22136, 0
125120
renamable $x8 = MOVKXi $x8, 4660, 16
@@ -138,8 +133,7 @@ body: |
138133
; CHECK-NEXT: {{ $}}
139134
; CHECK-NEXT: renamable $x8 = MOVZXi 22136, 0
140135
; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 4660, 16
141-
; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
142-
; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
136+
; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
143137
; CHECK-NEXT: RET undef $lr
144138
renamable $x8 = MOVZXi 22136, 0
145139
renamable $x8 = MOVKXi $x8, 4660, 16

0 commit comments

Comments
 (0)