
Commit e4672a0

[AArch64] Optimize when storing symmetry constants
This change looks for sequences that store a symmetric constant, i.e. a 64-bit value whose two 32-bit halves are identical, usually built from several 'MOV's and at most one 'ORR'. When such a sequence is found, only the lower 32-bit constant is materialized, and the store is rewritten as an 'STP' that writes that 32-bit value into both halves. For example:

  renamable $x8 = MOVZXi 49370, 0
  renamable $x8 = MOVKXi $x8, 320, 16
  renamable $x8 = ORRXrs $x8, $x8, 32
  STRXui killed renamable $x8, killed renamable $x0, 0

becomes:

  $w8 = MOVZWi 49370, 0
  $w8 = MOVKWi $w8, 320, 16
  STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
1 parent ed551e3 commit e4672a0
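
The detection works at the value level: the pass folds the 16-bit MOVZ/MOVK chunks into a single accumulator and only fires when the two 32-bit halves are equal, or when only the low half is populated and an 'ORR ..., lsl #32' replicates it. Below is a minimal standalone C++ sketch of that check, assuming the same condition as the patch; the helper name isRepeated32BitConstant and the main() driver are illustrative only and not part of LLVM.

// Standalone illustration: mirrors how tryFoldSymmetryConstantLoad folds
// MOVZ/MOVK immediates into an accumulator and decides that the stored
// 64-bit constant is two identical 32-bit halves.
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Each element is an (immediate, shift) pair taken from a MOVZXi/MOVKXi.
static bool isRepeated32BitConstant(
    const std::vector<std::pair<uint64_t, uint64_t>> &Movs, bool HasOrrLsl32) {
  uint64_t Accumulated = 0, Mask = 0xFFFFULL;
  for (const auto &[Imm, Shift] : Movs) {
    Accumulated &= ~(Mask << Shift); // clear the 16-bit chunk being written
    Accumulated |= (Imm << Shift);   // insert the new chunk
  }
  // Same condition as the patch: either the two 32-bit halves already match,
  // or only the low half is set and an ORR ..., lsl #32 duplicates it.
  return Accumulated != 0 &&
         ((Accumulated >> 32) == (Accumulated & 0xFFFFFFFFULL) ||
          (HasOrrLsl32 && (Accumulated >> 32) == 0));
}

int main() {
  // The commit's example: MOVZXi 49370, 0; MOVKXi 320, 16; ORRXrs ..., 32
  // builds 0x0140C0DA0140C0DA, i.e. MOVi64imm 90284035103834330.
  std::printf("%d\n", isRepeated32BitConstant({{49370, 0}, {320, 16}}, true));    // 1
  std::printf("%d\n", isRepeated32BitConstant({{21845, 16}, {21845, 48}}, false)); // 1
  return 0;
}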

File tree

3 files changed (+199, -25 lines)

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Lines changed: 181 additions & 0 deletions
@@ -201,6 +201,14 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Finds and collapses loads of symmetric constant value.
+  bool tryFoldSymmetryConstantLoad(MachineBasicBlock::iterator &I,
+                                   unsigned Limit);
+  MachineBasicBlock::iterator
+  doFoldSymmetryConstantLoad(MachineInstr &MI,
+                             SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+                             int SuccIndex, int Accumulated);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2252,6 +2260,159 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+static bool isSymmetric(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+    MachineOperand &Imm = MI.getOperand(3);
+    // Fourth operand of ORR must be 32 which mean
+    // 32bit symmetric constant load.
+    // ex) renamable $x8 = ORRXrs $x8, $x8, 32
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == 32)
+      return true;
+  }
+
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::doFoldSymmetryConstantLoad(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  int Index = 0;
+
+  for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+    if (Index == SuccIndex - 1) {
+      FirstMovI = *MI;
+      break;
+    }
+    (*MI)->eraseFromParent();
+  }
+
+  Register DstRegW =
+      TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+  int Lower = Accumulated & Mask;
+  if (Lower) {
+    BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+            TII->get(AArch64::MOVZWi), DstRegW)
+        .addImm(Lower)
+        .addImm(0);
+    Lower = Accumulated >> 16 & Mask;
+    if (Lower) {
+      BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+              TII->get(AArch64::MOVKWi), DstRegW)
+          .addUse(DstRegW)
+          .addImm(Lower)
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+    }
+  } else {
+    Lower = Accumulated >> 16 & Mask;
+    BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+            TII->get(AArch64::MOVZWi), DstRegW)
+        .addImm(Lower)
+        .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+  }
+  FirstMovI->eraseFromParent();
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
+bool AArch64LoadStoreOpt::tryFoldSymmetryConstantLoad(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0;
+  bool hasORR = false;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
+  uint64_t IValue, IShift, Accumulated = 0, Mask = 0xFFFFUL;
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isSymmetric(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        break;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs) {
+      hasORR = true;
+      MIs.push_back(MBBI);
+      continue;
+    }
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    IValue = Value.getImm();
+    IShift = Shift.getImm();
+    Accumulated -= (Accumulated & (Mask << IShift));
+    Accumulated += (IValue << IShift);
+    MIs.push_back(MBBI);
+    if ((Accumulated != 0) &&
+        (((Accumulated >> 32) == (Accumulated & 0xffffffffULL)) ||
+         ((hasORR && Accumulated >> 32 == 0))))
+      SuccIndex = MIs.size();
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = doFoldSymmetryConstantLoad(MI, MIs, SuccIndex, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -2518,6 +2679,26 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }
 
+  // We have an opportunity to optimize the `STRXui` instruction, which loads
+  // the same 32-bit value into a register twice. The `STPXi` instruction allows
+  // us to load a 32-bit value only once.
+  // Considering :
+  //  renamable $x8 = MOVZXi 49370, 0
+  //  renamable $x8 = MOVKXi $x8, 320, 16
+  //  renamable $x8 = ORRXrs $x8, $x8, 32
+  //  STRXui killed renamable $x8, killed renamable $x0, 0
+  // Transform :
+  //  $w8 = MOVZWi 49370, 0
+  //  $w8 = MOVKWi $w8, 320, 16
+  //  STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (tryFoldSymmetryConstantLoad(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }
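
On the rewrite side, doFoldSymmetryConstantLoad re-materializes only the low 32 bits with W-register moves: a non-zero low 16-bit chunk gets a plain MOVZWi (plus a MOVKWi ..., lsl #16 when the next chunk is also non-zero), and a zero low chunk collapses to a single shifted MOVZWi. Here is a small standalone sketch of that branch structure, assuming the same 16-bit chunking as the patch; printReplacementMoves is an illustrative helper, not LLVM API.

// Standalone illustration of the branch structure in doFoldSymmetryConstantLoad:
// which 32-bit moves replace the original 64-bit materialization.
#include <cstdint>
#include <cstdio>

static void printReplacementMoves(uint64_t Accumulated) {
  const uint64_t Mask = 0xFFFFULL;
  uint64_t Lower = Accumulated & Mask;
  if (Lower) {
    std::printf("MOVZWi %llu, 0\n", (unsigned long long)Lower);
    uint64_t Next = (Accumulated >> 16) & Mask;
    if (Next)
      std::printf("MOVKWi %llu, lsl #16\n", (unsigned long long)Next);
  } else {
    // Low 16 bits are zero: a single shifted MOVZWi is enough.
    std::printf("MOVZWi %llu, lsl #16\n",
                (unsigned long long)((Accumulated >> 16) & Mask));
  }
}

int main() {
  printReplacementMoves(0x0140C0DA); // MOVZWi 49370, 0 ; MOVKWi 320, lsl #16
  printReplacementMoves(0x55550000); // MOVZWi 21845, lsl #16
  return 0;
}

The STPWi emitted afterwards stores the same W sub-register twice, which is why the updated tests below show one pair of W moves feeding a single stp/STPWi.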

llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll

Lines changed: 10 additions & 14 deletions
@@ -97,10 +97,9 @@ define i64 @testuu0xf555f555f555f555() {
 define void @test_store_0x1234567812345678(ptr %x) {
 ; CHECK-LABEL: test_store_0x1234567812345678:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #22136 // =0x5678
-; CHECK-NEXT: movk x8, #4660, lsl #16
-; CHECK-NEXT: orr x8, x8, x8, lsl #32
-; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov w8, #22136 // =0x5678
+; CHECK-NEXT: movk w8, #4660, lsl #16
+; CHECK-NEXT: stp w8, w8, [x0]
 ; CHECK-NEXT: ret
   store i64 u0x1234567812345678, ptr %x
   ret void
@@ -109,10 +108,9 @@ define void @test_store_0x1234567812345678(ptr %x) {
 define void @test_store_0xff3456ffff3456ff(ptr %x) {
 ; CHECK-LABEL: test_store_0xff3456ffff3456ff:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #22271 // =0x56ff
-; CHECK-NEXT: movk x8, #65332, lsl #16
-; CHECK-NEXT: orr x8, x8, x8, lsl #32
-; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov w8, #22271 // =0x56ff
+; CHECK-NEXT: movk w8, #65332, lsl #16
+; CHECK-NEXT: stp w8, w8, [x0]
 ; CHECK-NEXT: ret
   store i64 u0xff3456ffff3456ff, ptr %x
   ret void
@@ -165,9 +163,8 @@ define void @test_store_0x0000555555555555(ptr %x) {
 define void @test_store_0x0000555500005555(ptr %x) {
 ; CHECK-LABEL: test_store_0x0000555500005555:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #21845 // =0x5555
-; CHECK-NEXT: movk x8, #21845, lsl #32
-; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov w8, #21845 // =0x5555
+; CHECK-NEXT: stp w8, w8, [x0]
 ; CHECK-NEXT: ret
   store i64 u0x0000555500005555, ptr %x
   ret void
@@ -176,9 +173,8 @@ define void @test_store_0x0000555500005555(ptr %x) {
 define void @test_store_0x5555000055550000(ptr %x) {
 ; CHECK-LABEL: test_store_0x5555000055550000:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #1431633920 // =0x55550000
-; CHECK-NEXT: movk x8, #21845, lsl #48
-; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: mov w8, #1431633920 // =0x55550000
+; CHECK-NEXT: stp w8, w8, [x0]
 ; CHECK-NEXT: ret
   store i64 u0x5555000055550000, ptr %x
   ret void
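
As a sanity check on the updated assembly: each stored constant has two identical 32-bit halves, and the immediates in the new CHECK lines rebuild exactly that half. For u0x1234567812345678, #4660 (0x1234) placed in the upper 16 bits over #22136 (0x5678) gives 0x12345678, and stp w8, w8 writes it twice. A tiny illustrative verification of that arithmetic, not part of the test suite:

// Illustrative check that the immediates in the updated CHECK lines rebuild
// the stored constant u0x1234567812345678 from two identical 32-bit halves.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Half = (4660u << 16) | 22136u;          // movk #4660, lsl #16 over mov #22136
  assert(Half == 0x12345678u);
  uint64_t Stored = ((uint64_t)Half << 32) | Half; // stp w8, w8 stores the half twice
  assert(Stored == 0x1234567812345678ULL);
  return 0;
}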

llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir

Lines changed: 8 additions & 11 deletions
@@ -42,10 +42,9 @@ body: |
     ; CHECK-LABEL: name: test_fold_repeating_constant_store
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: renamable $x8 = MOVZXi 49370, 0
-    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 320, 16
-    ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
-    ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
+    ; CHECK-NEXT: $w8 = MOVZWi 49370, 0
+    ; CHECK-NEXT: $w8 = MOVKWi $w8, 320, 16
+    ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
     ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVi64imm 90284035103834330
     STRXui killed renamable $x8, killed renamable $x0, 0
@@ -60,10 +59,9 @@ body: |
     ; CHECK-LABEL: name: test_fold_repeating_constant_store_neg
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: renamable $x8 = MOVZXi 320, 0
-    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 49370, 16
-    ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
-    ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
+    ; CHECK-NEXT: $w8 = MOVZWi 320, 0
+    ; CHECK-NEXT: $w8 = MOVKWi $w8, 49370, 16
+    ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
     ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVi64imm -4550323095879417536
     STRXui killed renamable $x8, killed renamable $x0, 0
@@ -78,9 +76,8 @@ body: |
     ; CHECK-LABEL: name: test_fold_repeating_constant_store_16bit_unit
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: renamable $x8 = MOVZXi 21845, 16
-    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 21845, 48
-    ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
+    ; CHECK-NEXT: $w8 = MOVZWi 21845, 16
+    ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
     ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVZXi 21845, 16
     renamable $x8 = MOVKXi $x8, 21845, 48
