Skip to content

Commit 1dcab36

Browse files
committed
[AArch64] Optimize when storing symmetry constants
This change looks for sequences that store a symmetric constant built in 32-bit units, usually consisting of several 'MOV' instructions and at most one 'ORR'. When such a sequence is found, only the lower 32-bit constant is materialized; it is then stored to both halves of the destination using a single 'STP' instruction. For example: renamable $x8 = MOVZXi 49370, 0 renamable $x8 = MOVKXi $x8, 320, 16 renamable $x8 = ORRXrs $x8, $x8, 32 STRXui killed renamable $x8, killed renamable $x0, 0 becomes $w8 = MOVZWi 49370, 0 $w8 = MOVKWi $w8, 320, 16 STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
1 parent ed551e3 commit 1dcab36

File tree

3 files changed

+205
-25
lines changed

3 files changed

+205
-25
lines changed

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
201201
// Find and merge a base register updates before or after a ld/st instruction.
202202
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
203203

204+
// Finds and collapses loads of repeated constant values.
205+
bool foldSymmetryConstantLoads(MachineBasicBlock::iterator &I,
206+
unsigned Limit);
207+
MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
208+
MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
209+
int SuccIndex, int Accumulated);
210+
204211
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
205212

206213
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2252,6 +2259,166 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
22522259
return E;
22532260
}
22542261

2262+
// Returns true if MI is one step of a 64-bit symmetric-constant
// materialization sequence (MOVZXi / MOVKXi / ORRXrs) whose register
// operands all refer to BaseReg.
static bool isSymmetric(MachineInstr &MI, Register BaseReg) {
  // Check that the first Count operands, where they are registers, all
  // refer to BaseReg.
  auto MatchBaseReg = [&](unsigned Count) {
    for (unsigned I = 0; I < Count; I++) {
      const auto &OpI = MI.getOperand(I);
      if (OpI.isReg() && OpI.getReg() != BaseReg)
        return false;
    }
    return true;
  };

  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::MOVZXi:
    return MatchBaseReg(1);
  case AArch64::MOVKXi:
    return MatchBaseReg(2);
  case AArch64::ORRXrs: {
    const MachineOperand &Imm = MI.getOperand(3);
    // Fourth operand of ORR must be 32, which means the low 32 bits are
    // copied into the high 32 bits: a 32-bit symmetric constant load.
    // ex) renamable $x8 = ORRXrs $x8, $x8, 32
    return MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == 32;
  }
  }
}
2290+
2291+
// Rewrites a matched 64-bit symmetric-constant materialization followed by
// STRXui into a 32-bit materialization followed by STPWi (storing the same
// W register twice).
//
// \param MI          the STRXui store instruction.
// \param MIs         constant-forming instructions, collected bottom-up.
// \param SuccIndex   number of leading entries of MIs that form the match.
// \param Accumulated the materialized constant; its low and high 32-bit
//                    halves are equal. NOTE(review): declared `int` while
//                    the caller accumulates into uint64_t — the low 32 bits
//                    suffice here, but confirm the signed truncation is
//                    intended.
// \returns iterator to the next instruction to visit.
MachineBasicBlock::iterator AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
    int SuccIndex, int Accumulated) {
  MachineBasicBlock::iterator I = MI.getIterator();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  MachineBasicBlock::iterator FirstMovI;
  MachineBasicBlock *MBB = MI.getParent();
  uint64_t Mask = 0xFFFFUL;
  int Index = 0;

  // Erase every instruction of the matched sequence except the first one
  // (entry SuccIndex - 1, the earliest in program order), which is kept as
  // the insertion point for the replacement 32-bit materialization.
  for (auto It = MIs.begin(), End = MIs.end(); It != End; ++It, Index++) {
    if (Index == SuccIndex - 1) {
      FirstMovI = *It;
      break;
    }
    (*It)->eraseFromParent();
  }

  Register DstRegW =
      TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
  // Build the low 32 bits with MOVZWi plus an optional MOVKWi, skipping any
  // 16-bit chunk that is zero.
  int Lower = Accumulated & Mask;
  if (Lower) {
    BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
            TII->get(AArch64::MOVZWi), DstRegW)
        .addImm(Lower)
        .addImm(0);
    Lower = Accumulated >> 16 & Mask;
    if (Lower) {
      BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
              TII->get(AArch64::MOVKWi), DstRegW)
          .addUse(DstRegW)
          .addImm(Lower)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
    }
  } else {
    // Low 16 bits are zero; a single shifted MOVZWi suffices.
    Lower = Accumulated >> 16 & Mask;
    BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
            TII->get(AArch64::MOVZWi), DstRegW)
        .addImm(Lower)
        .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
  }
  FirstMovI->eraseFromParent();

  // Replace the STRXui with an STPWi that stores the W sub-register twice,
  // reusing the original base register, offset, mem-operands and flags.
  Register BaseReg = getLdStRegOp(MI).getReg();
  const MachineOperand &MO = AArch64InstrInfo::getLdStBaseOp(MI);
  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
  unsigned DstRegState = getRegState(MI.getOperand(0));
  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
      .addReg(DstRegW, DstRegState)
      .addReg(DstRegW, DstRegState)
      .addReg(MO.getReg(), getRegState(MO))
      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
      .setMemRefs(MI.memoperands())
      .setMIFlags(MI.getFlags());
  I->eraseFromParent();

  return NextI;
}
2349+
2350+
// Scans backwards from a STRXui for the MOVZXi/MOVKXi/ORRXrs sequence that
// materialized the stored 64-bit constant. If at some point the accumulated
// constant has equal 32-bit halves, the sequence is rewritten as a 32-bit
// materialization and the store is replaced with STPWi.
//
// \param I     iterator at the candidate store; on success it is updated to
//              the next instruction to visit.
// \param Limit maximum number of non-transient instructions to scan.
// \returns true if the transformation was applied.
bool AArch64LoadStoreOpt::foldSymmetryConstantLoads(
    MachineBasicBlock::iterator &I, unsigned Limit) {
  MachineInstr &MI = *I;
  if (MI.getOpcode() != AArch64::STRXui)
    return false;

  MachineBasicBlock::iterator MBBI = I;
  MachineBasicBlock::iterator B = I->getParent()->begin();
  if (MBBI == B)
    return false;

  Register BaseReg = getLdStRegOp(MI).getReg();
  unsigned Count = 0, SuccIndex = 0, DupBitSize = 0;
  SmallVector<MachineBasicBlock::iterator> MIs;
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  uint64_t IValue, IShift, Accumulated = 0, Mask = 0xFFFFUL;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &CurMI = *MBBI;
    if (!CurMI.isTransient())
      ++Count;
    if (!isSymmetric(CurMI, BaseReg)) {
      // Unrelated instruction: the scan may continue only while it neither
      // defines nor uses the stored register.
      LiveRegUnits::accumulateUsedDefed(CurMI, ModifiedRegUnits, UsedRegUnits,
                                        TRI);
      if (!ModifiedRegUnits.available(BaseReg) ||
          !UsedRegUnits.available(BaseReg))
        break;
      continue;
    }

    unsigned Opc = CurMI.getOpcode();
    if (Opc == AArch64::ORRXrs) {
      // ORRXrs ..., 32 duplicates the low half into the high half; record it
      // and apply the duplication once the base MOVZXi is reached.
      DupBitSize = 32;
      MIs.push_back(MBBI);
      continue;
    }
    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
    const MachineOperand &Value = CurMI.getOperand(ValueOrder);
    const MachineOperand &Shift = CurMI.getOperand(ValueOrder + 1);
    if (!Value.isImm() || !Shift.isImm())
      return false;

    IValue = Value.getImm();
    IShift = Shift.getImm();
    // Overwrite the 16-bit chunk at IShift with IValue.
    Accumulated -= (Accumulated & (Mask << IShift));
    Accumulated += (IValue << IShift);
    // We assume that 64bit constant loading starts with MOVZXi
    // ex)
    // renamable $x8 = MOVZXi 49370, 0
    // renamable $x8 = MOVKXi $x8, 320, 16
    // renamable $x8 = ORRXrs $x8, $x8, 32
    if (Opc == AArch64::MOVZXi && DupBitSize) {
      Accumulated |= Accumulated << DupBitSize;
      DupBitSize = 0;
    }

    MIs.push_back(MBBI);
    // Remember the point at which the accumulated constant became symmetric.
    if (Accumulated != 0 &&
        (Accumulated >> 32) == (Accumulated & 0xffffffffULL))
      SuccIndex = MIs.size();
  } while (MBBI != B && Count < Limit);

  if (SuccIndex) {
    I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
    return true;
  }

  return false;
}
2421+
22552422
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
22562423
MachineBasicBlock::iterator &MBBI) {
22572424
MachineInstr &MI = *MBBI;
@@ -2518,6 +2685,26 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
25182685
++MBBI;
25192686
}
25202687

2688+
// We have an opportunity to optimize the `STRXui` instruction when it stores
2689+
// a 64-bit value whose two 32-bit halves are equal. The `STPWi` instruction
2690+
// allows us to materialize the 32-bit value only once.
2691+
// Considering :
2692+
// renamable $x8 = MOVZXi 49370, 0
2693+
// renamable $x8 = MOVKXi $x8, 320, 16
2694+
// renamable $x8 = ORRXrs $x8, $x8, 32
2695+
// STRXui killed renamable $x8, killed renamable $x0, 0
2696+
// Transform :
2697+
// $w8 = MOVZWi 49370, 0
2698+
// $w8 = MOVKWi $w8, 320, 16
2699+
// STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
2700+
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2701+
MBBI != E;) {
2702+
if (foldSymmetryConstantLoads(MBBI, UpdateLimit))
2703+
Modified = true;
2704+
else
2705+
++MBBI;
2706+
}
2707+
25212708
return Modified;
25222709
}
25232710

llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,9 @@ define i64 @testuu0xf555f555f555f555() {
9797
define void @test_store_0x1234567812345678(ptr %x) {
9898
; CHECK-LABEL: test_store_0x1234567812345678:
9999
; CHECK: // %bb.0:
100-
; CHECK-NEXT: mov x8, #22136 // =0x5678
101-
; CHECK-NEXT: movk x8, #4660, lsl #16
102-
; CHECK-NEXT: orr x8, x8, x8, lsl #32
103-
; CHECK-NEXT: str x8, [x0]
100+
; CHECK-NEXT: mov w8, #22136 // =0x5678
101+
; CHECK-NEXT: movk w8, #4660, lsl #16
102+
; CHECK-NEXT: stp w8, w8, [x0]
104103
; CHECK-NEXT: ret
105104
store i64 u0x1234567812345678, ptr %x
106105
ret void
@@ -109,10 +108,9 @@ define void @test_store_0x1234567812345678(ptr %x) {
109108
define void @test_store_0xff3456ffff3456ff(ptr %x) {
110109
; CHECK-LABEL: test_store_0xff3456ffff3456ff:
111110
; CHECK: // %bb.0:
112-
; CHECK-NEXT: mov x8, #22271 // =0x56ff
113-
; CHECK-NEXT: movk x8, #65332, lsl #16
114-
; CHECK-NEXT: orr x8, x8, x8, lsl #32
115-
; CHECK-NEXT: str x8, [x0]
111+
; CHECK-NEXT: mov w8, #22271 // =0x56ff
112+
; CHECK-NEXT: movk w8, #65332, lsl #16
113+
; CHECK-NEXT: stp w8, w8, [x0]
116114
; CHECK-NEXT: ret
117115
store i64 u0xff3456ffff3456ff, ptr %x
118116
ret void
@@ -165,9 +163,8 @@ define void @test_store_0x0000555555555555(ptr %x) {
165163
define void @test_store_0x0000555500005555(ptr %x) {
166164
; CHECK-LABEL: test_store_0x0000555500005555:
167165
; CHECK: // %bb.0:
168-
; CHECK-NEXT: mov x8, #21845 // =0x5555
169-
; CHECK-NEXT: movk x8, #21845, lsl #32
170-
; CHECK-NEXT: str x8, [x0]
166+
; CHECK-NEXT: mov w8, #21845 // =0x5555
167+
; CHECK-NEXT: stp w8, w8, [x0]
171168
; CHECK-NEXT: ret
172169
store i64 u0x0000555500005555, ptr %x
173170
ret void
@@ -176,9 +173,8 @@ define void @test_store_0x0000555500005555(ptr %x) {
176173
define void @test_store_0x5555000055550000(ptr %x) {
177174
; CHECK-LABEL: test_store_0x5555000055550000:
178175
; CHECK: // %bb.0:
179-
; CHECK-NEXT: mov x8, #1431633920 // =0x55550000
180-
; CHECK-NEXT: movk x8, #21845, lsl #48
181-
; CHECK-NEXT: str x8, [x0]
176+
; CHECK-NEXT: mov w8, #1431633920 // =0x55550000
177+
; CHECK-NEXT: stp w8, w8, [x0]
182178
; CHECK-NEXT: ret
183179
store i64 u0x5555000055550000, ptr %x
184180
ret void

llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,9 @@ body: |
4242
; CHECK-LABEL: name: test_fold_repeating_constant_store
4343
; CHECK: liveins: $x0
4444
; CHECK-NEXT: {{ $}}
45-
; CHECK-NEXT: renamable $x8 = MOVZXi 49370, 0
46-
; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 320, 16
47-
; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
48-
; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
45+
; CHECK-NEXT: $w8 = MOVZWi 49370, 0
46+
; CHECK-NEXT: $w8 = MOVKWi $w8, 320, 16
47+
; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
4948
; CHECK-NEXT: RET undef $lr
5049
renamable $x8 = MOVi64imm 90284035103834330
5150
STRXui killed renamable $x8, killed renamable $x0, 0
@@ -60,10 +59,9 @@ body: |
6059
; CHECK-LABEL: name: test_fold_repeating_constant_store_neg
6160
; CHECK: liveins: $x0
6261
; CHECK-NEXT: {{ $}}
63-
; CHECK-NEXT: renamable $x8 = MOVZXi 320, 0
64-
; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 49370, 16
65-
; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
66-
; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
62+
; CHECK-NEXT: $w8 = MOVZWi 320, 0
63+
; CHECK-NEXT: $w8 = MOVKWi $w8, 49370, 16
64+
; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
6765
; CHECK-NEXT: RET undef $lr
6866
renamable $x8 = MOVi64imm -4550323095879417536
6967
STRXui killed renamable $x8, killed renamable $x0, 0
@@ -78,9 +76,8 @@ body: |
7876
; CHECK-LABEL: name: test_fold_repeating_constant_store_16bit_unit
7977
; CHECK: liveins: $x0
8078
; CHECK-NEXT: {{ $}}
81-
; CHECK-NEXT: renamable $x8 = MOVZXi 21845, 16
82-
; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 21845, 48
83-
; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
79+
; CHECK-NEXT: $w8 = MOVZWi 21845, 16
80+
; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
8481
; CHECK-NEXT: RET undef $lr
8582
renamable $x8 = MOVZXi 21845, 16
8683
renamable $x8 = MOVKXi $x8, 21845, 48

0 commit comments

Comments
 (0)