@@ -201,6 +201,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Finds and collapses loads of repeated constant values.
+  bool foldSymmetryConstantLoads(MachineBasicBlock::iterator &I,
+                                 unsigned Limit);
+  MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
+      MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+      int SuccIndex, int Accumulated);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2252,6 +2259,167 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+static bool isSymmetric(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+    MachineOperand &Imm = MI.getOperand(3);
+    // The fourth operand of ORRXrs must be 32, which means a 32-bit
+    // symmetric constant load.
+    // ex) renamable $x8 = ORRXrs $x8, $x8, 32
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == 32)
+      return true;
+  }
+
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  int Index = 0;
+
+  for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+    if (Index == SuccIndex - 1) {
+      FirstMovI = *MI;
+      break;
+    }
+    (*MI)->eraseFromParent();
+  }
+
+  Register DstRegW =
+      TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+  int Lower = Accumulated & Mask;
+  if (Lower) {
+    BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+            TII->get(AArch64::MOVZWi), DstRegW)
+        .addImm(Lower)
+        .addImm(0);
+    Lower = Accumulated >> 16 & Mask;
+    if (Lower) {
+      BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+              TII->get(AArch64::MOVKWi), DstRegW)
+          .addUse(DstRegW)
+          .addImm(Lower)
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+    }
+  } else {
+    Lower = Accumulated >> 16 & Mask;
+    BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+            TII->get(AArch64::MOVZWi), DstRegW)
+        .addImm(Lower)
+        .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+  }
+  FirstMovI->eraseFromParent();
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
+bool AArch64LoadStoreOpt::foldSymmetryConstantLoads(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0, DupBitSize = 0;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
+  uint64_t IValue, IShift, Accumulated = 0, Mask = 0xFFFFUL;
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isSymmetric(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        break;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs) {
+      DupBitSize = 32;
+      MIs.push_back(MBBI);
+      continue;
+    }
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    IValue = Value.getImm();
+    IShift = Shift.getImm();
+    Accumulated -= (Accumulated & (Mask << IShift));
+    Accumulated += (IValue << IShift);
+    // We assume that a 64-bit constant load starts with MOVZXi.
+    // ex)
+    // renamable $x8 = MOVZXi 49370, 0
+    // renamable $x8 = MOVKXi $x8, 320, 16
+    // renamable $x8 = ORRXrs $x8, $x8, 32
+    if (Opc == AArch64::MOVZXi && DupBitSize) {
+      Accumulated |= Accumulated << DupBitSize;
+      DupBitSize = 0;
+    }
+
+    MIs.push_back(MBBI);
+    if (Accumulated != 0 &&
+        (Accumulated >> 32) == (Accumulated & 0xffffffffULL))
+      SuccIndex = MIs.size();
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -2518,6 +2686,26 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }
 
+  // We have an opportunity to optimize the `STRXui` instruction, which stores
+  // a 64-bit constant whose two 32-bit halves are identical. Using `STPWi`,
+  // the 32-bit value only needs to be materialized once.
+  // Considering:
+  // renamable $x8 = MOVZXi 49370, 0
+  // renamable $x8 = MOVKXi $x8, 320, 16
+  // renamable $x8 = ORRXrs $x8, $x8, 32
+  // STRXui killed renamable $x8, killed renamable $x0, 0
+  // Transform:
+  // $w8 = MOVZWi 49370, 0
+  // $w8 = MOVKWi $w8, 320, 16
+  // STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (foldSymmetryConstantLoads(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }
 
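Note: the following is a minimal standalone sketch, not part of the patch. It models the `Accumulated` bookkeeping that foldSymmetryConstantLoads performs for the MOVZXi/MOVKXi sequence from the comments above, plus the duplication applied once an ORRXrs with shift 32 has been seen, and checks the condition that records `SuccIndex`. The instructions are evaluated here in program order, whereas the pass walks the block backwards, but the resulting value check is the same.

// Sketch (illustration only): accumulation check for the example constant.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Mask = 0xFFFFULL;
  uint64_t Accumulated = 0;

  // renamable $x8 = MOVZXi 49370, 0   (writes 0xC0DA into bits [15:0])
  Accumulated -= Accumulated & (Mask << 0);
  Accumulated += 49370ULL << 0;

  // renamable $x8 = MOVKXi $x8, 320, 16   (inserts 0x0140 into bits [31:16])
  Accumulated -= Accumulated & (Mask << 16);
  Accumulated += 320ULL << 16;

  // renamable $x8 = ORRXrs $x8, $x8, 32   (duplicates the low 32 bits)
  Accumulated |= Accumulated << 32;

  // The two 32-bit halves match, so the STRXui can be folded into an STPWi.
  assert((Accumulated >> 32) == (Accumulated & 0xFFFFFFFFULL));
  assert(Accumulated == 0x0140C0DA0140C0DAULL);
  return 0;
}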
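A hypothetical source-level example (not taken from the patch or its tests) of the kind of store this fold targets: a 64-bit constant whose low and high 32 bits are equal. For such a constant, the MOVZXi/MOVKXi/ORRXrs + STRXui sequence shown in the patch comments is exactly the pattern the new loop looks for, so the value can instead be built as a 32-bit immediate and stored twice with a single STPWi.

// Hypothetical example: low 32 bits == high 32 bits == 0x0140C0DA.
#include <cstdint>

void storeRepeatedHalves(uint64_t *Ptr) {
  *Ptr = 0x0140C0DA0140C0DAULL;
}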