@@ -201,6 +201,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Finds and collapses loads of repeated constant values.
+  bool foldSymmetryConstantLoads(MachineBasicBlock::iterator &I,
+                                 unsigned Limit);
+  MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
+      MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+      int SuccIndex, int Accumulated);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
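
The two declarations added above target 64-bit constants whose upper and lower 32 bits are identical. A minimal standalone sketch of that symmetry check (the helper name is illustrative, not part of the patch), mirroring the `(Accumulated >> 32) == (Accumulated & 0xffffffffULL)` test used in the implementation below:

    #include <cstdint>

    // True when the 64-bit value is one 32-bit pattern repeated twice,
    // matching the condition the pass uses to record a fold candidate.
    static bool isRepeated32BitPattern(uint64_t V) {
      return V != 0 && (V >> 32) == (V & 0xffffffffULL);
    }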
@@ -2252,6 +2259,166 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+static bool isSymmetric(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+    MachineOperand &Imm = MI.getOperand(3);
+    // The fourth operand of ORR must be 32, which means the upper half of the
+    // constant duplicates the lower half (a 32-bit symmetric constant load).
+    // e.g. renamable $x8 = ORRXrs $x8, $x8, 32
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == 32)
+      return true;
+  }
+
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  int Index = 0;
+
+  for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+    if (Index == SuccIndex - 1) {
+      FirstMovI = *MI;
+      break;
+    }
+    (*MI)->eraseFromParent();
+  }
+
+  Register DstRegW =
+      TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+  int Lower = Accumulated & Mask;
+  if (Lower) {
+    BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+            TII->get(AArch64::MOVZWi), DstRegW)
+        .addImm(Lower)
+        .addImm(0);
+    Lower = Accumulated >> 16 & Mask;
+    if (Lower) {
+      BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+              TII->get(AArch64::MOVKWi), DstRegW)
+          .addUse(DstRegW)
+          .addImm(Lower)
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+    }
+  } else {
+    Lower = Accumulated >> 16 & Mask;
+    BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+            TII->get(AArch64::MOVZWi), DstRegW)
+        .addImm(Lower)
+        .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+  }
+  FirstMovI->eraseFromParent();
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
+bool AArch64LoadStoreOpt::foldSymmetryConstantLoads(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0, DupBitSize = 0;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
+  uint64_t IValue, IShift, Accumulated = 0, Mask = 0xFFFFUL;
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isSymmetric(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        break;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs) {
+      DupBitSize = 32;
+      MIs.push_back(MBBI);
+      continue;
+    }
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    IValue = Value.getImm();
+    IShift = Shift.getImm();
+    Accumulated -= (Accumulated & (Mask << IShift));
+    Accumulated += (IValue << IShift);
+    // We assume that 64-bit constant loading starts with MOVZXi, e.g.
+    //   renamable $x8 = MOVZXi 49370, 0
+    //   renamable $x8 = MOVKXi $x8, 320, 16
+    //   renamable $x8 = ORRXrs $x8, $x8, 32
+    if (Opc == AArch64::MOVZXi && DupBitSize) {
+      Accumulated |= Accumulated << DupBitSize;
+      DupBitSize = 0;
+    }
+
+    MIs.push_back(MBBI);
+    if (Accumulated != 0 &&
+        (Accumulated >> 32) == (Accumulated & 0xffffffffULL))
+      SuccIndex = MIs.size();
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
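
To see how the backward scan in foldSymmetryConstantLoads rebuilds the stored constant, take the MOVZXi/MOVKXi/ORRXrs sequence from the comment above. Below is a standalone sketch of the same accumulation arithmetic (values taken from that example; in the pass the instructions are visited in reverse and the duplication is deferred via DupBitSize, but the resulting value is the same):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Accumulated = 0, Mask = 0xFFFFUL;

      // renamable $x8 = MOVZXi 49370, 0     -> write 0xC0DA into bits [15:0]
      Accumulated -= (Accumulated & (Mask << 0));
      Accumulated += (uint64_t)49370 << 0;

      // renamable $x8 = MOVKXi $x8, 320, 16 -> write 0x0140 into bits [31:16]
      Accumulated -= (Accumulated & (Mask << 16));
      Accumulated += (uint64_t)320 << 16;

      // renamable $x8 = ORRXrs $x8, $x8, 32 -> duplicate the low 32 bits
      Accumulated |= Accumulated << 32;

      // Halves match, so the STRXui of $x8 can become an STPWi of $w8, $w8.
      assert((Accumulated >> 32) == (Accumulated & 0xffffffffULL));
      assert(Accumulated == 0x0140C0DA0140C0DAULL);
      return 0;
    }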
@@ -2518,6 +2685,26 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }
 
+  // We have an opportunity to optimize STRXui when it stores a register whose
+  // 64-bit value is the same 32-bit value repeated twice: the constant then
+  // only needs to be materialized once, as a 32-bit value, and stored twice
+  // with STPWi.
+  // Considering:
+  //   renamable $x8 = MOVZXi 49370, 0
+  //   renamable $x8 = MOVKXi $x8, 320, 16
+  //   renamable $x8 = ORRXrs $x8, $x8, 32
+  //   STRXui killed renamable $x8, killed renamable $x0, 0
+  // Transform:
+  //   $w8 = MOVZWi 49370, 0
+  //   $w8 = MOVKWi $w8, 320, 16
+  //   STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (foldSymmetryConstantLoads(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }
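
To exercise only this fold, the usual approach would be an MIR test containing the "Considering" sequence above, run through llc with just this pass enabled; assuming the pass keeps its registered name aarch64-ldst-opt, something like:

    llc -mtriple=aarch64 -run-pass=aarch64-ldst-opt -verify-machineinstrs -o - test.mir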