@@ -201,6 +201,14 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
  // Find and merge base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
+  // Find and collapse loads of a symmetric constant value.
+  bool tryFoldSymmetryConstantLoad(MachineBasicBlock::iterator &I,
+                                   unsigned Limit);
+  MachineBasicBlock::iterator
+  doFoldSymmetryConstantLoad(MachineInstr &MI,
+                             SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+                             int SuccIndex, int Accumulated);
+
  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2252,6 +2260,159 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
  return E;
}
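+// Return true if MI is one of the MOVZXi / MOVKXi / ORRXrs instructions that
+// can take part in building a symmetric 64-bit constant in BaseReg.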
+static bool isSymmetric(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+    MachineOperand &Imm = MI.getOperand(3);
+    // The fourth operand of ORR must be 32, which means a 32-bit symmetric
+    // constant load.
+    // e.g. renamable $x8 = ORRXrs $x8, $x8, 32
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == 32)
+      return true;
+  }
+
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::doFoldSymmetryConstantLoad(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  int Index = 0;
+
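+  // MIs was filled while scanning backwards from the store, so the entry at
+  // SuccIndex - 1 is the earliest MOV in program order. Erase the candidates
+  // that come after it in program order and keep it as the insertion point
+  // for the narrowed 32-bit sequence.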
+  for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+    if (Index == SuccIndex - 1) {
+      FirstMovI = *MI;
+      break;
+    }
+    (*MI)->eraseFromParent();
+  }
+
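+  // Rebuild only the low 32 bits of the accumulated constant with MOVZWi /
+  // MOVKWi, skipping any halfword that is already zero.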
+  Register DstRegW =
+      TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+  int Lower = Accumulated & Mask;
+  if (Lower) {
+    BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+            TII->get(AArch64::MOVZWi), DstRegW)
+        .addImm(Lower)
+        .addImm(0);
+    Lower = Accumulated >> 16 & Mask;
+    if (Lower) {
+      BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+              TII->get(AArch64::MOVKWi), DstRegW)
+          .addUse(DstRegW)
+          .addImm(Lower)
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+    }
+  } else {
+    Lower = Accumulated >> 16 & Mask;
+    BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+            TII->get(AArch64::MOVZWi), DstRegW)
+        .addImm(Lower)
+        .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+  }
+  FirstMovI->eraseFromParent();
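+  // For the store MI, getLdStRegOp() is the source data register (the one the
+  // constant was built in), and getLdStBaseOp() is the address base operand.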
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
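+  // Store the 32-bit sub-register twice with STPWi, replacing the original
+  // 64-bit STRXui.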
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
+bool AArch64LoadStoreOpt::tryFoldSymmetryConstantLoad(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
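+  // Only a 64-bit store with an unsigned immediate offset is a candidate.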
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0;
+  bool hasORR = false;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
+  uint64_t IValue, IShift, Accumulated = 0, Mask = 0xFFFFUL;
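+  // Walk backwards from the store, accumulating the constant that the MOVZ /
+  // MOVK (and optional ORR) sequence builds in the stored register. Give up
+  // if that register is clobbered or used by an unrelated instruction first.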
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isSymmetric(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        break;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs) {
+      hasORR = true;
+      MIs.push_back(MBBI);
+      continue;
+    }
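+    // For MOVZXi the immediate is operand 1; for MOVKXi the tied source
+    // register comes first, so the immediate is operand 2.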
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    IValue = Value.getImm();
+    IShift = Shift.getImm();
+    Accumulated -= (Accumulated & (Mask << IShift));
+    Accumulated += (IValue << IShift);
+    MIs.push_back(MBBI);
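+    // The fold succeeds once the accumulated value has identical upper and
+    // lower 32-bit halves, or only the low half is set and an ORR will
+    // duplicate it into the upper half.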
+    if ((Accumulated != 0) &&
+        (((Accumulated >> 32) == (Accumulated & 0xffffffffULL)) ||
+         ((hasORR && Accumulated >> 32 == 0))))
+      SuccIndex = MIs.size();
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = doFoldSymmetryConstantLoad(MI, MIs, SuccIndex, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
@@ -2518,6 +2679,26 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
      ++MBBI;
  }

+  // We have an opportunity to optimize an STRXui that stores a 64-bit
+  // constant whose upper and lower 32 bits are identical: the constant only
+  // needs to be materialized once, as a 32-bit value, and can then be stored
+  // twice with STPWi.
+  // Considering:
+  //   renamable $x8 = MOVZXi 49370, 0
+  //   renamable $x8 = MOVKXi $x8, 320, 16
+  //   renamable $x8 = ORRXrs $x8, $x8, 32
+  //   STRXui killed renamable $x8, killed renamable $x0, 0
+  // Transform:
+  //   $w8 = MOVZWi 49370, 0
+  //   $w8 = MOVKWi $w8, 320, 16
+  //   STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
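+  // The backward search inside tryFoldSymmetryConstantLoad is bounded by
+  // UpdateLimit, the same scan limit used when merging base register updates.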
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (tryFoldSymmetryConstantLoad(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
  return Modified;
}