@@ -201,6 +201,14 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
  // Find and merge a base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

+  // Find and collapse loads of a symmetric constant value.
+  bool tryFoldSymmetryConstantLoad(MachineBasicBlock::iterator &I,
+                                   unsigned Limit);
+  MachineBasicBlock::iterator
+  doFoldSymmetryConstantLoad(MachineInstr &MI,
+                             SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+                             int SuccIndex, bool hasORR, int Accumulated);
+
  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2252,6 +2260,167 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
  return E;
}

+static bool isSymmetric(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+    MachineOperand &Imm = MI.getOperand(3);
+    // The fourth operand of the ORR must be 32, which means a 32-bit
+    // symmetric constant load.
+    // e.g. renamable $x8 = ORRXrs $x8, $x8, 32
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == 32)
+      return true;
+  }
+
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::doFoldSymmetryConstantLoad(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, bool hasORR, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  Register DstRegW;
+
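+  // When the value was completed by an ORR that copies the low 32 bits into
+  // the high half, deleting that ORR is enough: the remaining MOVZ/MOVK chain
+  // still materializes the 32-bit half. Otherwise, rewrite the chain below to
+  // build only the 32-bit value with MOVZWi/MOVKWi.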
+  if (hasORR) {
+    (*MIs.begin())->eraseFromParent();
+  } else {
+    int Index = 0;
+    for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+      if (Index == SuccIndex - 1) {
+        FirstMovI = *MI;
+        break;
+      }
+      (*MI)->eraseFromParent();
+    }
+    DstRegW =
+        TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+
+    int Lower = Accumulated & Mask;
+    if (Lower) {
+      BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+              TII->get(AArch64::MOVZWi), DstRegW)
+          .addImm(Lower)
+          .addImm(0);
+      Lower = (Accumulated >> 16) & Mask;
+      if (Lower) {
+        BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+                TII->get(AArch64::MOVKWi), DstRegW)
+            .addUse(DstRegW)
+            .addImm(Lower)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+      }
+    } else {
+      Lower = Accumulated >> 16 & Mask;
+      BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+              TII->get(AArch64::MOVZWi), DstRegW)
+          .addImm(Lower)
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+    }
+    FirstMovI->eraseFromParent();
+  }
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
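+  // Rewrite the 64-bit STRXui as an STPWi that stores the 32-bit subregister
+  // twice.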
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
+bool AArch64LoadStoreOpt::tryFoldSymmetryConstantLoad(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0;
+  bool hasORR = false;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
+  uint64_t Accumulated = 0, Mask = 0xFFFFUL;
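+  // Walk backwards from the store, collecting the MOVZ/MOVK/ORR instructions
+  // that materialize the stored register's value.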
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
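+    // Any other instruction may sit in between only if it neither defines nor
+    // uses the stored register.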
+    if (!isSymmetric(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        return false;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs) {
+      hasORR = true;
+      MIs.push_back(MBBI);
+      continue;
+    }
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    uint64_t IValue = Value.getImm();
+    uint64_t IShift = Shift.getImm();
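+    // Fold this instruction's 16-bit immediate into the corresponding field
+    // of the value accumulated so far.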
+    Accumulated -= (Accumulated & (Mask << IShift));
+    Accumulated += (IValue << IShift);
+    MIs.push_back(MBBI);
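+    // Succeed once the upper and lower 32 bits of the accumulated value are
+    // equal, or once only the lower 32 bits are set and an ORR seen earlier
+    // in the walk duplicates them into the upper half.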
+    if (Accumulated != 0 &&
+        (((Accumulated >> 32) == (Accumulated & 0xffffffffULL)) ||
+         (hasORR && Accumulated >> 32 == 0))) {
+      SuccIndex = MIs.size();
+      break;
+    }
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = doFoldSymmetryConstantLoad(MI, MIs, SuccIndex, hasORR, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
@@ -2518,6 +2687,26 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
      ++MBBI;
  }

+  // We also have an opportunity when a `STRXui` stores a 64-bit constant whose
+  // upper and lower 32 bits are identical: materialize the 32-bit value once
+  // and store it twice with `STPWi` instead of building the full constant.
+  // Considering:
+  //   renamable $x8 = MOVZXi 49370, 0
+  //   renamable $x8 = MOVKXi $x8, 320, 16
+  //   renamable $x8 = ORRXrs $x8, $x8, 32
+  //   STRXui killed renamable $x8, killed renamable $x0, 0
+  // Transform:
+  //   $w8 = MOVZWi 49370, 0
+  //   $w8 = MOVKWi $w8, 320, 16
+  //   STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (tryFoldSymmetryConstantLoad(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
  return Modified;
}