@@ -226,6 +226,14 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
  // Find and merge an index ldr/st instruction into a base ld/st instruction.
  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);

+  // Find and collapse loads of a symmetric constant value.
+  bool tryFoldSymmetryConstantLoad(MachineBasicBlock::iterator &I,
+                                   unsigned Limit);
+  MachineBasicBlock::iterator
+  doFoldSymmetryConstantLoad(MachineInstr &MI,
+                             SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+                             int UpperLoadIdx, int Accumulated);
+
  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2443,6 +2451,155 @@ AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
  return E;
}

+static bool isSymmetricLoadCandidate(MachineInstr &MI, Register BaseReg) {
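+  // Every register operand among the first Count operands must be the base
+  // register of the store; otherwise this instruction is not part of the
+  // constant-building sequence.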
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+    MachineOperand &Imm = MI.getOperand(3);
+    // The fourth operand of the ORR must be 32, which means this is part of a
+    // 32-bit symmetric constant load.
+    // e.g. renamable $x8 = ORRXrs $x8, $x8, 32
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == 32)
+      return true;
+  }
+
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::doFoldSymmetryConstantLoad(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int UpperLoadIdx, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock *MBB = MI.getParent();
+
+  if (!UpperLoadIdx) {
+    // The ORR guarantees that the preceding instructions only load the lower
+    // 32-bit half of the constant, so removing the ORR is sufficient.
+    (*MIs.begin())->eraseFromParent();
+  } else {
+    // Remove the MOVs that build the upper 32 bits; they are redundant once
+    // the constant is known to be symmetric.
+    int Index = 0;
+    for (auto MI = MIs.begin(); Index < UpperLoadIdx; ++MI, Index++) {
+      (*MI)->eraseFromParent();
+    }
+  }
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  Register DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
+  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
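+  // Store the 32-bit sub-register twice at a word-scaled offset; this STPWi
+  // replaces the original 64-bit STRXui, which is erased below.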
2510
+ BuildMI (*MBB, MI, MI.getDebugLoc (), TII->get (AArch64::STPWi))
2511
+ .addReg (DstRegW, DstRegState)
2512
+ .addReg (DstRegW, DstRegState)
2513
+ .addReg (MO.getReg (), getRegState (MO))
2514
+ .addImm (Offset * 2 )
2515
+ .setMemRefs (MI.memoperands ())
2516
+ .setMIFlags (MI.getFlags ());
2517
+ I->eraseFromParent ();
2518
+ return NextI;
2519
+ }
2520
+
2521
+ bool AArch64LoadStoreOpt::tryFoldSymmetryConstantLoad (
2522
+ MachineBasicBlock::iterator &I, unsigned Limit) {
2523
+ MachineInstr &MI = *I;
2524
+ if (MI.getOpcode () != AArch64::STRXui)
2525
+ return false ;
2526
+
2527
+ MachineBasicBlock::iterator MBBI = I;
2528
+ MachineBasicBlock::iterator B = I->getParent ()->begin ();
2529
+ if (MBBI == B)
2530
+ return false ;
2531
+
2532
+ TypeSize Scale (0U , false ), Width (0U , false );
2533
+ int64_t MinOffset, MaxOffset;
2534
+ if (!AArch64InstrInfo::getMemOpInfo (AArch64::STPWi, Scale, Width, MinOffset,
2535
+ MaxOffset))
2536
+ return false ;
2537
+
2538
+ // We replace the STRX instruction, which stores 64 bits, with the STPW
2539
+ // instruction, which stores two consecutive 32 bits. Therefore, we compare
2540
+ // the offset range with multiplied by two.
2541
+ int Offset = AArch64InstrInfo::getLdStOffsetOp (MI).getImm ();
2542
+ if (Offset * 2 < MinOffset || Offset * 2 > MaxOffset)
2543
+ return false ;
2544
+
2545
+ Register BaseReg = getLdStRegOp (MI).getReg ();
2546
+ unsigned Count = 0 , UpperLoadIdx = 0 ;
2547
+ uint64_t Accumulated = 0 , Mask = 0xFFFFUL ;
2548
+ bool hasORR = false , Found = false ;
2549
+ SmallVector<MachineBasicBlock::iterator> MIs;
2550
+ ModifiedRegUnits.clear ();
2551
+ UsedRegUnits.clear ();
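+  // Walk backwards from the store, collecting the MOVZ/MOVK/ORR instructions
+  // that build the stored constant in BaseReg. Bail out if BaseReg is used or
+  // modified by anything else before the sequence is complete.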
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isSymmetricLoadCandidate(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        return false;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs) {
+      hasORR = true;
+      MIs.push_back(MBBI);
+      continue;
+    }
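+    // For MOVZXi the immediate and shift are operands 1 and 2; MOVKXi also
+    // reads the register it updates, so its immediate and shift are operands
+    // 2 and 3.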
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    uint64_t IValue = Value.getImm();
+    uint64_t IShift = Shift.getImm();
+    uint64_t Adder = IValue << IShift;
+    MIs.push_back(MBBI);
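+    // Record how many collected instructions (counted backwards from the
+    // store) are needed to reach the MOV writing the upper 32 bits; these are
+    // the entries doFoldSymmetryConstantLoad erases.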
+    if (Adder >> 32)
+      UpperLoadIdx = MIs.size();
+
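+    // Clear the 16-bit lane this MOV writes before adding its value, so
+    // Accumulated mirrors the register contents built so far.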
+    Accumulated -= Accumulated & (Mask << IShift);
+    Accumulated += Adder;
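+    // The constant is symmetric once its two 32-bit halves match, or once an
+    // ORR duplicates the low half and only the low 32 bits have been built.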
+    if (Accumulated != 0 &&
+        (((Accumulated >> 32) == (Accumulated & 0xffffffffULL)) ||
+         (hasORR && (Accumulated >> 32 == 0)))) {
+      Found = true;
+      break;
+    }
+  } while (MBBI != B && Count < Limit);
+
+  if (Found) {
+    I = doFoldSymmetryConstantLoad(MI, MIs, UpperLoadIdx, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
@@ -2753,6 +2910,27 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
      ++MBBI;
  }

+  // We have an opportunity to optimize an `STRXui` instruction whose 64-bit
+  // source register holds the same 32-bit value in both halves. The `STPWi`
+  // instruction allows us to materialize that 32-bit value only once.
+  // Considering:
+  //   renamable $x8 = MOVZXi 49370, 0
+  //   renamable $x8 = MOVKXi $x8, 320, 16
+  //   renamable $x8 = ORRXrs $x8, $x8, 32
+  //   STRXui killed renamable $x8, killed renamable $x0, 0
+  // Transform:
+  //   $w8 = MOVZWi 49370, 0
+  //   $w8 = MOVKWi $w8, 320, 16
+  //   STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (isMergeableLdStUpdate(*MBBI) &&
+        tryFoldSymmetryConstantLoad(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
  return Modified;
}