Skip to content

Commit b24af43

Browse files
authored
[AArch64] Improve scheduling latency into Bundles (llvm#86310)
By default, the scheduling dependencies of instructions into a BUNDLE are given a latency of 0, as they operate on the implicit register operands of the bundle. This modifies that for AArch64 so that the latency is adjusted to use the latency from the instruction in the bundle instead. This essentially assumes that the bundled instructions are executed in a single cycle, which for AArch64 is probably OK considering they are mostly used for MOVPRFX bundles, where this can help create slightly better scheduling, especially for in-order cores.
1 parent f5b2d24 commit b24af43

36 files changed

+377
-327
lines changed

llvm/include/llvm/CodeGen/TargetSubtargetInfo.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,9 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
235235
// and UseOpIdx are the indices of the operands in Def and Use, respectively.
236236
// Otherwise, either may be -1.
237237
virtual void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
238-
int UseOpIdx, SDep &Dep) const {}
238+
int UseOpIdx, SDep &Dep,
239+
const TargetSchedModel *SchedModel) const {
240+
}
239241

240242
// For use with PostRAScheduling: get the anti-dependence breaking that should
241243
// be performed before post-RA scheduling.

llvm/lib/CodeGen/MachinePipeliner.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -919,7 +919,8 @@ void SwingSchedulerDAG::updatePhiDependences() {
919919
if (!MI->isPHI()) {
920920
SDep Dep(SU, SDep::Data, Reg);
921921
Dep.setLatency(0);
922-
ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep);
922+
ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
923+
&SchedModel);
923924
I.addPred(Dep);
924925
} else {
925926
HasPhiUse = Reg;

llvm/lib/CodeGen/ScheduleDAGInstrs.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
282282
} else {
283283
Dep.setLatency(0);
284284
}
285-
ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOpIdx, Dep);
285+
ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOpIdx, Dep, &SchedModel);
286286
UseSU->addPred(Dep);
287287
}
288288
}
@@ -323,7 +323,8 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
323323
Dep.setLatency(
324324
SchedModel.computeOutputLatency(MI, OperIdx, DefInstr));
325325
}
326-
ST.adjustSchedDependency(SU, OperIdx, DefSU, I->OpIdx, Dep);
326+
ST.adjustSchedDependency(SU, OperIdx, DefSU, I->OpIdx, Dep,
327+
&SchedModel);
327328
DefSU->addPred(Dep);
328329
}
329330
}
@@ -453,7 +454,8 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
453454
SDep Dep(SU, SDep::Data, Reg);
454455
Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
455456
I->OperandIndex));
456-
ST.adjustSchedDependency(SU, OperIdx, UseSU, I->OperandIndex, Dep);
457+
ST.adjustSchedDependency(SU, OperIdx, UseSU, I->OperandIndex, Dep,
458+
&SchedModel);
457459
UseSU->addPred(Dep);
458460
}
459461

llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -512,7 +512,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
512512
Dep.setLatency(OpLatency);
513513
if (!isChain && !UnitLatencies) {
514514
computeOperandLatency(OpN, N, i, Dep);
515-
ST.adjustSchedDependency(OpSU, DefIdx, &SU, i, Dep);
515+
ST.adjustSchedDependency(OpSU, DefIdx, &SU, i, Dep, nullptr);
516516
}
517517

518518
if (!SU.addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {

llvm/lib/Target/AArch64/AArch64Subtarget.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,45 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
472472
Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
473473
}
474474

475+
void AArch64Subtarget::adjustSchedDependency(
476+
SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
477+
const TargetSchedModel *SchedModel) const {
478+
if (!SchedModel || Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
479+
!Def->isInstr() || !Use->isInstr() ||
480+
(Def->getInstr()->getOpcode() != TargetOpcode::BUNDLE &&
481+
Use->getInstr()->getOpcode() != TargetOpcode::BUNDLE))
482+
return;
483+
484+
// If the Def is a BUNDLE, find the last instruction in the bundle that defs
485+
// the register.
486+
const MachineInstr *DefMI = Def->getInstr();
487+
if (DefMI->getOpcode() == TargetOpcode::BUNDLE) {
488+
Register Reg = DefMI->getOperand(DefOpIdx).getReg();
489+
for (const auto &Op : const_mi_bundle_ops(*DefMI)) {
490+
if (Op.isReg() && Op.isDef() && Op.getReg() == Reg) {
491+
DefMI = Op.getParent();
492+
DefOpIdx = Op.getOperandNo();
493+
}
494+
}
495+
}
496+
497+
// If the Use is a BUNDLE, find the first instruction that uses the Reg.
498+
const MachineInstr *UseMI = Use->getInstr();
499+
if (UseMI->getOpcode() == TargetOpcode::BUNDLE) {
500+
Register Reg = UseMI->getOperand(UseOpIdx).getReg();
501+
for (const auto &Op : const_mi_bundle_ops(*UseMI)) {
502+
if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) {
503+
UseMI = Op.getParent();
504+
UseOpIdx = Op.getOperandNo();
505+
break;
506+
}
507+
}
508+
}
509+
510+
Dep.setLatency(
511+
SchedModel->computeOperandLatency(DefMI, DefOpIdx, UseMI, UseOpIdx));
512+
}
513+
475514
bool AArch64Subtarget::enableEarlyIfConversion() const {
476515
return EnableEarlyIfConvert;
477516
}

llvm/lib/Target/AArch64/AArch64Subtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
354354

355355
void overrideSchedPolicy(MachineSchedPolicy &Policy,
356356
unsigned NumRegionInstrs) const override;
357+
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
358+
SDep &Dep,
359+
const TargetSchedModel *SchedModel) const override;
357360

358361
bool enableEarlyIfConversion() const override;
359362

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -860,8 +860,9 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
860860
return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
861861
}
862862

863-
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
864-
int UseOpIdx, SDep &Dep) const {
863+
void GCNSubtarget::adjustSchedDependency(
864+
SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
865+
const TargetSchedModel *SchedModel) const {
865866
if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
866867
!Def->isInstr() || !Use->isInstr())
867868
return;

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1500,7 +1500,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
15001500
}
15011501

15021502
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1503-
SDep &Dep) const override;
1503+
SDep &Dep,
1504+
const TargetSchedModel *SchedModel) const override;
15041505

15051506
// \returns true if it's beneficial on this subtarget for the scheduler to
15061507
// cluster stores as well as loads.

llvm/lib/Target/Hexagon/HexagonSubtarget.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -437,9 +437,9 @@ bool HexagonSubtarget::useAA() const {
437437

438438
/// Perform target specific adjustments to the latency of a schedule
439439
/// dependency.
440-
void HexagonSubtarget::adjustSchedDependency(SUnit *Src, int SrcOpIdx,
441-
SUnit *Dst, int DstOpIdx,
442-
SDep &Dep) const {
440+
void HexagonSubtarget::adjustSchedDependency(
441+
SUnit *Src, int SrcOpIdx, SUnit *Dst, int DstOpIdx, SDep &Dep,
442+
const TargetSchedModel *SchedModel) const {
443443
if (!Src->isInstr() || !Dst->isInstr())
444444
return;
445445

llvm/lib/Target/Hexagon/HexagonSubtarget.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,8 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
308308
/// Perform target specific adjustments to the latency of a schedule
309309
/// dependency.
310310
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
311-
SDep &Dep) const override;
311+
SDep &Dep,
312+
const TargetSchedModel *SchedModel) const override;
312313

313314
unsigned getVectorLength() const {
314315
assert(useHVXOps());

llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -190,21 +190,21 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
190190
; CHECK: // %bb.0: // %entry
191191
; CHECK-NEXT: uzp1 z24.d, z2.d, z3.d
192192
; CHECK-NEXT: uzp2 z25.d, z0.d, z1.d
193-
; CHECK-NEXT: ptrue p0.d
194193
; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d
194+
; CHECK-NEXT: ptrue p0.d
195195
; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
196196
; CHECK-NEXT: uzp1 z26.d, z6.d, z7.d
197197
; CHECK-NEXT: fmul z1.d, z24.d, z25.d
198198
; CHECK-NEXT: fmul z3.d, z2.d, z25.d
199199
; CHECK-NEXT: uzp2 z25.d, z4.d, z5.d
200200
; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d
201201
; CHECK-NEXT: uzp2 z5.d, z6.d, z7.d
202-
; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z0.d
203202
; CHECK-NEXT: fmla z3.d, p0/m, z26.d, z25.d
203+
; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z0.d
204204
; CHECK-NEXT: movprfx z2, z3
205205
; CHECK-NEXT: fmla z2.d, p0/m, z5.d, z4.d
206-
; CHECK-NEXT: fnmls z2.d, p0/m, z24.d, z0.d
207206
; CHECK-NEXT: fmla z1.d, p0/m, z26.d, z4.d
207+
; CHECK-NEXT: fnmls z2.d, p0/m, z24.d, z0.d
208208
; CHECK-NEXT: fmls z1.d, p0/m, z5.d, z25.d
209209
; CHECK-NEXT: zip1 z0.d, z2.d, z1.d
210210
; CHECK-NEXT: zip2 z1.d, z2.d, z1.d

llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1145,8 +1145,8 @@ define <vscale x 4 x i64> @fshl_rot_illegal_i64(<vscale x 4 x i64> %a, <vscale x
11451145
; CHECK-NEXT: subr z3.d, z3.d, #0 // =0x0
11461146
; CHECK-NEXT: and z4.d, z4.d, #0x3f
11471147
; CHECK-NEXT: and z2.d, z2.d, #0x3f
1148-
; CHECK-NEXT: and z3.d, z3.d, #0x3f
11491148
; CHECK-NEXT: and z5.d, z5.d, #0x3f
1149+
; CHECK-NEXT: and z3.d, z3.d, #0x3f
11501150
; CHECK-NEXT: lslr z4.d, p0/m, z4.d, z0.d
11511151
; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z2.d
11521152
; CHECK-NEXT: movprfx z2, z1

llvm/test/CodeGen/AArch64/misched-bundle.mir

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,9 @@
4646
# CHECK-NEXT: # rdefs left : 0
4747
# CHECK-NEXT: Latency : 3
4848
# CHECK-NEXT: Depth : 0
49-
# CHECK-NEXT: Height : 0
49+
# CHECK-NEXT: Height : 7
5050
# CHECK-NEXT: Successors:
51-
# CHECK-NEXT: SU(7): Data Latency=0 Reg=$z3
51+
# CHECK-NEXT: SU(7): Data Latency=3 Reg=$z3
5252
# CHECK-NEXT: SU(9): Ord Latency=0 Memory
5353
# CHECK-NEXT: SU(8): Ord Latency=0 Memory
5454
# CHECK-NEXT: Single Issue : false;
@@ -58,9 +58,9 @@
5858
# CHECK-NEXT: # rdefs left : 0
5959
# CHECK-NEXT: Latency : 3
6060
# CHECK-NEXT: Depth : 0
61-
# CHECK-NEXT: Height : 0
61+
# CHECK-NEXT: Height : 7
6262
# CHECK-NEXT: Successors:
63-
# CHECK-NEXT: SU(7): Data Latency=0 Reg=$z4
63+
# CHECK-NEXT: SU(7): Data Latency=3 Reg=$z4
6464
# CHECK-NEXT: SU(9): Ord Latency=0 Memory
6565
# CHECK-NEXT: SU(8): Ord Latency=0 Memory
6666
# CHECK-NEXT: Single Issue : false;
@@ -70,9 +70,9 @@
7070
# CHECK-NEXT: # rdefs left : 0
7171
# CHECK-NEXT: Latency : 3
7272
# CHECK-NEXT: Depth : 0
73-
# CHECK-NEXT: Height : 0
73+
# CHECK-NEXT: Height : 7
7474
# CHECK-NEXT: Successors:
75-
# CHECK-NEXT: SU(7): Data Latency=0 Reg=$z5
75+
# CHECK-NEXT: SU(7): Data Latency=3 Reg=$z5
7676
# CHECK-NEXT: SU(9): Ord Latency=0 Memory
7777
# CHECK-NEXT: SU(8): Ord Latency=0 Memory
7878
# CHECK-NEXT: Single Issue : false;
@@ -98,15 +98,15 @@
9898
# CHECK-NEXT: # rdefs left : 0
9999
# CHECK-NEXT: Latency : 1
100100
# CHECK-NEXT: Depth : 3
101-
# CHECK-NEXT: Height : 0
101+
# CHECK-NEXT: Height : 4
102102
# CHECK-NEXT: Predecessors:
103103
# CHECK-NEXT: SU(6): Anti Latency=0
104-
# CHECK-NEXT: SU(5): Data Latency=0 Reg=$z5
105-
# CHECK-NEXT: SU(4): Data Latency=0 Reg=$z4
106-
# CHECK-NEXT: SU(3): Data Latency=0 Reg=$z3
104+
# CHECK-NEXT: SU(5): Data Latency=3 Reg=$z5
105+
# CHECK-NEXT: SU(4): Data Latency=3 Reg=$z4
106+
# CHECK-NEXT: SU(3): Data Latency=3 Reg=$z3
107107
# CHECK-NEXT: SU(1): Out Latency=1
108108
# CHECK-NEXT: Successors:
109-
# CHECK-NEXT: SU(9): Data Latency=0 Reg=$z1
109+
# CHECK-NEXT: SU(9): Data Latency=4 Reg=$z1
110110
# CHECK-NEXT: Single Issue : false;
111111
# CHECK-NEXT: SU(8): ST1H killed renamable $z0, renamable $p0, renamable $x0, renamable $x10 :: (store unknown-size, align 1)
112112
# CHECK-NEXT: # preds left : 7
@@ -135,7 +135,7 @@
135135
# CHECK-NEXT: Height : 0
136136
# CHECK-NEXT: Predecessors:
137137
# CHECK-NEXT: SU(8): Ord Latency=0 Memory
138-
# CHECK-NEXT: SU(7): Data Latency=0 Reg=$z1
138+
# CHECK-NEXT: SU(7): Data Latency=4 Reg=$z1
139139
# CHECK-NEXT: SU(5): Ord Latency=0 Memory
140140
# CHECK-NEXT: SU(4): Ord Latency=0 Memory
141141
# CHECK-NEXT: SU(3): Ord Latency=0 Memory
@@ -159,24 +159,24 @@ body: |
159159
bb.0.entry:
160160
liveins: $p0, $x0, $x1, $x2, $x10, $x11, $x12, $x13
161161
162+
162163
; CHECK-LABEL: name: test
163164
; CHECK: liveins: $p0, $x0, $x1, $x2, $x10, $x11, $x12, $x13
164165
; CHECK-NEXT: {{ $}}
165166
; CHECK-NEXT: renamable $z0 = LD1H renamable $p0, renamable $x1, renamable $x10 :: (load unknown-size, align 1)
166167
; CHECK-NEXT: renamable $z1 = LD1H renamable $p0, renamable $x2, renamable $x10 :: (load unknown-size, align 1)
167168
; CHECK-NEXT: renamable $z2 = LD1H renamable $p0, renamable $x0, renamable $x10 :: (load unknown-size, align 1)
168-
; CHECK-NEXT: $z0 = FMAD_ZPmZZ_H renamable $p0, killed $z0, renamable $z1, killed renamable $z2
169169
; CHECK-NEXT: renamable $z3 = LD1H renamable $p0, renamable $x11, renamable $x10 :: (load unknown-size, align 1)
170170
; CHECK-NEXT: renamable $z4 = LD1H renamable $p0, renamable $x12, renamable $x10 :: (load unknown-size, align 1)
171171
; CHECK-NEXT: renamable $z5 = LD1H renamable $p0, renamable $x13, renamable $x10 :: (load unknown-size, align 1)
172-
; CHECK-NEXT: ST1H killed renamable $z0, renamable $p0, renamable $x0, renamable $x10 :: (store unknown-size, align 1)
173-
; CHECK-NEXT: BUNDLE implicit-def $z1, implicit-def $q1, implicit-def $d1, implicit-def $s1, implicit-def $h1, implicit-def $b1, implicit $z5, implicit $p0, implicit $z4, implicit $z3 {
172+
; CHECK-NEXT: $z0 = FMAD_ZPmZZ_H renamable $p0, killed $z0, killed renamable $z1, killed renamable $z2
173+
; CHECK-NEXT: BUNDLE implicit-def $z1, implicit-def $q1, implicit-def $d1, implicit-def $s1, implicit-def $h1, implicit-def $b1, implicit $z5, implicit $p0, implicit killed $z4, implicit killed $z3 {
174174
; CHECK-NEXT: $z1 = MOVPRFX_ZZ $z5
175-
; CHECK-NEXT: $z1 = FMLA_ZPmZZ_H renamable $p0, internal $z1, renamable $z4, renamable $z3
175+
; CHECK-NEXT: $z1 = FMLA_ZPmZZ_H renamable $p0, internal killed $z1, killed renamable $z4, killed renamable $z3
176176
; CHECK-NEXT: }
177-
; CHECK-NEXT: ST1H renamable $z1, renamable $p0, renamable $x13, renamable $x10 :: (store unknown-size, align 1)
177+
; CHECK-NEXT: ST1H killed renamable $z0, renamable $p0, renamable $x0, renamable $x10 :: (store unknown-size, align 1)
178+
; CHECK-NEXT: ST1H killed renamable $z1, renamable $p0, renamable $x13, renamable $x10 :: (store unknown-size, align 1)
178179
; CHECK-NEXT: RET_ReallyLR
179-
180180
renamable $z0 = LD1H renamable $p0, renamable $x1, renamable $x10 :: (load unknown-size)
181181
renamable $z1 = LD1H renamable $p0, renamable $x2, renamable $x10 :: (load unknown-size)
182182
renamable $z2 = LD1H renamable $p0, renamable $x0, renamable $x10 :: (load unknown-size)

llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) #0 {
4343
; VBITS_GE_256-LABEL: build_vector_minus2_dec32_v4i64:
4444
; VBITS_GE_256: // %bb.0:
4545
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
46-
; VBITS_GE_256-NEXT: mov x8, #-32
46+
; VBITS_GE_256-NEXT: mov x8, #-32 // =0xffffffffffffffe0
4747
; VBITS_GE_256-NEXT: index z0.d, #-2, x8
4848
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
4949
; VBITS_GE_256-NEXT: ret

0 commit comments

Comments
 (0)