Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit 8a6498e

Browse files
author
Jinsong Ji
committed
[PowerPC] Enable MachinePipeliner for P9 with -ppc-enable-pipeliner
Implement necessary target hooks to enable MachinePipeliner for P9 only. The pass is off by default, can be enabled with -ppc-enable-pipeliner for P9. Differential Revision: https://reviews.llvm.org/D62164 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363085 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 31d8a65 commit 8a6498e

File tree

12 files changed

+227
-19
lines changed

12 files changed

+227
-19
lines changed

include/llvm/CodeGen/MachinePipeliner.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -318,9 +318,9 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
318318
MBBVectorTy &EpilogBBs);
319319
void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
320320
SMSchedule &Schedule);
321-
void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
322-
MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
323-
ValueMapTy *VRMap);
321+
void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
322+
MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
323+
SMSchedule &Schedule, ValueMapTy *VRMap);
324324
bool computeDelta(MachineInstr &MI, unsigned &Delta);
325325
void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
326326
unsigned Num);

include/llvm/CodeGen/TargetInstrInfo.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -670,8 +670,9 @@ class TargetInstrInfo : public MCInstrInfo {
670670
/// is finished. Return the value/register of the new loop count. We need
671671
/// this function when peeling off one or more iterations of a loop. This
672672
/// function assumes the nth iteration is peeled first.
673-
virtual unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineInstr *IndVar,
674-
MachineInstr &Cmp,
673+
virtual unsigned reduceLoopCount(MachineBasicBlock &MBB,
674+
MachineBasicBlock &PreHeader,
675+
MachineInstr *IndVar, MachineInstr &Cmp,
675676
SmallVectorImpl<MachineOperand> &Cond,
676677
SmallVectorImpl<MachineInstr *> &PrevInsts,
677678
unsigned Iter, unsigned MaxIter) const {

include/llvm/CodeGen/TargetSubtargetInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,9 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
193193
/// for preRA scheduling with the source level scheduler.
194194
virtual bool enableMachineSchedDefaultSched() const { return true; }
195195

196+
/// True if the subtarget should run MachinePipeliner
197+
// Defaults to true; a subtarget overrides this to keep MachinePipeliner from running.
virtual bool enableMachinePipeliner() const { return true; }
198+
196199
/// True if the subtarget should enable joining global copies.
197200
///
198201
/// By default this is enabled if the machine scheduler is enabled, but

lib/CodeGen/MachinePipeliner.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,9 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
187187
!EnableSWPOptSize.getPosition())
188188
return false;
189189

190+
if (!mf.getSubtarget().enableMachinePipeliner())
191+
return false;
192+
190193
// Cannot pipeline loops without instruction itineraries if we are using
191194
// DFA for the pipeliner.
192195
if (mf.getSubtarget().useDFAforSMS() &&
@@ -2026,6 +2029,10 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
20262029
InstrMapTy InstrMap;
20272030

20282031
SmallVector<MachineBasicBlock *, 4> PrologBBs;
2032+
2033+
MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
2034+
assert(PreheaderBB != nullptr &&
2035+
"Need to add code to handle loops w/o preheader");
20292036
// Generate the prolog instructions that set up the pipeline.
20302037
generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
20312038
MF.insert(BB->getIterator(), KernelBB);
@@ -2082,7 +2089,7 @@ void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
20822089
removeDeadInstructions(KernelBB, EpilogBBs);
20832090

20842091
// Add branches between prolog and epilog blocks.
2085-
addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
2092+
addBranches(*PreheaderBB, PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
20862093

20872094
// Remove the original loop since it's no longer referenced.
20882095
for (auto &I : *BB)
@@ -2767,7 +2774,8 @@ static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
27672774
/// Create branches from each prolog basic block to the appropriate epilog
27682775
/// block. These edges are needed if the loop ends before reaching the
27692776
/// kernel.
2770-
void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
2777+
void SwingSchedulerDAG::addBranches(MachineBasicBlock &PreheaderBB,
2778+
MBBVectorTy &PrologBBs,
27712779
MachineBasicBlock *KernelBB,
27722780
MBBVectorTy &EpilogBBs,
27732781
SMSchedule &Schedule, ValueMapTy *VRMap) {
@@ -2794,8 +2802,8 @@ void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
27942802
// Check if the LOOP0 has already been removed. If so, then there is no need
27952803
// to reduce the trip count.
27962804
if (LC != 0)
2797-
LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
2798-
MaxIter);
2805+
LC = TII->reduceLoopCount(*Prolog, PreheaderBB, IndVar, *Cmp, Cond,
2806+
PrevInsts, j, MaxIter);
27992807

28002808
// Record the value of the first trip count, which is used to determine if
28012809
// branches and blocks can be removed for constant trip counts.

lib/Target/Hexagon/HexagonInstrInfo.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -697,11 +697,11 @@ bool HexagonInstrInfo::analyzeLoop(MachineLoop &L,
697697
/// Generate code to reduce the loop iteration by one and check if the loop is
698698
/// finished. Return the value/register of the new loop count. this function
699699
/// assumes the nth iteration is peeled first.
700-
unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
701-
MachineInstr *IndVar, MachineInstr &Cmp,
702-
SmallVectorImpl<MachineOperand> &Cond,
703-
SmallVectorImpl<MachineInstr *> &PrevInsts,
704-
unsigned Iter, unsigned MaxIter) const {
700+
unsigned HexagonInstrInfo::reduceLoopCount(
701+
MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar,
702+
MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond,
703+
SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter,
704+
unsigned MaxIter) const {
705705
// We expect a hardware loop currently. This means that IndVar is set
706706
// to null, and the compare is the ENDLOOP instruction.
707707
assert((!IndVar) && isEndLoopN(Cmp.getOpcode())

lib/Target/Hexagon/HexagonInstrInfo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo {
139139
/// is finished. Return the value/register of the new loop count. We need
140140
/// this function when peeling off one or more iterations of a loop. This
141141
/// function assumes the nth iteration is peeled first.
142-
unsigned reduceLoopCount(MachineBasicBlock &MBB,
142+
unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader,
143143
MachineInstr *IndVar, MachineInstr &Cmp,
144144
SmallVectorImpl<MachineOperand> &Cond,
145145
SmallVectorImpl<MachineInstr *> &PrevInsts,

lib/Target/PowerPC/PPCInstrInfo.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3922,3 +3922,77 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
39223922
}
39233923
return false;
39243924
}
3925+
3926+
// Returns true when Opcode equals PPC::BDNZ8 on a PPC64 subtarget, or
// PPC::BDNZ otherwise. Only the variant matching the current subtarget is
// accepted.
bool PPCInstrInfo::isBDNZ(unsigned Opcode) const {
3927+
return (Opcode == (Subtarget.isPPC64() ? PPC::BDNZ8 : PPC::BDNZ));
3928+
}
3929+
3930+
// Inspects the first terminator of the bottom block of loop L.
// Note the inverted convention: returns false when the loop WAS analyzed
// (the terminator is a BDNZ for this subtarget) and true when it was not.
// On success IndVarInst is set to nullptr and CmpInst to the BDNZ
// instruction; on failure neither output is written.
bool PPCInstrInfo::analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
3931+
MachineInstr *&CmpInst) const {
3932+
MachineBasicBlock *LoopEnd = L.getBottomBlock();
3933+
MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator();
3934+
// We really "analyze" only CTR loops right now.
3935+
if (I != LoopEnd->end() && isBDNZ(I->getOpcode())) {
3936+
// CTR loops have no explicit induction-variable instruction to report.
IndVarInst = nullptr;
3937+
CmpInst = &*I;
3938+
return false;
3939+
}
3940+
// Anything else is reported as "unable to analyze".
return true;
3941+
}
3942+
3943+
// Scans PreHeader for the loop set-up instruction (MTCTR8loop on PPC64,
// MTCTRloop otherwise) and returns the first match, or nullptr if the block
// contains none. Only the given block is searched.
MachineInstr *
3944+
PPCInstrInfo::findLoopInstr(MachineBasicBlock &PreHeader) const {
3945+
3946+
// Opcode of the set-up instruction for the current subtarget.
unsigned LOOPi = (Subtarget.isPPC64() ? PPC::MTCTR8loop : PPC::MTCTRloop);
3947+
3948+
// The loop set-up instruction should be in the preheader.
3949+
for (auto &I : PreHeader.instrs())
3950+
if (I.getOpcode() == LOOPi)
3951+
return &I;
3952+
return nullptr;
3953+
}
3954+
3955+
// Reduces the trip count of a CTR loop by one.
//
// Return value:
//   * 0 when no loop set-up instruction is found in PreHeader, when the count
//     register has no unique virtual-register definition, or when the count
//     is an LI/LI8 immediate <= 1 (in which case both the LI and the set-up
//     instruction are erased).
//   * Offset - 1 when the count is an LI/LI8 immediate > 1; the immediate is
//     rewritten in place.
//   * The count register otherwise; Cond then receives an immediate 0
//     followed by the CTR/CTR8 register operand.
//
// MBB is used only to reach the parent MachineFunction.
// NOTE(review): PrevInsts, Iter and MaxIter are not referenced in this body.
unsigned PPCInstrInfo::reduceLoopCount(
3956+
MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar,
3957+
MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond,
3958+
SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter,
3959+
unsigned MaxIter) const {
3960+
// We expect a hardware (CTR) loop currently. This means that IndVar is set
3961+
// to null, and the compare is the BDNZ/BDNZ8 branch that ends the loop.
3962+
assert((!IndVar) && isBDNZ(Cmp.getOpcode()) && "Expecting a CTR loop");
3963+
MachineFunction *MF = MBB.getParent();
3964+
// NOTE(review): DL is not used anywhere below in this function.
DebugLoc DL = Cmp.getDebugLoc();
3965+
MachineInstr *Loop = findLoopInstr(PreHeader);
3966+
if (!Loop)
3967+
return 0;
3968+
// Operand 0 of the set-up instruction is the register holding the count.
unsigned LoopCountReg = Loop->getOperand(0).getReg();
3969+
MachineRegisterInfo &MRI = MF->getRegInfo();
3970+
MachineInstr *LoopCount = MRI.getUniqueVRegDef(LoopCountReg);
3971+
3972+
if (!LoopCount)
3973+
return 0;
3974+
// If the loop trip count is a compile-time value, then just change the
3975+
// value.
3976+
if (LoopCount->getOpcode() == PPC::LI8 || LoopCount->getOpcode() == PPC::LI) {
3977+
int64_t Offset = LoopCount->getOperand(1).getImm();
3978+
// A count of one or less leaves nothing to iterate after the reduction:
// drop both the constant and the loop set-up instruction.
if (Offset <= 1) {
3979+
LoopCount->eraseFromParent();
3980+
Loop->eraseFromParent();
3981+
return 0;
3982+
}
3983+
LoopCount->getOperand(1).setImm(Offset - 1);
3984+
return Offset - 1;
3985+
}
3986+
3987+
// The loop trip count is a run-time value.
3988+
// We need to subtract one from the trip count,
3989+
// and insert a branch later to check if we're done with the loop.
3990+
3991+
// The BDZ/BDZ8 that will be inserted later also decrements CTR by 1,
3992+
// so nothing needs to be generated here.
3993+
Cond.push_back(MachineOperand::CreateImm(0));
3994+
Cond.push_back(MachineOperand::CreateReg(
3995+
Subtarget.isPPC64() ? PPC::CTR8 : PPC::CTR, true));
3996+
return LoopCountReg;
3997+
}
3998+

lib/Target/PowerPC/PPCInstrInfo.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,34 @@ class PPCInstrInfo : public PPCGenInstrInfo {
457457
}
458458
return Reg;
459459
}
460+
461+
/// Check whether \p Opcode is BDNZ (Decrement CTR and branch if it is still nonzero).
462+
bool isBDNZ(unsigned Opcode) const;
463+
464+
/// Find the hardware loop instruction used to set-up the specified loop.
465+
/// On PPC, we have two instructions used to set-up the hardware loop
466+
/// (MTCTRloop, MTCTR8loop) with corresponding endloop (BDNZ, BDNZ8)
467+
/// instructions to indicate the end of a loop.
468+
MachineInstr *findLoopInstr(MachineBasicBlock &PreHeader) const;
469+
470+
/// Analyze the loop code to find the loop induction variable and compare used
471+
/// to compute the number of iterations. Currently, we analyze loops that are
472+
/// controlled using hardware loops. In this case, the induction variable
473+
/// instruction is null. For all other cases, this function returns true,
474+
/// which means we're unable to analyze it. \p IndVarInst and \p CmpInst will
475+
/// return new values when we can analyze the readonly loop \p L, otherwise,
476+
/// nothing is changed.
477+
bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
478+
MachineInstr *&CmpInst) const override;
479+
/// Generate code to reduce the loop iteration by one and check if the loop
480+
/// is finished. Return the value/register of the new loop count. We need
481+
/// this function when peeling off one or more iterations of a loop. This
482+
/// function assumes the last iteration is peeled first.
483+
unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader,
484+
MachineInstr *IndVar, MachineInstr &Cmp,
485+
SmallVectorImpl<MachineOperand> &Cond,
486+
SmallVectorImpl<MachineInstr *> &PrevInsts,
487+
unsigned Iter, unsigned MaxIter) const override;
460488
};
461489

462490
}

lib/Target/PowerPC/PPCSubtarget.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@ static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned",
3939
cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"),
4040
cl::Hidden);
4141

42+
// Hidden command-line flag -ppc-enable-pipeliner; initialised to false.
// Read by PPCSubtarget::enableMachinePipeliner().
static cl::opt<bool>
43+
EnableMachinePipeliner("ppc-enable-pipeliner",
44+
cl::desc("Enable Machine Pipeliner for PPC"),
45+
cl::init(false), cl::Hidden);
46+
4247
PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
4348
StringRef FS) {
4449
initializeEnvironment();
@@ -181,10 +186,14 @@ bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
181186
return false;
182187
}
183188

184-
bool PPCSubtarget::enableMachineScheduler() const {
185-
return true;
189+
// Unconditionally reports the machine scheduler as enabled for PPC.
bool PPCSubtarget::enableMachineScheduler() const { return true; }
190+
191+
// Reports the pipeliner as enabled only when the CPU directive is DIR_PWR9
// AND the -ppc-enable-pipeliner flag is set; either condition alone is not
// enough.
bool PPCSubtarget::enableMachinePipeliner() const {
192+
return (DarwinDirective == PPC::DIR_PWR9) && EnableMachinePipeliner;
186193
}
187194

195+
// Always returns false for PPC.
// NOTE(review): presumably this keeps the pipeliner from requiring
// instruction itineraries (see the useDFAforSMS() check in
// MachinePipeliner::runOnMachineFunction) — confirm against that code.
bool PPCSubtarget::useDFAforSMS() const { return false; }
196+
188197
// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
189198
bool PPCSubtarget::enablePostRAScheduler() const { return true; }
190199

lib/Target/PowerPC/PPCSubtarget.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -322,9 +322,13 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
322322
/// but may expand the ISEL instruction later.
323323
bool enableEarlyIfConversion() const override { return true; }
324324

325-
// Scheduling customization.
325+
/// Scheduling customization.
326326
bool enableMachineScheduler() const override;
327-
// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
327+
/// Pipeliner customization.
328+
bool enableMachinePipeliner() const override;
329+
/// Machine Pipeliner customization
330+
bool useDFAforSMS() const override;
331+
/// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
328332
bool enablePostRAScheduler() const override;
329333
AntiDepBreakMode getAntiDepBreakMode() const override;
330334
void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;

lib/Target/PowerPC/PPCTargetMachine.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,9 @@ void PPCPassConfig::addPreRegAlloc() {
488488
}
489489
if (EnableExtraTOCRegDeps)
490490
addPass(createPPCTOCRegDepsPass());
491+
492+
if (getOptLevel() != CodeGenOpt::None)
493+
addPass(&MachinePipelinerID);
491494
}
492495

493496
void PPCPassConfig::addPreSched2() {

test/CodeGen/PowerPC/sms-simple.ll

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu \
3+
; RUN: -verify-machineinstrs -ppc-asm-full-reg-names -mcpu=pwr9 --ppc-enable-pipeliner \
4+
; RUN: | FileCheck %s
5+
6+
@x = dso_local local_unnamed_addr global <{ i32, i32, i32, i32, [1020 x i32] }> <{ i32 1, i32 2, i32 3, i32 4, [1020 x i32] zeroinitializer }>, align 4
7+
@y = common dso_local global [1024 x i32] zeroinitializer, align 4
8+
9+
; Function Attrs: norecurse nounwind
10+
define dso_local i32* @foo() local_unnamed_addr #0 {
11+
; CHECK-LABEL: foo:
12+
; CHECK: # %bb.0: # %entry
13+
; CHECK-NEXT: addis r5, r2, x@toc@ha
14+
; CHECK-NEXT: addi r5, r5, x@toc@l
15+
; CHECK-NEXT: addis r6, r2, y@toc@ha
16+
; CHECK-NEXT: li r7, 340
17+
; CHECK-NEXT: addi r3, r6, y@toc@l
18+
; CHECK-NEXT: lwz r6, y@toc@l(r6)
19+
; CHECK-NEXT: mtctr r7
20+
; CHECK-NEXT: addi r5, r5, -8
21+
; CHECK-NEXT: lwzu r7, 12(r5)
22+
; CHECK-NEXT: maddld r6, r7, r7, r6
23+
; CHECK-NEXT: lwz r7, 4(r5)
24+
; CHECK-NEXT: addi r4, r3, -8
25+
; CHECK-NEXT: stwu r6, 12(r4)
26+
; CHECK-NEXT: maddld r6, r7, r7, r6
27+
; CHECK-NEXT: lwz r7, 8(r5)
28+
; CHECK-NEXT: .p2align 4
29+
; CHECK-NEXT: .LBB0_1: # %for.body
30+
; CHECK: maddld r7, r7, r7, r6
31+
; CHECK-NEXT: lwzu r8, 12(r5)
32+
; CHECK-NEXT: maddld r8, r8, r8, r7
33+
; CHECK-NEXT: stw r6, 4(r4)
34+
; CHECK-NEXT: lwz r6, 4(r5)
35+
; CHECK-NEXT: maddld r6, r6, r6, r8
36+
; CHECK-NEXT: stw r7, 8(r4)
37+
; CHECK-NEXT: lwz r7, 8(r5)
38+
; CHECK-NEXT: stwu r8, 12(r4)
39+
; CHECK-NEXT: bdnz .LBB0_1
40+
; CHECK-NEXT: # %bb.2:
41+
; CHECK-NEXT: maddld r5, r7, r7, r6
42+
; CHECK-NEXT: stw r6, 4(r4)
43+
; CHECK-NEXT: stw r5, 8(r4)
44+
; CHECK-NEXT: blr
45+
entry:
46+
%.pre = load i32, i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0), align 4
47+
br label %for.body
48+
49+
for.cond.cleanup: ; preds = %for.body
50+
ret i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0)
51+
52+
for.body: ; preds = %for.body, %entry
53+
%0 = phi i32 [ %.pre, %entry ], [ %add.2, %for.body ]
54+
%indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next.2, %for.body ]
55+
%arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv
56+
%1 = load i32, i32* %arrayidx2, align 4
57+
%mul = mul nsw i32 %1, %1
58+
%add = add nsw i32 %mul, %0
59+
%arrayidx6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv
60+
store i32 %add, i32* %arrayidx6, align 4
61+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
62+
%arrayidx2.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next
63+
%2 = load i32, i32* %arrayidx2.1, align 4
64+
%mul.1 = mul nsw i32 %2, %2
65+
%add.1 = add nsw i32 %mul.1, %add
66+
%arrayidx6.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next
67+
store i32 %add.1, i32* %arrayidx6.1, align 4
68+
%indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2
69+
%arrayidx2.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next.1
70+
%3 = load i32, i32* %arrayidx2.2, align 4
71+
%mul.2 = mul nsw i32 %3, %3
72+
%add.2 = add nsw i32 %mul.2, %add.1
73+
%arrayidx6.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next.1
74+
store i32 %add.2, i32* %arrayidx6.2, align 4
75+
%indvars.iv.next.2 = add nuw nsw i64 %indvars.iv, 3
76+
%exitcond.2 = icmp eq i64 %indvars.iv.next.2, 1024
77+
br i1 %exitcond.2, label %for.cond.cleanup, label %for.body
78+
}

0 commit comments

Comments
 (0)