Skip to content
This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 6c4ec69

Browse files
author
Chad Rosier
committed
[ARM64] Ports the Cortex-A53 Machine Model description from AArch64.
Summary: This port includes the rudimentary latencies that were provided for the Cortex-A53 Machine Model in the AArch64 backend. It also changes the SchedAlias for COPY in the Cyclone model to an explicit WriteRes mapping to avoid conflicts in other subtargets. Differential Revision: http://reviews.llvm.org/D3427 Patch by Dave Estes <[email protected]>! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@206652 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent ed56048 commit 6c4ec69

File tree

4 files changed

+247
-4
lines changed

4 files changed

+247
-4
lines changed

lib/Target/ARM64/ARM64.td

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ include "llvm/Target/Target.td"
2121
//
2222

2323
def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
24-
"Enable ARMv8 FP">;
24+
"Enable ARMv8 FP">;
2525

2626
def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
2727
"Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
@@ -56,6 +56,7 @@ def ARM64InstrInfo : InstrInfo;
5656
//===----------------------------------------------------------------------===//
5757
// ARM64 Processors supported.
5858
//
59+
include "ARM64SchedA53.td"
5960
include "ARM64SchedCyclone.td"
6061

6162
def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
@@ -79,9 +80,8 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
7980

8081
def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, FeatureNEON]>;
8182

82-
def : ProcessorModel<"cortex-a53", NoSchedModel, [ProcA53]>;
83+
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
8384
def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>;
84-
8585
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
8686

8787
//===----------------------------------------------------------------------===//

lib/Target/ARM64/ARM64SchedA53.td

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
//=- ARM64SchedA53.td - ARM Cortex-A53 Scheduling Definitions -*- tablegen -*-=//
2+
//
3+
// The LLVM Compiler Infrastructure
4+
//
5+
// This file is distributed under the University of Illinois Open Source
6+
// License. See LICENSE.TXT for details.
7+
//
8+
//===----------------------------------------------------------------------===//
9+
//
10+
// This file defines the machine model for the ARM Cortex A53 processors.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
// ===---------------------------------------------------------------------===//
15+
// The following definitions describe the simpler per-operand machine model.
16+
// This works with MachineScheduler. See MCSchedModel.h for details.
17+
18+
// Cortex-A53 machine model for scheduling and other instruction cost heuristics.
19+
def CortexA53Model : SchedMachineModel {
20+
let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
21+
let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
22+
let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency.
23+
let LoadLatency = 2; // Optimistic load latency assuming bypass.
24+
// This is overridden by OperandCycles if the
25+
// Itineraries are queried instead.
26+
let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
27+
// Specification - Instruction Timings"
28+
// v 1.0 Spreadsheet
29+
}
30+
31+
32+
//===----------------------------------------------------------------------===//
33+
// Define each kind of processor resource and number available.
34+
35+
// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
36+
// Cortex-A53 is in-order.
37+
38+
def A53UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU
39+
def A53UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC
40+
def A53UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division
41+
def A53UnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store
42+
def A53UnitB : ProcResource<1> { let BufferSize = 0; } // Branch
43+
def A53UnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU
44+
def A53UnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt
45+
46+
47+
//===----------------------------------------------------------------------===//
48+
// Subtarget-specific SchedWrite types which both map the ProcResources and
49+
// set the latency.
50+
51+
let SchedModel = CortexA53Model in {
52+
53+
// ALU - These are reduced to 1 despite a true latency of 4 in order to easily
54+
// model forwarding logic. Once forwarding is properly modelled, then
55+
// they'll be corrected.
56+
def : WriteRes<WriteImm, [A53UnitALU]> { let Latency = 1; }
57+
def : WriteRes<WriteI, [A53UnitALU]> { let Latency = 1; }
58+
def : WriteRes<WriteISReg, [A53UnitALU]> { let Latency = 1; }
59+
def : WriteRes<WriteIEReg, [A53UnitALU]> { let Latency = 1; }
60+
def : WriteRes<WriteExtr, [A53UnitALU]> { let Latency = 1; }
61+
def : WriteRes<WriteIS, [A53UnitALU]> { let Latency = 1; }
62+
def : WriteRes<WriteAdr, [A53UnitALU]> { let Latency = 1; }
63+
64+
// MAC
65+
def : WriteRes<WriteIM32, [A53UnitMAC]> { let Latency = 4; }
66+
def : WriteRes<WriteIM64, [A53UnitMAC]> { let Latency = 4; }
67+
68+
// Div
69+
def : WriteRes<WriteID32, [A53UnitDiv]> { let Latency = 4; }
70+
def : WriteRes<WriteID64, [A53UnitDiv]> { let Latency = 4; }
71+
72+
// Load
73+
def : WriteRes<WriteLD, [A53UnitLdSt]> { let Latency = 4; }
74+
def : WriteRes<WriteLDIdx, [A53UnitLdSt]> { let Latency = 4; }
75+
def : WriteRes<WriteLDHi, [A53UnitLdSt]> { let Latency = 4; }
76+
def : WriteRes<WriteVLD, [A53UnitLdSt]> { let Latency = 4; }
77+
78+
// Store
79+
def : WriteRes<WriteST, [A53UnitLdSt]> { let Latency = 4; }
80+
def : WriteRes<WriteSTP, [A53UnitLdSt]> { let Latency = 4; }
81+
def : WriteRes<WriteSTIdx, [A53UnitLdSt]> { let Latency = 4; }
82+
def : WriteRes<WriteSTX, [A53UnitLdSt]> { let Latency = 4; }
83+
def : WriteRes<WriteVST, [A53UnitLdSt]> { let Latency = 4; }
84+
85+
// Branch
86+
def : WriteRes<WriteBr, [A53UnitB]>;
87+
def : WriteRes<WriteBrReg, [A53UnitB]>;
88+
def : WriteRes<WriteSys, [A53UnitB]>;
89+
def : WriteRes<WriteBarrier, [A53UnitB]>;
90+
def : WriteRes<WriteHint, [A53UnitB]>;
91+
92+
// FP ALU
93+
def : WriteRes<WriteF, [A53UnitFPALU]> { let Latency = 6; }
94+
def : WriteRes<WriteFCmp, [A53UnitFPALU]> { let Latency = 6; }
95+
def : WriteRes<WriteFCvt, [A53UnitFPALU]> { let Latency = 6; }
96+
def : WriteRes<WriteFCopy, [A53UnitFPALU]> { let Latency = 6; }
97+
def : WriteRes<WriteFImm, [A53UnitFPALU]> { let Latency = 6; }
98+
def : WriteRes<WriteV, [A53UnitFPALU]> { let Latency = 6; }
99+
100+
// FP Mul, Div, Sqrt
101+
def : WriteRes<WriteFMul, [A53UnitFPMDS]> { let Latency = 6; }
102+
def : WriteRes<WriteFDiv, [A53UnitFPMDS]> { let Latency = 33;
103+
let ResourceCycles = [29]; }
104+
def A53WriteFDiv : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33;
105+
let ResourceCycles = [29]; }
106+
def A53WriteFSqrt : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
107+
let ResourceCycles = [28]; }
108+
109+
//===----------------------------------------------------------------------===//
110+
// Subtarget-specific SchedRead types.
111+
112+
// While there is no forwarding information defined for these SchedRead types,
113+
// they are still used by some instructions via a SchedRW list and so these zero
114+
// SchedReadAdvances are required.
115+
116+
def : ReadAdvance<ReadExtrHi, 0>;
117+
def : ReadAdvance<ReadAdrBase, 0>;
118+
def : ReadAdvance<ReadVLD, 0>;
119+
120+
//===----------------------------------------------------------------------===//
121+
// Subtarget-specific InstRWs.
122+
123+
def : InstRW<[WriteI], (instrs COPY)>;
124+
def : InstRW<[WriteLD], (instregex "LD[1-4]")>;
125+
def : InstRW<[WriteST], (instregex "ST[1-4]")>;
126+
def : InstRW<[A53WriteFDiv], (instregex "^FDIV")>;
127+
def : InstRW<[A53WriteFSqrt], (instregex ".*SQRT.*")>;
128+
129+
}

lib/Target/ARM64/ARM64SchedCyclone.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,9 @@ def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
342342
// INS V[x],V[y] is a WriteV.
343343

344344
// FMOVWSr,FMOVXDr,FMOVXDHighr
345-
def : SchedAlias<WriteFCopy, WriteVLD>;
345+
def : WriteRes<WriteFCopy, [CyUnitLS]> {
346+
let Latency = 5;
347+
}
346348

347349
// FMOVSWr,FMOVDXr
348350
def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
; REQUIRES: asserts
2+
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
3+
;
4+
; The Cortex-A53 machine model will cause the MADD instruction to be scheduled
5+
; much higher than the ADD instructions in order to hide latency. When not
6+
; specifying a subtarget, the MADD will remain near the end of the block.
7+
;
8+
; CHECK: ********** MI Scheduling **********
9+
; CHECK: main
10+
; CHECK: *** Final schedule for BB#2 ***
11+
; CHECK: SU(13)
12+
; CHECK: MADDWrrr
13+
; CHECK: SU(4)
14+
; CHECK: ADDWri
15+
; CHECK: ********** INTERVALS **********
16+
@main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4
17+
@main.y = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2], align 4
18+
19+
; Function Attrs: nounwind
20+
define i32 @main() #0 {
21+
entry:
22+
%retval = alloca i32, align 4
23+
%x = alloca [8 x i32], align 4
24+
%y = alloca [8 x i32], align 4
25+
%i = alloca i32, align 4
26+
%xx = alloca i32, align 4
27+
%yy = alloca i32, align 4
28+
store i32 0, i32* %retval
29+
%0 = bitcast [8 x i32]* %x to i8*
30+
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([8 x i32]* @main.x to i8*), i64 32, i32 4, i1 false)
31+
%1 = bitcast [8 x i32]* %y to i8*
32+
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ([8 x i32]* @main.y to i8*), i64 32, i32 4, i1 false)
33+
store i32 0, i32* %xx, align 4
34+
store i32 0, i32* %yy, align 4
35+
store i32 0, i32* %i, align 4
36+
br label %for.cond
37+
38+
for.cond: ; preds = %for.inc, %entry
39+
%2 = load i32* %i, align 4
40+
%cmp = icmp slt i32 %2, 8
41+
br i1 %cmp, label %for.body, label %for.end
42+
43+
for.body: ; preds = %for.cond
44+
%3 = load i32* %i, align 4
45+
%idxprom = sext i32 %3 to i64
46+
%arrayidx = getelementptr inbounds [8 x i32]* %x, i32 0, i64 %idxprom
47+
%4 = load i32* %arrayidx, align 4
48+
%add = add nsw i32 %4, 1
49+
store i32 %add, i32* %xx, align 4
50+
%5 = load i32* %xx, align 4
51+
%add1 = add nsw i32 %5, 12
52+
store i32 %add1, i32* %xx, align 4
53+
%6 = load i32* %xx, align 4
54+
%add2 = add nsw i32 %6, 23
55+
store i32 %add2, i32* %xx, align 4
56+
%7 = load i32* %xx, align 4
57+
%add3 = add nsw i32 %7, 34
58+
store i32 %add3, i32* %xx, align 4
59+
%8 = load i32* %i, align 4
60+
%idxprom4 = sext i32 %8 to i64
61+
%arrayidx5 = getelementptr inbounds [8 x i32]* %y, i32 0, i64 %idxprom4
62+
%9 = load i32* %arrayidx5, align 4
63+
%10 = load i32* %yy, align 4
64+
%mul = mul nsw i32 %10, %9
65+
store i32 %mul, i32* %yy, align 4
66+
br label %for.inc
67+
68+
for.inc: ; preds = %for.body
69+
%11 = load i32* %i, align 4
70+
%inc = add nsw i32 %11, 1
71+
store i32 %inc, i32* %i, align 4
72+
br label %for.cond
73+
74+
for.end: ; preds = %for.cond
75+
%12 = load i32* %xx, align 4
76+
%13 = load i32* %yy, align 4
77+
%add6 = add nsw i32 %12, %13
78+
ret i32 %add6
79+
}
80+
81+
82+
; The Cortex-A53 machine model will cause the FDIVv4f32 to be raised to
83+
; hide latency. Whereas normally there would only be a single FADDv4f32
84+
; after it, this test checks to make sure there is more than one.
85+
;
86+
; CHECK: ********** MI Scheduling **********
87+
; CHECK: neon4xfloat:BB#0
88+
; CHECK: *** Final schedule for BB#0 ***
89+
; CHECK: FDIVv4f32
90+
; CHECK: FADDv4f32
91+
; CHECK: FADDv4f32
92+
; CHECK: ********** INTERVALS **********
93+
define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) {
94+
%tmp1 = fadd <4 x float> %A, %B;
95+
%tmp2 = fadd <4 x float> %A, %tmp1;
96+
%tmp3 = fadd <4 x float> %A, %tmp2;
97+
%tmp4 = fadd <4 x float> %A, %tmp3;
98+
%tmp5 = fadd <4 x float> %A, %tmp4;
99+
%tmp6 = fadd <4 x float> %A, %tmp5;
100+
%tmp7 = fadd <4 x float> %A, %tmp6;
101+
%tmp8 = fadd <4 x float> %A, %tmp7;
102+
%tmp9 = fdiv <4 x float> %A, %B;
103+
%tmp10 = fadd <4 x float> %tmp8, %tmp9;
104+
105+
ret <4 x float> %tmp10
106+
}
107+
108+
; Function Attrs: nounwind
109+
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
110+
111+
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
112+
attributes #1 = { nounwind }

0 commit comments

Comments
 (0)