Skip to content

Commit 8089bd4

Browse files
committed
[RISCV] Add GPR bypasses for most scalar integer instructions to the SiFive7 scheduler model.
SiFive7's scalar execution consists of 4 stages: AG, M1, M2, and WB. Most simple arithmetic and branch instructions can execute in either AG or M2. If the operands are ready, the instruction will execute in the AG stage. Otherwise, it executes in the M2 stage. Everything is fully bypassed, so dependent instructions should only see 1 cycle of latency. This patch adds ReadAdvances to pretend that these instructions execute in the M2 ALU and read their operands then. This allows the scheduler to schedule dependent instructions back to back. I've increased branch latency to 3 since branches can also execute in either stage. Still need to fix JALR, but I want to clean up some scheduler classes first. Multiply, cpop and division instructions can only start in the AG stage, so their operand reads keep a ReadAdvance of 0. Still need to do some work for FP instructions that produce integer results. I've added an llvm-mca test that creates a long dependency chain. The timeline view can show that things are bypassed. I didn't check all permutations, but we have some variety. Reviewed By: wangpc Differential Revision: https://reviews.llvm.org/D153666
1 parent 6fcc562 commit 8089bd4

File tree

3 files changed

+698
-21
lines changed

3 files changed

+698
-21
lines changed

llvm/lib/Target/RISCV/RISCVSchedSiFive7.td

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,20 @@ class SiFive7GetOrderedReductionCycles<string mx, int sew> {
198198
int c = !mul(5, VLUpperBound);
199199
}
200200

201+
class SiFive7AnyToGPRBypass<SchedRead read, int cycles = 2>
202+
: ReadAdvance<read, cycles, [WriteIALU, WriteIALU32,
203+
WriteShiftImm, WriteShiftImm32,
204+
WriteShiftReg, WriteShiftReg32,
205+
WriteSHXADD, WriteSHXADD32,
206+
WriteRotateImm, WriteRotateImm32,
207+
WriteRotateReg, WriteRotateReg32,
208+
WriteCLZ, WriteCLZ32, WriteCTZ, WriteCTZ32,
209+
WriteCPOP, WriteCPOP32,
210+
WriteREV8, WriteORCB, WriteSFB,
211+
WriteIMul, WriteIMul32,
212+
WriteIDiv, WriteIDiv32,
213+
WriteLDB, WriteLDH, WriteLDW, WriteLDD]>;
214+
201215
// SiFive7 machine model for scheduling and other instruction cost heuristics.
202216
def SiFive7Model : SchedMachineModel {
203217
let MicroOpBufferSize = 0; // Explicitly set to zero since SiFive7 is in-order.
@@ -234,10 +248,12 @@ def SiFive7VS : ProcResource<1> { let Super = SiFive7PipeV; } // Store
234248
def SiFive7PipeAB : ProcResGroup<[SiFive7PipeA, SiFive7PipeB]>;
235249

236250
// Branching
251+
let Latency = 3 in {
237252
def : WriteRes<WriteJmp, [SiFive7PipeB]>;
238253
def : WriteRes<WriteJal, [SiFive7PipeB]>;
239254
def : WriteRes<WriteJalr, [SiFive7PipeB]>;
240255
def : WriteRes<WriteJmpReg, [SiFive7PipeB]>;
256+
}
241257

242258
//Short forward branch
243259
def : WriteRes<WriteSFB, [SiFive7PipeA, SiFive7PipeB]> {
@@ -896,18 +912,19 @@ let Latency = 3 in
896912
def : InstRW<[WriteIALU], (instrs COPY)>;
897913

898914
//===----------------------------------------------------------------------===//
915+
899916
// Bypass and advance
900-
def : ReadAdvance<ReadJmp, 0>;
917+
def : SiFive7AnyToGPRBypass<ReadJmp>;
901918
def : ReadAdvance<ReadJalr, 0>;
902919
def : ReadAdvance<ReadCSR, 0>;
903920
def : ReadAdvance<ReadStoreData, 0>;
904921
def : ReadAdvance<ReadMemBase, 0>;
905-
def : ReadAdvance<ReadIALU, 0>;
906-
def : ReadAdvance<ReadIALU32, 0>;
907-
def : ReadAdvance<ReadShiftImm, 0>;
908-
def : ReadAdvance<ReadShiftImm32, 0>;
909-
def : ReadAdvance<ReadShiftReg, 0>;
910-
def : ReadAdvance<ReadShiftReg32, 0>;
922+
def : SiFive7AnyToGPRBypass<ReadIALU>;
923+
def : SiFive7AnyToGPRBypass<ReadIALU32>;
924+
def : SiFive7AnyToGPRBypass<ReadShiftImm>;
925+
def : SiFive7AnyToGPRBypass<ReadShiftImm32>;
926+
def : SiFive7AnyToGPRBypass<ReadShiftReg>;
927+
def : SiFive7AnyToGPRBypass<ReadShiftReg32>;
911928
def : ReadAdvance<ReadIDiv, 0>;
912929
def : ReadAdvance<ReadIDiv32, 0>;
913930
def : ReadAdvance<ReadIMul, 0>;
@@ -974,24 +991,24 @@ def : ReadAdvance<ReadFClass16, 0>;
974991
def : ReadAdvance<ReadFClass32, 0>;
975992
def : ReadAdvance<ReadFClass64, 0>;
976993

977-
def : ReadAdvance<ReadSFBJmp, 0>;
978-
def : ReadAdvance<ReadSFBALU, 0>;
994+
def : SiFive7AnyToGPRBypass<ReadSFBJmp, 0>;
995+
def : SiFive7AnyToGPRBypass<ReadSFBALU, 0>;
979996

980997
// Bitmanip
981-
def : ReadAdvance<ReadRotateImm, 0>;
982-
def : ReadAdvance<ReadRotateImm32, 0>;
983-
def : ReadAdvance<ReadRotateReg, 0>;
984-
def : ReadAdvance<ReadRotateReg32, 0>;
985-
def : ReadAdvance<ReadCLZ, 0>;
986-
def : ReadAdvance<ReadCLZ32, 0>;
987-
def : ReadAdvance<ReadCTZ, 0>;
988-
def : ReadAdvance<ReadCTZ32, 0>;
998+
def : SiFive7AnyToGPRBypass<ReadRotateImm>;
999+
def : SiFive7AnyToGPRBypass<ReadRotateImm32>;
1000+
def : SiFive7AnyToGPRBypass<ReadRotateReg>;
1001+
def : SiFive7AnyToGPRBypass<ReadRotateReg32>;
1002+
def : SiFive7AnyToGPRBypass<ReadCLZ>;
1003+
def : SiFive7AnyToGPRBypass<ReadCLZ32>;
1004+
def : SiFive7AnyToGPRBypass<ReadCTZ>;
1005+
def : SiFive7AnyToGPRBypass<ReadCTZ32>;
9891006
def : ReadAdvance<ReadCPOP, 0>;
9901007
def : ReadAdvance<ReadCPOP32, 0>;
991-
def : ReadAdvance<ReadORCB, 0>;
992-
def : ReadAdvance<ReadREV8, 0>;
993-
def : ReadAdvance<ReadSHXADD, 0>;
994-
def : ReadAdvance<ReadSHXADD32, 0>;
1008+
def : SiFive7AnyToGPRBypass<ReadORCB>;
1009+
def : SiFive7AnyToGPRBypass<ReadREV8>;
1010+
def : SiFive7AnyToGPRBypass<ReadSHXADD>;
1011+
def : SiFive7AnyToGPRBypass<ReadSHXADD32>;
9951012

9961013
// 6. Configuration-Setting Instructions
9971014
def : ReadAdvance<ReadVSETVLI, 2>;
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-u74 -mattr=+c -timeline \
3+
# RUN: -timeline-max-cycles=1000 -iterations=1 < %s | FileCheck %s
4+
5+
c.lui a0, 1
6+
c.add a0, a0
7+
c.addi a0, 1
8+
c.addw a0, a0
9+
c.addiw a0, 1
10+
c.sub a0, a0
11+
c.subw a0, a0
12+
c.and a0, a0
13+
c.andi a0, 1
14+
c.or a0, a0
15+
c.xor a0, a0
16+
c.slli a0, 1
17+
c.srli a0, 1
18+
c.srai a0, 1
19+
c.add a0, a0
20+
beqz a0, 1f
21+
1:
22+
c.add a0, a0
23+
bnez a0, 1f
24+
1:
25+
26+
# CHECK: Iterations: 1
27+
# CHECK-NEXT: Instructions: 18
28+
# CHECK-NEXT: Total Cycles: 24
29+
# CHECK-NEXT: Total uOps: 18
30+
31+
# CHECK: Dispatch Width: 2
32+
# CHECK-NEXT: uOps Per Cycle: 0.75
33+
# CHECK-NEXT: IPC: 0.75
34+
# CHECK-NEXT: Block RThroughput: 9.0
35+
36+
# CHECK: Instruction Info:
37+
# CHECK-NEXT: [1]: #uOps
38+
# CHECK-NEXT: [2]: Latency
39+
# CHECK-NEXT: [3]: RThroughput
40+
# CHECK-NEXT: [4]: MayLoad
41+
# CHECK-NEXT: [5]: MayStore
42+
# CHECK-NEXT: [6]: HasSideEffects (U)
43+
44+
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
45+
# CHECK-NEXT: 1 3 0.50 lui a0, 1
46+
# CHECK-NEXT: 1 3 0.50 add a0, a0, a0
47+
# CHECK-NEXT: 1 3 0.50 addi a0, a0, 1
48+
# CHECK-NEXT: 1 3 0.50 addw a0, a0, a0
49+
# CHECK-NEXT: 1 3 0.50 addiw a0, a0, 1
50+
# CHECK-NEXT: 1 3 0.50 sub a0, a0, a0
51+
# CHECK-NEXT: 1 3 0.50 subw a0, a0, a0
52+
# CHECK-NEXT: 1 3 0.50 and a0, a0, a0
53+
# CHECK-NEXT: 1 3 0.50 andi a0, a0, 1
54+
# CHECK-NEXT: 1 3 0.50 or a0, a0, a0
55+
# CHECK-NEXT: 1 3 0.50 xor a0, a0, a0
56+
# CHECK-NEXT: 1 3 0.50 slli a0, a0, 1
57+
# CHECK-NEXT: 1 3 0.50 srli a0, a0, 1
58+
# CHECK-NEXT: 1 3 0.50 srai a0, a0, 1
59+
# CHECK-NEXT: 1 3 0.50 add a0, a0, a0
60+
# CHECK-NEXT: 1 3 1.00 beqz a0, .Ltmp0
61+
# CHECK-NEXT: 1 3 0.50 add a0, a0, a0
62+
# CHECK-NEXT: 1 3 1.00 bnez a0, .Ltmp1
63+
64+
# CHECK: Resources:
65+
# CHECK-NEXT: [0] - SiFive7FDiv
66+
# CHECK-NEXT: [1] - SiFive7IDiv
67+
# CHECK-NEXT: [2] - SiFive7PipeA
68+
# CHECK-NEXT: [3] - SiFive7PipeB
69+
# CHECK-NEXT: [4] - SiFive7PipeV
70+
# CHECK-NEXT: [5] - SiFive7VA
71+
# CHECK-NEXT: [6] - SiFive7VL
72+
# CHECK-NEXT: [7] - SiFive7VS
73+
74+
# CHECK: Resource pressure per iteration:
75+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7]
76+
# CHECK-NEXT: - - 8.00 10.00 - - - -
77+
78+
# CHECK: Resource pressure by instruction:
79+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] Instructions:
80+
# CHECK-NEXT: - - - 1.00 - - - - lui a0, 1
81+
# CHECK-NEXT: - - 1.00 - - - - - add a0, a0, a0
82+
# CHECK-NEXT: - - - 1.00 - - - - addi a0, a0, 1
83+
# CHECK-NEXT: - - 1.00 - - - - - addw a0, a0, a0
84+
# CHECK-NEXT: - - - 1.00 - - - - addiw a0, a0, 1
85+
# CHECK-NEXT: - - 1.00 - - - - - sub a0, a0, a0
86+
# CHECK-NEXT: - - - 1.00 - - - - subw a0, a0, a0
87+
# CHECK-NEXT: - - 1.00 - - - - - and a0, a0, a0
88+
# CHECK-NEXT: - - - 1.00 - - - - andi a0, a0, 1
89+
# CHECK-NEXT: - - 1.00 - - - - - or a0, a0, a0
90+
# CHECK-NEXT: - - - 1.00 - - - - xor a0, a0, a0
91+
# CHECK-NEXT: - - 1.00 - - - - - slli a0, a0, 1
92+
# CHECK-NEXT: - - - 1.00 - - - - srli a0, a0, 1
93+
# CHECK-NEXT: - - 1.00 - - - - - srai a0, a0, 1
94+
# CHECK-NEXT: - - - 1.00 - - - - add a0, a0, a0
95+
# CHECK-NEXT: - - - 1.00 - - - - beqz a0, .Ltmp0
96+
# CHECK-NEXT: - - 1.00 - - - - - add a0, a0, a0
97+
# CHECK-NEXT: - - - 1.00 - - - - bnez a0, .Ltmp1
98+
99+
# CHECK: Timeline view:
100+
# CHECK-NEXT: 0123456789
101+
# CHECK-NEXT: Index 0123456789 0123
102+
103+
# CHECK: [0,0] DeeE . . . . . lui a0, 1
104+
# CHECK-NEXT: [0,1] .DeeE. . . . . add a0, a0, a0
105+
# CHECK-NEXT: [0,2] . DeeE . . . . addi a0, a0, 1
106+
# CHECK-NEXT: [0,3] . DeeE . . . . addw a0, a0, a0
107+
# CHECK-NEXT: [0,4] . DeeE . . . . addiw a0, a0, 1
108+
# CHECK-NEXT: [0,5] . DeeE . . . . sub a0, a0, a0
109+
# CHECK-NEXT: [0,6] . .DeeE. . . . subw a0, a0, a0
110+
# CHECK-NEXT: [0,7] . . DeeE . . . and a0, a0, a0
111+
# CHECK-NEXT: [0,8] . . DeeE . . . andi a0, a0, 1
112+
# CHECK-NEXT: [0,9] . . DeeE . . . or a0, a0, a0
113+
# CHECK-NEXT: [0,10] . . DeeE . . . xor a0, a0, a0
114+
# CHECK-NEXT: [0,11] . . .DeeE. . . slli a0, a0, 1
115+
# CHECK-NEXT: [0,12] . . . DeeE . . srli a0, a0, 1
116+
# CHECK-NEXT: [0,13] . . . DeeE . . srai a0, a0, 1
117+
# CHECK-NEXT: [0,14] . . . DeeE . . add a0, a0, a0
118+
# CHECK-NEXT: [0,15] . . . . DeeE . beqz a0, .Ltmp0
119+
# CHECK-NEXT: [0,16] . . . . DeeE . add a0, a0, a0
120+
# CHECK-NEXT: [0,17] . . . . DeeE bnez a0, .Ltmp1
121+
122+
# CHECK: Average Wait times (based on the timeline view):
123+
# CHECK-NEXT: [0]: Executions
124+
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
125+
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
126+
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
127+
128+
# CHECK: [0] [1] [2] [3]
129+
# CHECK-NEXT: 0. 1 0.0 0.0 0.0 lui a0, 1
130+
# CHECK-NEXT: 1. 1 0.0 0.0 0.0 add a0, a0, a0
131+
# CHECK-NEXT: 2. 1 0.0 0.0 0.0 addi a0, a0, 1
132+
# CHECK-NEXT: 3. 1 0.0 0.0 0.0 addw a0, a0, a0
133+
# CHECK-NEXT: 4. 1 0.0 0.0 0.0 addiw a0, a0, 1
134+
# CHECK-NEXT: 5. 1 0.0 0.0 0.0 sub a0, a0, a0
135+
# CHECK-NEXT: 6. 1 0.0 0.0 0.0 subw a0, a0, a0
136+
# CHECK-NEXT: 7. 1 0.0 0.0 0.0 and a0, a0, a0
137+
# CHECK-NEXT: 8. 1 0.0 0.0 0.0 andi a0, a0, 1
138+
# CHECK-NEXT: 9. 1 0.0 0.0 0.0 or a0, a0, a0
139+
# CHECK-NEXT: 10. 1 0.0 0.0 0.0 xor a0, a0, a0
140+
# CHECK-NEXT: 11. 1 0.0 0.0 0.0 slli a0, a0, 1
141+
# CHECK-NEXT: 12. 1 0.0 0.0 0.0 srli a0, a0, 1
142+
# CHECK-NEXT: 13. 1 0.0 0.0 0.0 srai a0, a0, 1
143+
# CHECK-NEXT: 14. 1 0.0 0.0 0.0 add a0, a0, a0
144+
# CHECK-NEXT: 15. 1 0.0 0.0 0.0 beqz a0, .Ltmp0
145+
# CHECK-NEXT: 16. 1 0.0 0.0 0.0 add a0, a0, a0
146+
# CHECK-NEXT: 17. 1 0.0 0.0 0.0 bnez a0, .Ltmp1
147+
# CHECK-NEXT: 1 0.0 0.0 0.0 <total>

0 commit comments

Comments
 (0)