Skip to content

Commit f08a2fc

Browse files
committed
[MCA] Add tests for IPC on Cortex-A55
The tests compare IPC statistics that MCA provides with IPC values measured on Cortex-A55 hardware. For hardware tests, each snippet is run in a loop unrolled by 1000, and IPC is measured by linux-perf. Several tests do not match the hardware: the skewed ALU is not supported, and LDR seems to be missing a forwarding path. Differential Revision: https://reviews.llvm.org/D98174
1 parent 3f6753e commit f08a2fc

File tree

12 files changed

+216
-0
lines changed

12 files changed

+216
-0
lines changed
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
3+
4+
add w8, w8, #1
5+
6+
# CHECK: Iterations: 1000
7+
# CHECK-NEXT: Instructions: 1000
8+
# CHECK-NEXT: Total Cycles: 1003
9+
# CHECK-NEXT: Total uOps: 1000
10+
11+
# CHECK: Dispatch Width: 2
12+
# CHECK-NEXT: uOps Per Cycle: 1.00
13+
# CHECK-NEXT: IPC: 1.00
14+
# CHECK-NEXT: Block RThroughput: 0.5
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
3+
4+
add w8, w8, #1
5+
add w9, w9, #1
6+
7+
# CHECK: Iterations: 1000
8+
# CHECK-NEXT: Instructions: 2000
9+
# CHECK-NEXT: Total Cycles: 1003
10+
# CHECK-NEXT: Total uOps: 2000
11+
12+
# CHECK: Dispatch Width: 2
13+
# CHECK-NEXT: uOps Per Cycle: 1.99
14+
# CHECK-NEXT: IPC: 1.99
15+
# CHECK-NEXT: Block RThroughput: 1.0
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
3+
4+
fmadd s3, s5, s6, s7
5+
fmadd s8, s9, s10, s11
6+
7+
# CHECK: Iterations: 1000
8+
# CHECK-NEXT: Instructions: 2000
9+
# CHECK-NEXT: Total Cycles: 1004
10+
# CHECK-NEXT: Total uOps: 2000
11+
12+
# CHECK: Dispatch Width: 2
13+
# CHECK-NEXT: uOps Per Cycle: 1.99
14+
# CHECK-NEXT: IPC: 1.99
15+
# CHECK-NEXT: Block RThroughput: 1.0
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
3+
4+
# FMADD writes and retires out-of-order
5+
fmadd s3, s5, s6, s7
6+
# ADD instructions are issued and retire in-order
7+
add w8, w8, #1
8+
add w9, w9, #1
9+
add w10, w10, #1
10+
11+
# CHECK: Iterations: 1000
12+
# CHECK-NEXT: Instructions: 4000
13+
# CHECK-NEXT: Total Cycles: 2003
14+
# CHECK-NEXT: Total uOps: 4000
15+
16+
# CHECK: Dispatch Width: 2
17+
# CHECK-NEXT: uOps Per Cycle: 2.00
18+
# CHECK-NEXT: IPC: 2.00
19+
# CHECK-NEXT: Block RThroughput: 2.0
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
2+
# CHECK: IPC:
3+
# CHECK-SAME: 2.00
4+
#
5+
# XFAIL: *
6+
#
7+
# Cortex-A55 has a secondary skewed ALU in the Ex1 stage for simple
8+
# ALU instructions that do not require shifting or saturation
9+
# resources. Results from the skewed ALU are available 1 cycle earlier.
10+
#
11+
# This feature allows the first and the second instruction to be
12+
# dual-issued despite a register dependency (w8).
13+
#
14+
# MCA and the LLVM scheduling model do not support this yet.
15+
16+
add w8, w8, #1
17+
add w10, w8, #1
18+
add w12, w8, #1
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
3+
4+
add w8, w8, #1
5+
add w12, w8, #1
6+
mul w10, w10, w10
7+
8+
# CHECK: Iterations: 1000
9+
# CHECK-NEXT: Instructions: 3000
10+
# CHECK-NEXT: Total Cycles: 3003
11+
# CHECK-NEXT: Total uOps: 3000
12+
13+
# CHECK: Dispatch Width: 2
14+
# CHECK-NEXT: uOps Per Cycle: 1.00
15+
# CHECK-NEXT: IPC: 1.00
16+
# CHECK-NEXT: Block RThroughput: 1.5
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
3+
4+
# DIV is not modeled precisely: on hardware it takes a variable
5+
# number of cycles depending on its operands, but LLVM scheduling
6+
# model only provides an average latency.
7+
8+
add w8, w8, #1
9+
movz w10, #1, lsl #16
10+
movz w12, #32768, lsl #16
11+
sdiv w10, w12, w10
12+
13+
# CHECK: Iterations: 1000
14+
# CHECK-NEXT: Instructions: 4000
15+
# CHECK-NEXT: Total Cycles: 8004
16+
# CHECK-NEXT: Total uOps: 4000
17+
18+
# CHECK: Dispatch Width: 2
19+
# CHECK-NEXT: uOps Per Cycle: 0.50
20+
# CHECK-NEXT: IPC: 0.50
21+
# CHECK-NEXT: Block RThroughput: 8.0
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
3+
4+
# DIV is not modeled precisely: on hardware it takes a variable
5+
# number of cycles depending on its operands. LLVM scheduling model
6+
# only provides an average latency.
7+
8+
add w8, w8, #1
9+
movz w10, #1, lsl #16
10+
movz w12, #32768, lsl #16
11+
mul w11, w8, w8
12+
sdiv w10, w12, w10
13+
14+
# CHECK: Iterations: 1000
15+
# CHECK-NEXT: Instructions: 5000
16+
# CHECK-NEXT: Total Cycles: 8004
17+
# CHECK-NEXT: Total uOps: 5000
18+
19+
# CHECK: Dispatch Width: 2
20+
# CHECK-NEXT: uOps Per Cycle: 0.62
21+
# CHECK-NEXT: IPC: 0.62
22+
# CHECK-NEXT: Block RThroughput: 8.0
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
3+
4+
# It appears that ADD and MUL fuse together, if both can be issued in
5+
# one cycle:
6+
#
7+
# add w12, w8, #1
8+
# mul w10, w12, w10
9+
#
10+
# FIXME: MCA (and the LLVM scheduling model) do not support this. The test
11+
# case uses different registers to break the pattern.
12+
13+
add w8, w8, #1
14+
add w13, w8, #1
15+
mul w10, w12, w10
16+
17+
# CHECK: Iterations: 1000
18+
# CHECK-NEXT: Instructions: 3000
19+
# CHECK-NEXT: Total Cycles: 3003
20+
# CHECK-NEXT: Total uOps: 3000
21+
22+
# CHECK: Dispatch Width: 2
23+
# CHECK-NEXT: uOps Per Cycle: 1.00
24+
# CHECK-NEXT: IPC: 1.00
25+
# CHECK-NEXT: Block RThroughput: 1.5
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
3+
4+
add w8, w8, #1
5+
add w12, w9, #1
6+
cmp w9, #42
7+
mul w10, w12, w10
8+
9+
# CHECK: Iterations: 1000
10+
# CHECK-NEXT: Instructions: 4000
11+
# CHECK-NEXT: Total Cycles: 3004
12+
# CHECK-NEXT: Total uOps: 4000
13+
14+
# CHECK: Dispatch Width: 2
15+
# CHECK-NEXT: uOps Per Cycle: 1.33
16+
# CHECK-NEXT: IPC: 1.33
17+
# CHECK-NEXT: Block RThroughput: 2.0
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
2+
# CHECK: IPC:
3+
# CHECK-SAME: 1.50
4+
#
5+
# XFAIL: *
6+
#
7+
# MCA reports IPC = 0.60, while hardware shows IPC = 1.50.
8+
#
9+
# 1) The skewed ALU on Cortex-A55 is not modeled: ADD and AND
10+
# instructions should be issued in the same cycle.
11+
# See A55-2.s test for more details.
12+
#
13+
# 2) Cortex-A55 manual mentions that there is a forwarding path from
14+
# the ALU pipeline to the LD/ST pipeline. This is not implemented in
15+
# the LLVM scheduling model.
16+
17+
add w8, w8, #1
18+
and w12, w8, #0x3f
19+
ldr w14, [x10, w12, uxtw #2]
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
3+
4+
fabs s0, s1
5+
fabs s2, s3
6+
7+
# CHECK: Iterations: 1000
8+
# CHECK-NEXT: Instructions: 2000
9+
# CHECK-NEXT: Total Cycles: 1004
10+
# CHECK-NEXT: Total uOps: 2000
11+
12+
# CHECK: Dispatch Width: 2
13+
# CHECK-NEXT: uOps Per Cycle: 1.99
14+
# CHECK-NEXT: IPC: 1.99
15+
# CHECK-NEXT: Block RThroughput: 1.0

0 commit comments

Comments
 (0)