Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit 7725fd8

Browse files
committed
AMDGPU/SI: Improve MachineSchedModel definition
This patch contains a few improvements to the model, including:
- Using a single resource with a defined buffer size for each memory unit.
- Setting the IssueWidth correctly.
- Fixing latency values for memory instructions.
shader-db stats: 16429 shaders in 3231 tests. Totals:
SGPRS: 318232 -> 312328 (-1.86 %)
VGPRS: 208996 -> 209346 (0.17 %)
Code Size: 7147044 -> 7166440 (0.27 %) bytes
LDS: 83 -> 83 (0.00 %) blocks
Scratch: 1862656 -> 1459200 (-21.66 %) bytes per wave
Max Waves: 49182 -> 49243 (0.12 %)
Wait states: 0 -> 0 (0.00 %)
Differential Revision: http://reviews.llvm.org/D18453
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@264877 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent d3adac5 commit 7725fd8

File tree

10 files changed

+195
-181
lines changed

10 files changed

+195
-181
lines changed

lib/Target/AMDGPU/SISchedule.td

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -39,24 +39,32 @@ def Write64Bit : SchedWrite;
3939
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
4040
// instructions)
4141

42-
def SIFullSpeedModel : SchedMachineModel {
43-
let CompleteModel = 0;
44-
}
45-
def SIQuarterSpeedModel : SchedMachineModel {
42+
class SISchedMachineModel : SchedMachineModel {
4643
let CompleteModel = 0;
44+
let IssueWidth = 1;
4745
}
4846

49-
// BufferSize = 0 means the processors are in-order.
50-
let BufferSize = 0 in {
47+
def SIFullSpeedModel : SISchedMachineModel;
48+
def SIQuarterSpeedModel : SISchedMachineModel;
5149

5250
// XXX: Are the resource counts correct?
53-
def HWBranch : ProcResource<1>;
54-
def HWExport : ProcResource<7>; // Taken from S_WAITCNT
55-
def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT
56-
def HWSALU : ProcResource<1>;
57-
def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT
58-
def HWVALU : ProcResource<1>;
59-
51+
def HWBranch : ProcResource<1> {
52+
let BufferSize = 1;
53+
}
54+
def HWExport : ProcResource<1> {
55+
let BufferSize = 7; // Taken from S_WAITCNT
56+
}
57+
def HWLGKM : ProcResource<1> {
58+
let BufferSize = 31; // Taken from S_WAITCNT
59+
}
60+
def HWSALU : ProcResource<1> {
61+
let BufferSize = 1;
62+
}
63+
def HWVMEM : ProcResource<1> {
64+
let BufferSize = 15; // Taken from S_WAITCNT
65+
}
66+
def HWVALU : ProcResource<1> {
67+
let BufferSize = 1;
6068
}
6169

6270
class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
@@ -74,12 +82,12 @@ class HWVALUWriteRes<SchedWrite write, int latency> :
7482
// The latency values are 1 / (operations / cycle) / 4.
7583
multiclass SICommonWriteRes {
7684

77-
def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ???
78-
def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ???
79-
def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64
80-
def : HWWriteRes<WriteSALU, [HWSALU], 1>;
81-
def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ???
82-
def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600
85+
def : HWWriteRes<WriteBranch, [HWBranch], 8>;
86+
def : HWWriteRes<WriteExport, [HWExport], 4>;
87+
def : HWWriteRes<WriteLDS, [HWLGKM], 5>; // Can be between 2 and 64
88+
def : HWWriteRes<WriteSALU, [HWSALU], 1>;
89+
def : HWWriteRes<WriteSMEM, [HWLGKM], 5>;
90+
def : HWWriteRes<WriteVMEM, [HWVMEM], 80>;
8391
def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???
8492

8593
def : HWVALUWriteRes<Write32Bit, 1>;

test/CodeGen/AMDGPU/ds_read2_offset_order.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88

99
; SI-LABEL: {{^}}offset_order:
1010

11-
; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}}
12-
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:3 offset1:2
13-
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
11+
; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:4{{$}}
12+
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
13+
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:14 offset1:12
1414
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
1515

1616
define void @offset_order(float addrspace(1)* %out) {

test/CodeGen/AMDGPU/fceil64.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
1212
; FUNC-LABEL: {{^}}fceil_f64:
1313
; CI: v_ceil_f64_e32
1414
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
15-
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
16-
; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
17-
; SI: s_lshr_b64
18-
; SI: s_not_b64
19-
; SI: s_and_b64
15+
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
16+
; SI-DAG: s_add_i32 [[A:s[0-9]+]], [[SEXP]], 0xfffffc01
17+
; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[A]]
18+
; SI-DAG: s_not_b64
19+
; SI-DAG: s_and_b64
2020
; SI-DAG: cmp_gt_i32
2121
; SI-DAG: cndmask_b32
2222
; SI-DAG: cndmask_b32

test/CodeGen/AMDGPU/ftrunc.f64.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
2424
; CI: v_trunc_f64_e32
2525

2626
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
27-
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
28-
; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
29-
; SI: s_lshr_b64
30-
; SI: s_not_b64
31-
; SI: s_and_b64
27+
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
28+
; SI-DAG: s_add_i32 [[A:s[0-9]+]], [[SEXP]], 0xfffffc01
29+
; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[A]]
30+
; SI-DAG: s_not_b64
31+
; SI-DAG: s_and_b64
3232
; SI-DAG: cmp_gt_i32
3333
; SI-DAG: cndmask_b32
3434
; SI-DAG: cndmask_b32

test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
66
; FUNC-LABEL: {{^}}rsq_clamped_f64:
77
; SI: v_rsq_clamp_f64_e32
88

9-
; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3]
9+
; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}]
1010
; TODO: this constant should be folded:
11-
; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
12-
; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
13-
; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
11+
; VI-DAG: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
12+
; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
13+
; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
14+
; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
1415
; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
15-
; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
1616
; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
1717
; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
1818

test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,13 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
2424
; FUNC-LABEL: {{^}}rsq_clamp_f64:
2525
; SI: v_rsq_clamp_f64_e32
2626

27-
; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3]
2827
; TODO: this constant should be folded:
29-
; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
30-
; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
31-
; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
28+
; VI-DAG: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
29+
; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
30+
; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
31+
; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
32+
; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}
3233
; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
33-
; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
3434
; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
3535
; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
3636
define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {

test/CodeGen/AMDGPU/llvm.memcpy.ll

Lines changed: 106 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -6,77 +6,77 @@ declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
66

77

88
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
9-
; SI: ds_read_u8
10-
; SI: ds_read_u8
11-
; SI: ds_read_u8
12-
; SI: ds_read_u8
13-
; SI: ds_read_u8
14-
; SI: ds_read_u8
15-
; SI: ds_read_u8
16-
; SI: ds_read_u8
17-
18-
; SI: ds_read_u8
19-
; SI: ds_read_u8
20-
; SI: ds_read_u8
21-
; SI: ds_read_u8
22-
; SI: ds_read_u8
23-
; SI: ds_read_u8
24-
; SI: ds_read_u8
25-
; SI: ds_read_u8
26-
27-
; SI: ds_read_u8
28-
; SI: ds_read_u8
29-
; SI: ds_read_u8
30-
; SI: ds_read_u8
31-
; SI: ds_read_u8
32-
; SI: ds_read_u8
33-
; SI: ds_read_u8
34-
; SI: ds_read_u8
35-
36-
; SI: ds_read_u8
37-
; SI: ds_read_u8
38-
; SI: ds_read_u8
39-
; SI: ds_read_u8
40-
; SI: ds_read_u8
41-
; SI: ds_read_u8
42-
; SI: ds_read_u8
43-
; SI: ds_read_u8
44-
45-
; SI: ds_write_b8
46-
; SI: ds_write_b8
47-
; SI: ds_write_b8
48-
; SI: ds_write_b8
49-
; SI: ds_write_b8
50-
; SI: ds_write_b8
51-
; SI: ds_write_b8
52-
; SI: ds_write_b8
53-
54-
; SI: ds_write_b8
55-
; SI: ds_write_b8
56-
; SI: ds_write_b8
57-
; SI: ds_write_b8
58-
; SI: ds_write_b8
59-
; SI: ds_write_b8
60-
; SI: ds_write_b8
61-
; SI: ds_write_b8
62-
63-
; SI: ds_write_b8
64-
; SI: ds_write_b8
65-
; SI: ds_write_b8
66-
; SI: ds_write_b8
67-
; SI: ds_write_b8
68-
; SI: ds_write_b8
69-
; SI: ds_write_b8
70-
; SI: ds_write_b8
71-
72-
; SI: ds_write_b8
73-
; SI: ds_write_b8
74-
; SI: ds_write_b8
75-
; SI: ds_write_b8
76-
; SI: ds_write_b8
77-
; SI: ds_write_b8
78-
; SI: ds_write_b8
79-
; SI: ds_write_b8
9+
; SI-DAG: ds_read_u8
10+
; SI-DAG: ds_read_u8
11+
; SI-DAG: ds_read_u8
12+
; SI-DAG: ds_read_u8
13+
; SI-DAG: ds_read_u8
14+
; SI-DAG: ds_read_u8
15+
; SI-DAG: ds_read_u8
16+
; SI-DAG: ds_read_u8
17+
18+
; SI-DAG: ds_read_u8
19+
; SI-DAG: ds_read_u8
20+
; SI-DAG: ds_read_u8
21+
; SI-DAG: ds_read_u8
22+
; SI-DAG: ds_read_u8
23+
; SI-DAG: ds_read_u8
24+
; SI-DAG: ds_read_u8
25+
; SI-DAG: ds_read_u8
26+
27+
; SI-DAG: ds_read_u8
28+
; SI-DAG: ds_read_u8
29+
; SI-DAG: ds_read_u8
30+
; SI-DAG: ds_read_u8
31+
; SI-DAG: ds_read_u8
32+
; SI-DAG: ds_read_u8
33+
; SI-DAG: ds_read_u8
34+
; SI-DAG: ds_read_u8
35+
36+
; SI-DAG: ds_read_u8
37+
; SI-DAG: ds_read_u8
38+
; SI-DAG: ds_read_u8
39+
; SI-DAG: ds_read_u8
40+
; SI-DAG: ds_read_u8
41+
; SI-DAG: ds_read_u8
42+
; SI-DAG: ds_read_u8
43+
; SI-DAG: ds_read_u8
44+
45+
; SI-DAG: ds_write_b8
46+
; SI-DAG: ds_write_b8
47+
; SI-DAG: ds_write_b8
48+
; SI-DAG: ds_write_b8
49+
; SI-DAG: ds_write_b8
50+
; SI-DAG: ds_write_b8
51+
; SI-DAG: ds_write_b8
52+
; SI-DAG: ds_write_b8
53+
54+
; SI-DAG: ds_write_b8
55+
; SI-DAG: ds_write_b8
56+
; SI-DAG: ds_write_b8
57+
; SI-DAG: ds_write_b8
58+
; SI-DAG: ds_write_b8
59+
; SI-DAG: ds_write_b8
60+
; SI-DAG: ds_write_b8
61+
; SI-DAG: ds_write_b8
62+
63+
; SI-DAG: ds_write_b8
64+
; SI-DAG: ds_write_b8
65+
; SI-DAG: ds_write_b8
66+
; SI-DAG: ds_write_b8
67+
; SI-DAG: ds_write_b8
68+
; SI-DAG: ds_write_b8
69+
; SI-DAG: ds_write_b8
70+
; SI-DAG: ds_write_b8
71+
72+
; SI-DAG: ds_write_b8
73+
; SI-DAG: ds_write_b8
74+
; SI-DAG: ds_write_b8
75+
; SI-DAG: ds_write_b8
76+
; SI-DAG: ds_write_b8
77+
; SI-DAG: ds_write_b8
78+
; SI-DAG: ds_write_b8
79+
; SI-DAG: ds_write_b8
8080

8181
; SI: s_endpgm
8282
define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
@@ -87,41 +87,41 @@ define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %
8787
}
8888

8989
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
90-
; SI: ds_read_u16
91-
; SI: ds_read_u16
92-
; SI: ds_read_u16
93-
; SI: ds_read_u16
94-
; SI: ds_read_u16
95-
; SI: ds_read_u16
96-
; SI: ds_read_u16
97-
; SI: ds_read_u16
98-
99-
; SI: ds_read_u16
100-
; SI: ds_read_u16
101-
; SI: ds_read_u16
102-
; SI: ds_read_u16
103-
; SI: ds_read_u16
104-
; SI: ds_read_u16
105-
; SI: ds_read_u16
106-
; SI: ds_read_u16
107-
108-
; SI: ds_write_b16
109-
; SI: ds_write_b16
110-
; SI: ds_write_b16
111-
; SI: ds_write_b16
112-
; SI: ds_write_b16
113-
; SI: ds_write_b16
114-
; SI: ds_write_b16
115-
; SI: ds_write_b16
116-
117-
; SI: ds_write_b16
118-
; SI: ds_write_b16
119-
; SI: ds_write_b16
120-
; SI: ds_write_b16
121-
; SI: ds_write_b16
122-
; SI: ds_write_b16
123-
; SI: ds_write_b16
124-
; SI: ds_write_b16
90+
; SI-DAG: ds_read_u16
91+
; SI-DAG: ds_read_u16
92+
; SI-DAG: ds_read_u16
93+
; SI-DAG: ds_read_u16
94+
; SI-DAG: ds_read_u16
95+
; SI-DAG: ds_read_u16
96+
; SI-DAG: ds_read_u16
97+
; SI-DAG: ds_read_u16
98+
99+
; SI-DAG: ds_read_u16
100+
; SI-DAG: ds_read_u16
101+
; SI-DAG: ds_read_u16
102+
; SI-DAG: ds_read_u16
103+
; SI-DAG: ds_read_u16
104+
; SI-DAG: ds_read_u16
105+
; SI-DAG: ds_read_u16
106+
; SI-DAG: ds_read_u16
107+
108+
; SI-DAG: ds_write_b16
109+
; SI-DAG: ds_write_b16
110+
; SI-DAG: ds_write_b16
111+
; SI-DAG: ds_write_b16
112+
; SI-DAG: ds_write_b16
113+
; SI-DAG: ds_write_b16
114+
; SI-DAG: ds_write_b16
115+
; SI-DAG: ds_write_b16
116+
117+
; SI-DAG: ds_write_b16
118+
; SI-DAG: ds_write_b16
119+
; SI-DAG: ds_write_b16
120+
; SI-DAG: ds_write_b16
121+
; SI-DAG: ds_write_b16
122+
; SI-DAG: ds_write_b16
123+
; SI-DAG: ds_write_b16
124+
; SI-DAG: ds_write_b16
125125

126126
; SI: s_endpgm
127127
define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {

test/CodeGen/AMDGPU/local-memory-two-objects.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@
3232
; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
3333
; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], vcc, 16, v{{[0-9]+}}
3434
; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
35-
; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
36-
; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
35+
; CI-DAG: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
36+
; CI-DAG: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
3737

3838
define void @local_memory_two_objects(i32 addrspace(1)* %out) {
3939
entry:

test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,9 +156,11 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
156156
}
157157

158158
; FUNC-LABEL: @reorder_local_offsets
159+
; FIXME: The scheduler doesn't think it's profitable to re-order the
160+
; loads and stores, and I'm not sure that it really is.
161+
; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
159162
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
160163
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
161-
; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
162164
; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
163165
; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
164166
; CI: buffer_store_dword

0 commit comments

Comments
 (0)