Skip to content

Commit 0775f21

Browse files
committed
AMDGPU: Add a scheduler test to demonstrate an upcoming change
1 parent 07ed1d6 commit 0775f21

File tree

1 file changed

+144
-0
lines changed

1 file changed

+144
-0
lines changed
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
3+
4+
; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully
5+
; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs,
6+
; which currently looks to the scheduler like an occupancy reduction, even
7+
; though it's not. TODO: Fix!
8+
9+
; 6 kB of LDS, allows 10 workgroups
10+
; Destination array in addrspace(3) that @copy stores into. It holds 384
; elements of <4 x i32> and has an undef initializer; @test only writes
; elements 0 through 31.
@lds = internal addrspace(3) global [384 x <4 x i32>] undef
11+
12+
; Copy a single <4 x i32> element: read element %ofs of the addrspace(1)
; array at %src and write it to element %ofs of @lds.
; The function is internal and alwaysinline.
define internal void @copy(ptr addrspace(1) %src, i32 %ofs) alwaysinline {
  ; Address of the source element.
  %global.elt.ptr = getelementptr <4 x i32>, ptr addrspace(1) %src, i32 %ofs
  %vec = load <4 x i32>, ptr addrspace(1) %global.elt.ptr
  ; Address of the matching element of @lds.
  %lds.elt.ptr = getelementptr <4 x i32>, ptr addrspace(3) @lds, i32 %ofs
  store <4 x i32> %vec, ptr addrspace(3) %lds.elt.ptr
  ret void
}
19+
20+
; Entry point using the amdgpu_cs calling convention, with the flat work-group
; size attribute set to "32,32". It calls @copy for indices 0 through 31, i.e.
; 32 separate <4 x i32> copies from %src into @lds.
;
; The autogenerated CHECK lines pin the schedule this test currently expects:
; the 32 loads are issued in three clauses of 11, 9 and 12 loads. The stores
; for the first 11 loads are emitted before the third clause, whose loads
; write v[2:45] again plus v[82:85]. The highest VGPR that appears is v86,
; which holds the LDS base address.
;
; Do not edit the CHECK lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_cs void @test(ptr addrspace(1) %src) "amdgpu-flat-work-group-size"="32,32" {
21+
; CHECK-LABEL: test:
22+
; CHECK: ; %bb.0:
23+
; CHECK-NEXT: s_clause 0xa
24+
; CHECK-NEXT: global_load_b128 v[2:5], v[0:1], off
25+
; CHECK-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16
26+
; CHECK-NEXT: global_load_b128 v[10:13], v[0:1], off offset:32
27+
; CHECK-NEXT: global_load_b128 v[14:17], v[0:1], off offset:48
28+
; CHECK-NEXT: global_load_b128 v[18:21], v[0:1], off offset:64
29+
; CHECK-NEXT: global_load_b128 v[22:25], v[0:1], off offset:80
30+
; CHECK-NEXT: global_load_b128 v[26:29], v[0:1], off offset:96
31+
; CHECK-NEXT: global_load_b128 v[30:33], v[0:1], off offset:112
32+
; CHECK-NEXT: global_load_b128 v[34:37], v[0:1], off offset:128
33+
; CHECK-NEXT: global_load_b128 v[38:41], v[0:1], off offset:144
34+
; CHECK-NEXT: global_load_b128 v[42:45], v[0:1], off offset:160
35+
; CHECK-NEXT: v_mov_b32_e32 v86, 0
36+
; CHECK-NEXT: s_clause 0x8
37+
; CHECK-NEXT: global_load_b128 v[46:49], v[0:1], off offset:176
38+
; CHECK-NEXT: global_load_b128 v[50:53], v[0:1], off offset:240
39+
; CHECK-NEXT: global_load_b128 v[54:57], v[0:1], off offset:224
40+
; CHECK-NEXT: global_load_b128 v[58:61], v[0:1], off offset:208
41+
; CHECK-NEXT: global_load_b128 v[62:65], v[0:1], off offset:192
42+
; CHECK-NEXT: global_load_b128 v[66:69], v[0:1], off offset:304
43+
; CHECK-NEXT: global_load_b128 v[70:73], v[0:1], off offset:288
44+
; CHECK-NEXT: global_load_b128 v[74:77], v[0:1], off offset:272
45+
; CHECK-NEXT: global_load_b128 v[78:81], v[0:1], off offset:256
46+
; CHECK-NEXT: s_waitcnt vmcnt(19)
47+
; CHECK-NEXT: ds_store_b128 v86, v[2:5]
48+
; CHECK-NEXT: s_waitcnt vmcnt(18)
49+
; CHECK-NEXT: ds_store_b128 v86, v[6:9] offset:16
50+
; CHECK-NEXT: s_waitcnt vmcnt(17)
51+
; CHECK-NEXT: ds_store_b128 v86, v[10:13] offset:32
52+
; CHECK-NEXT: s_waitcnt vmcnt(16)
53+
; CHECK-NEXT: ds_store_b128 v86, v[14:17] offset:48
54+
; CHECK-NEXT: s_waitcnt vmcnt(15)
55+
; CHECK-NEXT: ds_store_b128 v86, v[18:21] offset:64
56+
; CHECK-NEXT: s_waitcnt vmcnt(14)
57+
; CHECK-NEXT: ds_store_b128 v86, v[22:25] offset:80
58+
; CHECK-NEXT: s_waitcnt vmcnt(13)
59+
; CHECK-NEXT: ds_store_b128 v86, v[26:29] offset:96
60+
; CHECK-NEXT: s_waitcnt vmcnt(12)
61+
; CHECK-NEXT: ds_store_b128 v86, v[30:33] offset:112
62+
; CHECK-NEXT: s_waitcnt vmcnt(11)
63+
; CHECK-NEXT: ds_store_b128 v86, v[34:37] offset:128
64+
; CHECK-NEXT: s_waitcnt vmcnt(10)
65+
; CHECK-NEXT: ds_store_b128 v86, v[38:41] offset:144
66+
; CHECK-NEXT: s_waitcnt vmcnt(9)
67+
; CHECK-NEXT: ds_store_b128 v86, v[42:45] offset:160
68+
; CHECK-NEXT: s_clause 0xb
69+
; CHECK-NEXT: global_load_b128 v[2:5], v[0:1], off offset:368
70+
; CHECK-NEXT: global_load_b128 v[6:9], v[0:1], off offset:352
71+
; CHECK-NEXT: global_load_b128 v[10:13], v[0:1], off offset:336
72+
; CHECK-NEXT: global_load_b128 v[14:17], v[0:1], off offset:320
73+
; CHECK-NEXT: global_load_b128 v[18:21], v[0:1], off offset:432
74+
; CHECK-NEXT: global_load_b128 v[22:25], v[0:1], off offset:416
75+
; CHECK-NEXT: global_load_b128 v[26:29], v[0:1], off offset:400
76+
; CHECK-NEXT: global_load_b128 v[30:33], v[0:1], off offset:384
77+
; CHECK-NEXT: global_load_b128 v[34:37], v[0:1], off offset:464
78+
; CHECK-NEXT: global_load_b128 v[38:41], v[0:1], off offset:448
79+
; CHECK-NEXT: global_load_b128 v[42:45], v[0:1], off offset:480
80+
; CHECK-NEXT: global_load_b128 v[82:85], v[0:1], off offset:496
81+
; CHECK-NEXT: s_waitcnt vmcnt(20)
82+
; CHECK-NEXT: ds_store_b128 v86, v[46:49] offset:176
83+
; CHECK-NEXT: s_waitcnt vmcnt(16)
84+
; CHECK-NEXT: ds_store_b128 v86, v[62:65] offset:192
85+
; CHECK-NEXT: ds_store_b128 v86, v[58:61] offset:208
86+
; CHECK-NEXT: ds_store_b128 v86, v[54:57] offset:224
87+
; CHECK-NEXT: ds_store_b128 v86, v[50:53] offset:240
88+
; CHECK-NEXT: s_waitcnt vmcnt(12)
89+
; CHECK-NEXT: ds_store_b128 v86, v[78:81] offset:256
90+
; CHECK-NEXT: ds_store_b128 v86, v[74:77] offset:272
91+
; CHECK-NEXT: ds_store_b128 v86, v[70:73] offset:288
92+
; CHECK-NEXT: ds_store_b128 v86, v[66:69] offset:304
93+
; CHECK-NEXT: s_waitcnt vmcnt(8)
94+
; CHECK-NEXT: ds_store_b128 v86, v[14:17] offset:320
95+
; CHECK-NEXT: ds_store_b128 v86, v[10:13] offset:336
96+
; CHECK-NEXT: ds_store_b128 v86, v[6:9] offset:352
97+
; CHECK-NEXT: ds_store_b128 v86, v[2:5] offset:368
98+
; CHECK-NEXT: s_waitcnt vmcnt(4)
99+
; CHECK-NEXT: ds_store_b128 v86, v[30:33] offset:384
100+
; CHECK-NEXT: ds_store_b128 v86, v[26:29] offset:400
101+
; CHECK-NEXT: ds_store_b128 v86, v[22:25] offset:416
102+
; CHECK-NEXT: ds_store_b128 v86, v[18:21] offset:432
103+
; CHECK-NEXT: s_waitcnt vmcnt(2)
104+
; CHECK-NEXT: ds_store_b128 v86, v[38:41] offset:448
105+
; CHECK-NEXT: ds_store_b128 v86, v[34:37] offset:464
106+
; CHECK-NEXT: s_waitcnt vmcnt(1)
107+
; CHECK-NEXT: ds_store_b128 v86, v[42:45] offset:480
108+
; CHECK-NEXT: s_waitcnt vmcnt(0)
109+
; CHECK-NEXT: ds_store_b128 v86, v[82:85] offset:496
110+
; CHECK-NEXT: s_endpgm
; One @copy call per element index, in ascending order from 0 to 31. Keep
; this order: the CHECK lines above were generated from it.
111+
call void @copy(ptr addrspace(1) %src, i32 0)
112+
call void @copy(ptr addrspace(1) %src, i32 1)
113+
call void @copy(ptr addrspace(1) %src, i32 2)
114+
call void @copy(ptr addrspace(1) %src, i32 3)
115+
call void @copy(ptr addrspace(1) %src, i32 4)
116+
call void @copy(ptr addrspace(1) %src, i32 5)
117+
call void @copy(ptr addrspace(1) %src, i32 6)
118+
call void @copy(ptr addrspace(1) %src, i32 7)
119+
call void @copy(ptr addrspace(1) %src, i32 8)
120+
call void @copy(ptr addrspace(1) %src, i32 9)
121+
call void @copy(ptr addrspace(1) %src, i32 10)
122+
call void @copy(ptr addrspace(1) %src, i32 11)
123+
call void @copy(ptr addrspace(1) %src, i32 12)
124+
call void @copy(ptr addrspace(1) %src, i32 13)
125+
call void @copy(ptr addrspace(1) %src, i32 14)
126+
call void @copy(ptr addrspace(1) %src, i32 15)
127+
call void @copy(ptr addrspace(1) %src, i32 16)
128+
call void @copy(ptr addrspace(1) %src, i32 17)
129+
call void @copy(ptr addrspace(1) %src, i32 18)
130+
call void @copy(ptr addrspace(1) %src, i32 19)
131+
call void @copy(ptr addrspace(1) %src, i32 20)
132+
call void @copy(ptr addrspace(1) %src, i32 21)
133+
call void @copy(ptr addrspace(1) %src, i32 22)
134+
call void @copy(ptr addrspace(1) %src, i32 23)
135+
call void @copy(ptr addrspace(1) %src, i32 24)
136+
call void @copy(ptr addrspace(1) %src, i32 25)
137+
call void @copy(ptr addrspace(1) %src, i32 26)
138+
call void @copy(ptr addrspace(1) %src, i32 27)
139+
call void @copy(ptr addrspace(1) %src, i32 28)
140+
call void @copy(ptr addrspace(1) %src, i32 29)
141+
call void @copy(ptr addrspace(1) %src, i32 30)
142+
call void @copy(ptr addrspace(1) %src, i32 31)
143+
ret void
144+
}

0 commit comments

Comments
 (0)