Commit ab76052

AMDGPU: Treat SWMMAC the same as MFMA and other WMMA for sched_barrier (#85721)
Parent: 9ed1aa3

File tree: 2 files changed, +334 -1 lines

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 1 addition & 1 deletion
@@ -815,7 +815,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   }
 
   static bool isMFMAorWMMA(const MachineInstr &MI) {
-    return isMFMA(MI) || isWMMA(MI);
+    return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI);
   }
 
   static bool isSWMMAC(const MachineInstr &MI) {
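
With this one-line change, isMFMAorWMMA() also matches sparse WMMA (SWMMAC) instructions, so the IGroupLP scheduling mutations place them in the same sched_group_barrier group as MFMA and dense WMMA. The new test below drives that grouping through the llvm.amdgcn.sched.group.barrier intrinsic. As a minimal sketch of the pattern the test repeats (assuming only the mask values visible in the test comments: 0x100 = DS read, 0x8 = MFMA/WMMA, which now also covers SWMMAC, 0x200 = DS write; the kernel name here is illustrative, not from the commit):

; Sketch only, not part of this commit: ask the scheduler to form one
; pipeline of 7 DS reads, then 5 matrix ops, then 5 DS writes, all in
; sync group 0.
declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32)

define amdgpu_kernel void @pipeline_hint_sketch() {
entry:
  ; ... DS loads, swmmac intrinsic calls, and DS stores would go here ...
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 7, i32 0) ; 7 DS reads
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0)   ; 5 MFMA/WMMA/SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 5, i32 0) ; 5 DS writes
  ret void
}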
Lines changed: 333 additions & 0 deletions
@@ -0,0 +1,333 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s

declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)

define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0
; GCN-NEXT: v_mov_b32_e32 v48, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28
; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
; GCN-NEXT: ds_load_b128 v[8:11], v0
; GCN-NEXT: ds_load_b128 v[12:15], v0 offset:512
; GCN-NEXT: ds_load_b128 v[16:19], v0 offset:1536
; GCN-NEXT: ds_load_b128 v[20:23], v0 offset:3072
; GCN-NEXT: ds_load_b128 v[24:27], v0 offset:5120
; GCN-NEXT: ds_load_b128 v[4:7], v0 offset:11280
; GCN-NEXT: ds_load_b128 v[0:3], v0 offset:11264
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x6
; GCN-NEXT: v_mov_b32_e32 v31, v11
; GCN-NEXT: s_wait_dscnt 0x5
; GCN-NEXT: v_mov_b32_e32 v35, v15
; GCN-NEXT: s_wait_dscnt 0x4
; GCN-NEXT: v_mov_b32_e32 v39, v19
; GCN-NEXT: s_wait_dscnt 0x3
; GCN-NEXT: v_mov_b32_e32 v43, v23
; GCN-NEXT: s_wait_dscnt 0x2
; GCN-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
; GCN-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
; GCN-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; GCN-NEXT: v_mov_b32_e32 v32, v12
; GCN-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
; GCN-NEXT: v_mov_b32_e32 v36, v16
; GCN-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
; GCN-NEXT: v_mov_b32_e32 v40, v20
; GCN-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
; GCN-NEXT: v_mov_b32_e32 v44, v24
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
; GCN-NEXT: ds_store_b128 v49, v[28:31]
; GCN-NEXT: ds_store_b128 v50, v[32:35] offset:512
; GCN-NEXT: ds_store_b128 v50, v[36:39] offset:1024
; GCN-NEXT: ds_store_b128 v50, v[40:43] offset:1536
; GCN-NEXT: ds_store_b128 v50, v[44:47] offset:2048
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
; GCN-NEXT: s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0
; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2)
; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v0
; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v0 offset:512
; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v0 offset:1536
; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v0 offset:3072
; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v0 offset:5120
; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v0 offset:11280
; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v0 offset:11264
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x6
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v31, v11
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x5
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v35, v15
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x4
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v39, v19
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x3
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v43, v23
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v32, v12
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v36, v16
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v40, v20
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v44, v24
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
; EXACTCUTOFF-NEXT: ds_store_b128 v49, v[28:31]
; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[32:35] offset:512
; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[36:39] offset:1024
; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[40:43] offset:1536
; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[44:47] offset:2048
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %in, i32 %idx
  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 32
  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 64
  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 96
  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 128
  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %load.4.addr, i32 192
  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i16 0)
  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i16 0)
  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i16 0)
  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i16 0)
  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i16 0)
  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
  ; 7 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 7, i32 0)
  ; 5 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0)
  ; 5 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 5, i32 0)
  ret void
}

define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v18, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: v_lshl_add_u32 v17, v0, 5, s0
; GCN-NEXT: v_lshl_add_u32 v0, v0, 4, s1
; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:1024
; GCN-NEXT: ds_load_b128 v[1:4], v17
; GCN-NEXT: ds_load_b128 v[5:8], v17 offset:16
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x2
; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v0, v[13:16]
; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:2560
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:512
; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:4608
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1024
; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:7168
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1536
; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:10240
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:2048
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT: s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
; EXACTCUTOFF: ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0
; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0
; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v0, 5, s0
; EXACTCUTOFF-NEXT: v_lshl_add_u32 v0, v0, 4, s1
; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:1024
; EXACTCUTOFF-NEXT: ds_load_b128 v[1:4], v17
; EXACTCUTOFF-NEXT: ds_load_b128 v[5:8], v17 offset:16
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16]
; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:2560
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:512
; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:4608
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1024
; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:7168
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1536
; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:10240
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:2048
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %load.b.addr, i32 64
  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 96
  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 128
  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 160
  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 192
  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i16 0)
  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i16 0)
  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i16 0)
  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i16 0)
  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i16 0)
  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
  ; 3 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 3, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ret void
}
