Skip to content

Commit f5e76ab

Browse files
committed
Add start of IR test that probably needs to be redone
1 parent 364f162 commit f5e76ab

File tree

1 file changed

+195
-0
lines changed

1 file changed

+195
-0
lines changed
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mcpu=gfx90a < %s | FileCheck %s
3+
4+
target triple = "amdgcn-amd-amdhsa"
5+
6+
; Kernel under test: three chained 32x32x1 f32 MFMA calls with a shufflevector
; between the second and the third.
;   %in.1  - <32 x float> loaded from %arg, indexed by workitem.id.x
;   %mai.1 - MFMA using %in.1 as its accumulator operand
;   %mai.2 - MFMA using %mai.1 as its accumulator operand
;   %tmp.1 - elements 0-1 of %mai.1 followed by elements 0-29 of %mai.2
;   %mai.3 - MFMA using %tmp.1 as its accumulator operand; stored back to %arg
; In the autogenerated assertions below, the shuffle appears as the
; v_accvgpr_read_b32 / v_accvgpr_write_b32 / v_accvgpr_mov_b32 sequence that
; sits between the second and third v_mfma instructions.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrspace(1) %arg) #0 {
7+
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma:
8+
; CHECK: ; %bb.0: ; %bb
9+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
10+
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
11+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
12+
; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
13+
; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
14+
; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80
15+
; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64
16+
; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48
17+
; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32
18+
; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
19+
; CHECK-NEXT: s_nop 0
20+
; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
21+
; CHECK-NEXT: s_waitcnt vmcnt(0)
22+
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
23+
; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
24+
; CHECK-NEXT: v_accvgpr_write_b32 a2, v2
25+
; CHECK-NEXT: v_accvgpr_write_b32 a3, v3
26+
; CHECK-NEXT: v_accvgpr_write_b32 a4, v4
27+
; CHECK-NEXT: v_accvgpr_write_b32 a5, v5
28+
; CHECK-NEXT: v_accvgpr_write_b32 a6, v6
29+
; CHECK-NEXT: v_accvgpr_write_b32 a7, v7
30+
; CHECK-NEXT: v_accvgpr_write_b32 a8, v8
31+
; CHECK-NEXT: v_accvgpr_write_b32 a9, v9
32+
; CHECK-NEXT: v_accvgpr_write_b32 a10, v10
33+
; CHECK-NEXT: v_accvgpr_write_b32 a11, v11
34+
; CHECK-NEXT: v_accvgpr_write_b32 a12, v12
35+
; CHECK-NEXT: v_accvgpr_write_b32 a13, v13
36+
; CHECK-NEXT: v_accvgpr_write_b32 a14, v14
37+
; CHECK-NEXT: v_accvgpr_write_b32 a15, v15
38+
; CHECK-NEXT: v_accvgpr_write_b32 a16, v16
39+
; CHECK-NEXT: v_accvgpr_write_b32 a17, v17
40+
; CHECK-NEXT: v_accvgpr_write_b32 a18, v18
41+
; CHECK-NEXT: v_accvgpr_write_b32 a19, v19
42+
; CHECK-NEXT: v_accvgpr_write_b32 a20, v20
43+
; CHECK-NEXT: v_accvgpr_write_b32 a21, v21
44+
; CHECK-NEXT: v_accvgpr_write_b32 a22, v22
45+
; CHECK-NEXT: v_accvgpr_write_b32 a23, v23
46+
; CHECK-NEXT: v_accvgpr_write_b32 a24, v24
47+
; CHECK-NEXT: v_accvgpr_write_b32 a25, v25
48+
; CHECK-NEXT: v_accvgpr_write_b32 a26, v26
49+
; CHECK-NEXT: v_accvgpr_write_b32 a27, v27
50+
; CHECK-NEXT: v_accvgpr_write_b32 a28, v28
51+
; CHECK-NEXT: v_accvgpr_write_b32 a29, v29
52+
; CHECK-NEXT: v_accvgpr_write_b32 a30, v30
53+
; CHECK-NEXT: v_accvgpr_write_b32 a31, v31
54+
; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
55+
; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
56+
; CHECK-NEXT: s_nop 1
57+
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
58+
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
59+
; CHECK-NEXT: s_nop 7
60+
; CHECK-NEXT: s_nop 7
61+
; CHECK-NEXT: s_nop 2
62+
; CHECK-NEXT: v_accvgpr_read_b32 v4, a59
63+
; CHECK-NEXT: v_accvgpr_read_b32 v5, a58
64+
; CHECK-NEXT: v_accvgpr_read_b32 v6, a57
65+
; CHECK-NEXT: v_accvgpr_read_b32 v7, a56
66+
; CHECK-NEXT: v_accvgpr_read_b32 v8, a55
67+
; CHECK-NEXT: v_accvgpr_read_b32 v9, a54
68+
; CHECK-NEXT: v_accvgpr_read_b32 v10, a53
69+
; CHECK-NEXT: v_accvgpr_read_b32 v11, a52
70+
; CHECK-NEXT: v_accvgpr_read_b32 v12, a51
71+
; CHECK-NEXT: v_accvgpr_read_b32 v13, a50
72+
; CHECK-NEXT: v_accvgpr_read_b32 v14, a49
73+
; CHECK-NEXT: v_accvgpr_read_b32 v15, a48
74+
; CHECK-NEXT: v_accvgpr_read_b32 v16, a47
75+
; CHECK-NEXT: v_accvgpr_read_b32 v17, a46
76+
; CHECK-NEXT: v_accvgpr_read_b32 v18, a45
77+
; CHECK-NEXT: v_accvgpr_read_b32 v19, a44
78+
; CHECK-NEXT: v_accvgpr_read_b32 v20, a43
79+
; CHECK-NEXT: v_accvgpr_read_b32 v21, a42
80+
; CHECK-NEXT: v_accvgpr_read_b32 v22, a41
81+
; CHECK-NEXT: v_accvgpr_read_b32 v23, a40
82+
; CHECK-NEXT: v_accvgpr_read_b32 v24, a39
83+
; CHECK-NEXT: v_accvgpr_read_b32 v25, a38
84+
; CHECK-NEXT: v_accvgpr_read_b32 v26, a37
85+
; CHECK-NEXT: v_accvgpr_read_b32 v27, a36
86+
; CHECK-NEXT: v_accvgpr_read_b32 v28, a35
87+
; CHECK-NEXT: v_accvgpr_read_b32 v29, a34
88+
; CHECK-NEXT: v_accvgpr_mov_b32 a2, a32
89+
; CHECK-NEXT: v_accvgpr_mov_b32 a3, a33
90+
; CHECK-NEXT: v_accvgpr_write_b32 a4, v29
91+
; CHECK-NEXT: v_accvgpr_write_b32 a5, v28
92+
; CHECK-NEXT: v_accvgpr_write_b32 a6, v27
93+
; CHECK-NEXT: v_accvgpr_write_b32 a7, v26
94+
; CHECK-NEXT: v_accvgpr_write_b32 a8, v25
95+
; CHECK-NEXT: v_accvgpr_write_b32 a9, v24
96+
; CHECK-NEXT: v_accvgpr_write_b32 a10, v23
97+
; CHECK-NEXT: v_accvgpr_write_b32 a11, v22
98+
; CHECK-NEXT: v_accvgpr_write_b32 a12, v21
99+
; CHECK-NEXT: v_accvgpr_write_b32 a13, v20
100+
; CHECK-NEXT: v_accvgpr_write_b32 a14, v19
101+
; CHECK-NEXT: v_accvgpr_write_b32 a15, v18
102+
; CHECK-NEXT: v_accvgpr_write_b32 a16, v17
103+
; CHECK-NEXT: v_accvgpr_write_b32 a17, v16
104+
; CHECK-NEXT: v_accvgpr_write_b32 a18, v15
105+
; CHECK-NEXT: v_accvgpr_write_b32 a19, v14
106+
; CHECK-NEXT: v_accvgpr_write_b32 a20, v13
107+
; CHECK-NEXT: v_accvgpr_write_b32 a21, v12
108+
; CHECK-NEXT: v_accvgpr_write_b32 a22, v11
109+
; CHECK-NEXT: v_accvgpr_write_b32 a23, v10
110+
; CHECK-NEXT: v_accvgpr_write_b32 a24, v9
111+
; CHECK-NEXT: v_accvgpr_write_b32 a25, v8
112+
; CHECK-NEXT: v_accvgpr_write_b32 a26, v7
113+
; CHECK-NEXT: v_accvgpr_write_b32 a27, v6
114+
; CHECK-NEXT: v_accvgpr_write_b32 a28, v5
115+
; CHECK-NEXT: v_accvgpr_write_b32 a29, v4
116+
; CHECK-NEXT: v_accvgpr_mov_b32 a30, a60
117+
; CHECK-NEXT: v_accvgpr_mov_b32 a31, a61
118+
; CHECK-NEXT: s_nop 1
119+
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
120+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
121+
; CHECK-NEXT: s_nop 7
122+
; CHECK-NEXT: s_nop 7
123+
; CHECK-NEXT: s_nop 1
124+
; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
125+
; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
126+
; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
127+
; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
128+
; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
129+
; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
130+
; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
131+
; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
132+
; CHECK-NEXT: s_endpgm
133+
bb:
134+
; Each work item loads its own <32 x float> element of the array at %arg.
%id = call i32 @llvm.amdgcn.workitem.id.x()
135+
%gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
136+
%in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
137+
; Two back-to-back MFMAs: the result of the first is the accumulator of the second.
%mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
138+
%mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
139+
; Mask indices 32 and 33 select elements 0 and 1 of the second operand (%mai.1);
; indices 0-29 select from the first operand (%mai.2). Both MFMA results are
; therefore still needed when the third MFMA's accumulator is built.
%tmp.1 = shufflevector <32 x float> %mai.2, <32 x float> %mai.1, <32 x i32> <i32 32, i32 33, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29>
140+
%mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %tmp.1, i32 0, i32 0, i32 0)
141+
; The result is stored at the base pointer %arg, not at %gep.
store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
142+
ret void
143+
}
144+
145+
; Kernel under test: three 32x32x1 f32 MFMA calls chained directly, with no
; shufflevector in between. Each MFMA result is the accumulator operand of the
; next one; the last result is stored back to %arg.
; In the autogenerated assertions below, the global loads write straight into
; a[0:31] and all three v_mfma instructions use a[0:31] as both destination and
; accumulator source, with no v_accvgpr_* copies.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(ptr addrspace(1) %arg) #0 {
146+
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle:
147+
; CHECK: ; %bb.0: ; %bb
148+
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
149+
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
150+
; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
151+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
152+
; CHECK-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
153+
; CHECK-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
154+
; CHECK-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
155+
; CHECK-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
156+
; CHECK-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
157+
; CHECK-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
158+
; CHECK-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
159+
; CHECK-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
160+
; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
161+
; CHECK-NEXT: s_waitcnt vmcnt(0)
162+
; CHECK-NEXT: s_nop 0
163+
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
164+
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
165+
; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
166+
; CHECK-NEXT: v_mov_b32_e32 v0, 0
167+
; CHECK-NEXT: s_nop 7
168+
; CHECK-NEXT: s_nop 7
169+
; CHECK-NEXT: s_nop 1
170+
; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
171+
; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
172+
; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
173+
; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
174+
; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
175+
; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
176+
; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
177+
; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
178+
; CHECK-NEXT: s_endpgm
179+
bb:
180+
; Each work item loads its own <32 x float> element of the array at %arg.
%id = call i32 @llvm.amdgcn.workitem.id.x()
181+
%gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
182+
%in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
183+
; Straight dependency chain: %in.1 -> %mai.1 -> %mai.2 -> %mai.3.
%mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
184+
%mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
185+
%mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.2, i32 0, i32 0, i32 0)
186+
; The result is stored at the base pointer %arg, not at %gep.
store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
187+
ret void
188+
}
189+
190+
; Intrinsics called by the kernels in this file. The three trailing i32
; operands of the MFMA intrinsic are immarg, so call sites must pass constants.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg) #1
191+
declare noundef i32 @llvm.amdgcn.workitem.id.x() #2
192+
193+
; Attribute group applied to both kernels in this file.
; "amdgpu-no-workitem-id-x" is deliberately not listed: both kernels call
; @llvm.amdgcn.workitem.id.x, and the AMDGPU backend documents calling that
; intrinsic from a function carrying the attribute as undefined behavior.
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,4" }
194+
; Attribute group used by the MFMA intrinsic declaration.
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
195+
; Attribute group used by the workitem.id.x intrinsic declaration.
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

0 commit comments

Comments
 (0)