Skip to content

Commit cc98b35

Browse files
authored
[AMDGPU] Masked load vectortype test (#129703)
1 parent 31845cf commit cc98b35

File tree

1 file changed

+255
-0
lines changed

1 file changed

+255
-0
lines changed
Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=GFX942 %s
3+
4+
; Masked load of <2 x i32> through a global (addrspace(1)) pointer that is
; passed with the inreg attribute. The <2 x i1> mask is a splat of the single
; i1 %mask argument: it is inserted into lane 0 and then broadcast by a
; shufflevector with an all-zero shuffle mask. The passthru operand is
; zeroinitializer.
; The expected code zeroes v0-v1, narrows exec with s_and_saveexec_b64 using
; the compare result in vcc, issues one global_load_dwordx2, and then restores
; exec with s_or_b64.
; NOTE(review): the alignment operand of this call is i32 2, while the other
; tests visible in this file pass i32 4 -- confirm this is intentional.
define <2 x i32> @uniform_masked_load_ptr1_mask_v2i32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
5+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v2i32:
6+
; GFX942: ; %bb.0: ; %entry
7+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
9+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
10+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
11+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
12+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
13+
; GFX942-NEXT: s_cbranch_execz .LBB0_2
14+
; GFX942-NEXT: ; %bb.1: ; %cond.load
15+
; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
16+
; GFX942-NEXT: .LBB0_2:
17+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
18+
; GFX942-NEXT: s_waitcnt vmcnt(0)
19+
; GFX942-NEXT: s_setpc_b64 s[30:31]
20+
entry:
21+
%partialmaskvec = insertelement <2 x i1> poison, i1 %mask, i64 0
22+
%maskvec = shufflevector <2 x i1> %partialmaskvec, <2 x i1> poison, <2 x i32> zeroinitializer
23+
%result = tail call <2 x i32> @llvm.masked.load.v2i32.p1(ptr addrspace(1) %ptr, i32 2, <2 x i1> %maskvec, <2 x i32> zeroinitializer)
24+
ret <2 x i32> %result
25+
}
26+
27+
; Masked load of <4 x i32> through a global (addrspace(1)) pointer that is
; passed with the inreg attribute. The <4 x i1> mask is a splat of the single
; i1 %mask argument (insertelement into lane 0, then a shufflevector with an
; all-zero shuffle mask). The passthru operand is zeroinitializer and the
; alignment operand is 4.
; The expected code zeroes v0-v3, narrows exec with s_and_saveexec_b64 using
; the compare result in vcc, issues one global_load_dwordx4, and then restores
; exec with s_or_b64.
define <4 x i32> @uniform_masked_load_ptr1_mask_v4i32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
28+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v4i32:
29+
; GFX942: ; %bb.0: ; %entry
30+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
32+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
33+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
34+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
35+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
36+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
37+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
38+
; GFX942-NEXT: s_cbranch_execz .LBB1_2
39+
; GFX942-NEXT: ; %bb.1: ; %cond.load
40+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
41+
; GFX942-NEXT: .LBB1_2:
42+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
43+
; GFX942-NEXT: s_waitcnt vmcnt(0)
44+
; GFX942-NEXT: s_setpc_b64 s[30:31]
45+
entry:
46+
%partialmaskvec = insertelement <4 x i1> poison, i1 %mask, i64 0
47+
%maskvec = shufflevector <4 x i1> %partialmaskvec, <4 x i1> poison, <4 x i32> zeroinitializer
48+
%result = tail call <4 x i32> @llvm.masked.load.v4i32.p1(ptr addrspace(1) %ptr, i32 4, <4 x i1> %maskvec, <4 x i32> zeroinitializer)
49+
ret <4 x i32> %result
50+
}
51+
52+
; Masked load of <4 x float> through a global (addrspace(1)) pointer that is
; passed with the inreg attribute. The <4 x i1> mask is a splat of the single
; i1 %mask argument (insertelement into lane 0, then a shufflevector with an
; all-zero shuffle mask). The passthru operand is zeroinitializer and the
; alignment operand is 4.
; The expected instruction sequence is the same as in the <4 x i32> test
; above apart from the .LBB2_2 label: v0-v3 are zeroed and a single
; global_load_dwordx4 is executed under the exec mask saved by
; s_and_saveexec_b64.
define <4 x float> @uniform_masked_load_ptr1_mask_v4f32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
53+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v4f32:
54+
; GFX942: ; %bb.0: ; %entry
55+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
57+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
58+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
59+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
60+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
61+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
62+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
63+
; GFX942-NEXT: s_cbranch_execz .LBB2_2
64+
; GFX942-NEXT: ; %bb.1: ; %cond.load
65+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
66+
; GFX942-NEXT: .LBB2_2:
67+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
68+
; GFX942-NEXT: s_waitcnt vmcnt(0)
69+
; GFX942-NEXT: s_setpc_b64 s[30:31]
70+
entry:
71+
%partialmaskvec = insertelement <4 x i1> poison, i1 %mask, i64 0
72+
%maskvec = shufflevector <4 x i1> %partialmaskvec, <4 x i1> poison, <4 x i32> zeroinitializer
73+
%result = tail call <4 x float> @llvm.masked.load.v4f32.p1(ptr addrspace(1) %ptr, i32 4, <4 x i1> %maskvec, <4 x float> zeroinitializer)
74+
ret <4 x float> %result
75+
}
76+
77+
; Masked load of <8 x i32> through a global (addrspace(1)) pointer that is
; passed with the inreg attribute. The <8 x i1> mask is a splat of the single
; i1 %mask argument (insertelement into lane 0, then a shufflevector with an
; all-zero shuffle mask). The passthru operand is zeroinitializer and the
; alignment operand is 4.
; The expected code zeroes v0-v7 and, under the exec mask saved by
; s_and_saveexec_b64, performs the load as two global_load_dwordx4
; instructions: the upper half into v[4:7] from offset:16 first, then the
; lower half into v[0:3], with an s_nop 0 between them.
define <8 x i32> @uniform_masked_load_ptr1_mask_v8i32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
78+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8i32:
79+
; GFX942: ; %bb.0: ; %entry
80+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
81+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
82+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
83+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
84+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
85+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
86+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
87+
; GFX942-NEXT: v_mov_b32_e32 v4, v0
88+
; GFX942-NEXT: v_mov_b32_e32 v5, v0
89+
; GFX942-NEXT: v_mov_b32_e32 v6, v0
90+
; GFX942-NEXT: v_mov_b32_e32 v7, v0
91+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
92+
; GFX942-NEXT: s_cbranch_execz .LBB3_2
93+
; GFX942-NEXT: ; %bb.1: ; %cond.load
94+
; GFX942-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
95+
; GFX942-NEXT: s_nop 0
96+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
97+
; GFX942-NEXT: .LBB3_2:
98+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
99+
; GFX942-NEXT: s_waitcnt vmcnt(0)
100+
; GFX942-NEXT: s_setpc_b64 s[30:31]
101+
entry:
102+
%partialmaskvec = insertelement <8 x i1> poison, i1 %mask, i64 0
103+
%maskvec = shufflevector <8 x i1> %partialmaskvec, <8 x i1> poison, <8 x i32> zeroinitializer
104+
%result = tail call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) %ptr, i32 4, <8 x i1> %maskvec, <8 x i32> zeroinitializer)
105+
ret <8 x i32> %result
106+
}
107+
108+
; Masked load of <8 x float> through a global (addrspace(1)) pointer that is
; passed with the inreg attribute. The <8 x i1> mask is a splat of the single
; i1 %mask argument (insertelement into lane 0, then a shufflevector with an
; all-zero shuffle mask). The passthru operand is zeroinitializer and the
; alignment operand is 4.
; The expected code zeroes v0-v7 and, under the exec mask saved by
; s_and_saveexec_b64, performs the load as two global_load_dwordx4
; instructions (v[4:7] from offset:16, then v[0:3]) separated by an s_nop 0.
define <8 x float> @uniform_masked_load_ptr1_mask_v8f32(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
109+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8f32:
110+
; GFX942: ; %bb.0: ; %entry
111+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
113+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
114+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
115+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
116+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
117+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
118+
; GFX942-NEXT: v_mov_b32_e32 v4, v0
119+
; GFX942-NEXT: v_mov_b32_e32 v5, v0
120+
; GFX942-NEXT: v_mov_b32_e32 v6, v0
121+
; GFX942-NEXT: v_mov_b32_e32 v7, v0
122+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
123+
; GFX942-NEXT: s_cbranch_execz .LBB4_2
124+
; GFX942-NEXT: ; %bb.1: ; %cond.load
125+
; GFX942-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
126+
; GFX942-NEXT: s_nop 0
127+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
128+
; GFX942-NEXT: .LBB4_2:
129+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
130+
; GFX942-NEXT: s_waitcnt vmcnt(0)
131+
; GFX942-NEXT: s_setpc_b64 s[30:31]
132+
entry:
133+
%partialmaskvec = insertelement <8 x i1> poison, i1 %mask, i64 0
134+
%maskvec = shufflevector <8 x i1> %partialmaskvec, <8 x i1> poison, <8 x i32> zeroinitializer
135+
%result = tail call <8 x float> @llvm.masked.load.v8f32.p1(ptr addrspace(1) %ptr, i32 4, <8 x i1> %maskvec, <8 x float> zeroinitializer)
136+
ret <8 x float> %result
137+
}
138+
139+
; Masked load of <8 x i16> through a global (addrspace(1)) pointer that is
; passed with the inreg attribute. The <8 x i1> mask is a splat of the single
; i1 %mask argument (insertelement into lane 0, then a shufflevector with an
; all-zero shuffle mask). The passthru operand is zeroinitializer and the
; alignment operand is 4.
; The expected code zeroes v0-v3, narrows exec with s_and_saveexec_b64 using
; the compare result in vcc, issues one global_load_dwordx4, and then restores
; exec with s_or_b64.
define <8 x i16> @uniform_masked_load_ptr1_mask_v8i16(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
140+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8i16:
141+
; GFX942: ; %bb.0: ; %entry
142+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
144+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
145+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
146+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
147+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
148+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
149+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
150+
; GFX942-NEXT: s_cbranch_execz .LBB5_2
151+
; GFX942-NEXT: ; %bb.1: ; %cond.load
152+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
153+
; GFX942-NEXT: .LBB5_2:
154+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
155+
; GFX942-NEXT: s_waitcnt vmcnt(0)
156+
; GFX942-NEXT: s_setpc_b64 s[30:31]
157+
entry:
158+
; The lane index is a mask-vector position, unrelated to the i16 element
; type being loaded, so it is written as i64 0.
%partialmaskvec = insertelement <8 x i1> poison, i1 %mask, i64 0
159+
%maskvec = shufflevector <8 x i1> %partialmaskvec, <8 x i1> poison, <8 x i32> zeroinitializer
160+
%result = tail call <8 x i16> @llvm.masked.load.v8i16.p1(ptr addrspace(1) %ptr, i32 4, <8 x i1> %maskvec, <8 x i16> zeroinitializer)
161+
ret <8 x i16> %result
162+
}
163+
164+
; Masked load of <8 x half> through a global (addrspace(1)) pointer that is
; passed with the inreg attribute. The <8 x i1> mask is a splat of the single
; i1 %mask argument (insertelement into lane 0, then a shufflevector with an
; all-zero shuffle mask). The passthru operand is zeroinitializer and the
; alignment operand is 4.
; The expected code zeroes v0-v3, narrows exec with s_and_saveexec_b64 using
; the compare result in vcc, issues one global_load_dwordx4, and then restores
; exec with s_or_b64.
define <8 x half> @uniform_masked_load_ptr1_mask_v8f16(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
165+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8f16:
166+
; GFX942: ; %bb.0: ; %entry
167+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
169+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
170+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
171+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
172+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
173+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
174+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
175+
; GFX942-NEXT: s_cbranch_execz .LBB6_2
176+
; GFX942-NEXT: ; %bb.1: ; %cond.load
177+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
178+
; GFX942-NEXT: .LBB6_2:
179+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
180+
; GFX942-NEXT: s_waitcnt vmcnt(0)
181+
; GFX942-NEXT: s_setpc_b64 s[30:31]
182+
entry:
183+
; The lane index is a mask-vector position, unrelated to the 16-bit element
; type being loaded, so it is written as i64 0.
%partialmaskvec = insertelement <8 x i1> poison, i1 %mask, i64 0
184+
%maskvec = shufflevector <8 x i1> %partialmaskvec, <8 x i1> poison, <8 x i32> zeroinitializer
185+
%result = tail call <8 x half> @llvm.masked.load.v8f16.p1(ptr addrspace(1) %ptr, i32 4, <8 x i1> %maskvec, <8 x half> zeroinitializer)
186+
ret <8 x half> %result
187+
}
188+
189+
; Masked load of <8 x bfloat> through a global (addrspace(1)) pointer that is
; passed with the inreg attribute. The <8 x i1> mask is a splat of the single
; i1 %mask argument (insertelement into lane 0, then a shufflevector with an
; all-zero shuffle mask). The passthru operand is zeroinitializer and the
; alignment operand is 4.
; The expected code zeroes v0-v3, narrows exec with s_and_saveexec_b64 using
; the compare result in vcc, issues one global_load_dwordx4, and then restores
; exec with s_or_b64.
define <8 x bfloat> @uniform_masked_load_ptr1_mask_v8bf16(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
190+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v8bf16:
191+
; GFX942: ; %bb.0: ; %entry
192+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
194+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
195+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
196+
; GFX942-NEXT: v_mov_b32_e32 v1, v0
197+
; GFX942-NEXT: v_mov_b32_e32 v2, v0
198+
; GFX942-NEXT: v_mov_b32_e32 v3, v0
199+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
200+
; GFX942-NEXT: s_cbranch_execz .LBB7_2
201+
; GFX942-NEXT: ; %bb.1: ; %cond.load
202+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
203+
; GFX942-NEXT: .LBB7_2:
204+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
205+
; GFX942-NEXT: s_waitcnt vmcnt(0)
206+
; GFX942-NEXT: s_setpc_b64 s[30:31]
207+
entry:
208+
; Lane 0 of the mask vector; the index is written as i64 0.
%partialmaskvec = insertelement <8 x i1> poison, i1 %mask, i64 0
209+
%maskvec = shufflevector <8 x i1> %partialmaskvec, <8 x i1> poison, <8 x i32> zeroinitializer
210+
%result = tail call <8 x bfloat> @llvm.masked.load.v8bf16.p1(ptr addrspace(1) %ptr, i32 4, <8 x i1> %maskvec, <8 x bfloat> zeroinitializer)
211+
ret <8 x bfloat> %result
212+
}
213+
214+
; Masked load of <16 x i8> through a global (addrspace(1)) pointer that is
; passed with the inreg attribute. The <16 x i1> mask is a splat of the single
; i1 %mask argument (insertelement into lane 0, then a shufflevector with an
; all-zero shuffle mask). The passthru operand is zeroinitializer and the
; alignment operand is 4.
; Unlike the wider-element tests, the expected code zeroes and loads into
; v[16:19] (one global_load_dwordx4 under the exec mask saved by
; s_and_saveexec_b64). After exec is restored, the four loaded dwords are
; spread across v0-v15 using right shifts by 8, 16 and 24 bits plus plain
; register moves.
define <16 x i8> @uniform_masked_load_ptr1_mask_v16i8(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) {
215+
; GFX942-LABEL: uniform_masked_load_ptr1_mask_v16i8:
216+
; GFX942: ; %bb.0: ; %entry
217+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218+
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
219+
; GFX942-NEXT: v_mov_b32_e32 v16, 0
220+
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
221+
; GFX942-NEXT: v_mov_b32_e32 v17, v16
222+
; GFX942-NEXT: v_mov_b32_e32 v18, v16
223+
; GFX942-NEXT: v_mov_b32_e32 v19, v16
224+
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
225+
; GFX942-NEXT: s_cbranch_execz .LBB8_2
226+
; GFX942-NEXT: ; %bb.1: ; %cond.load
227+
; GFX942-NEXT: global_load_dwordx4 v[16:19], v16, s[0:1]
228+
; GFX942-NEXT: .LBB8_2:
229+
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
230+
; GFX942-NEXT: s_waitcnt vmcnt(0)
231+
; GFX942-NEXT: v_lshrrev_b64 v[20:21], 24, v[16:17]
232+
; GFX942-NEXT: v_lshrrev_b64 v[22:23], 24, v[18:19]
233+
; GFX942-NEXT: v_lshrrev_b32_e32 v1, 8, v16
234+
; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v16
235+
; GFX942-NEXT: v_lshrrev_b32_e32 v5, 8, v17
236+
; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v17
237+
; GFX942-NEXT: v_lshrrev_b32_e32 v7, 24, v17
238+
; GFX942-NEXT: v_lshrrev_b32_e32 v9, 8, v18
239+
; GFX942-NEXT: v_lshrrev_b32_e32 v10, 16, v18
240+
; GFX942-NEXT: v_lshrrev_b32_e32 v13, 8, v19
241+
; GFX942-NEXT: v_lshrrev_b32_e32 v14, 16, v19
242+
; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v19
243+
; GFX942-NEXT: v_mov_b32_e32 v0, v16
244+
; GFX942-NEXT: v_mov_b32_e32 v3, v20
245+
; GFX942-NEXT: v_mov_b32_e32 v4, v17
246+
; GFX942-NEXT: v_mov_b32_e32 v8, v18
247+
; GFX942-NEXT: v_mov_b32_e32 v11, v22
248+
; GFX942-NEXT: v_mov_b32_e32 v12, v19
249+
; GFX942-NEXT: s_setpc_b64 s[30:31]
250+
entry:
251+
; Lane 0 of the mask vector; the index is written as i64 0.
%partialmaskvec = insertelement <16 x i1> poison, i1 %mask, i64 0
252+
%maskvec = shufflevector <16 x i1> %partialmaskvec, <16 x i1> poison, <16 x i32> zeroinitializer
253+
%result = tail call <16 x i8> @llvm.masked.load.v16i8.p1(ptr addrspace(1) %ptr, i32 4, <16 x i1> %maskvec, <16 x i8> zeroinitializer)
254+
ret <16 x i8> %result
255+
}

0 commit comments

Comments
 (0)