|
1 | 1 | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
2 | 2 | ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI
|
3 |
| -; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI |
| 3 | +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX89,VI |
| 4 | +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX89,GFX9 |
4 | 5 |
|
5 | 6 | ; XXX - Why the packing?
|
6 | 7 | define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
|
@@ -43,6 +44,27 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add
|
43 | 44 | ; VI-NEXT: v_mov_b32_e32 v1, v0
|
44 | 45 | ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
45 | 46 | ; VI-NEXT: s_endpgm
|
| 47 | +; |
| 48 | +; GFX9-LABEL: scalar_to_vector_v2i32: |
| 49 | +; GFX9: ; %bb.0: |
| 50 | +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| 51 | +; GFX9-NEXT: s_mov_b32 s7, 0xf000 |
| 52 | +; GFX9-NEXT: s_mov_b32 s6, -1 |
| 53 | +; GFX9-NEXT: s_mov_b32 s10, s6 |
| 54 | +; GFX9-NEXT: s_mov_b32 s11, s7 |
| 55 | +; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| 56 | +; GFX9-NEXT: s_mov_b32 s8, s2 |
| 57 | +; GFX9-NEXT: s_mov_b32 s9, s3 |
| 58 | +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 |
| 59 | +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 |
| 60 | +; GFX9-NEXT: s_mov_b32 s4, s0 |
| 61 | +; GFX9-NEXT: s_mov_b32 s5, s1 |
| 62 | +; GFX9-NEXT: s_waitcnt vmcnt(0) |
| 63 | +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 |
| 64 | +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 |
| 65 | +; GFX9-NEXT: v_mov_b32_e32 v1, v0 |
| 66 | +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 |
| 67 | +; GFX9-NEXT: s_endpgm |
46 | 68 | %tmp1 = load i32, ptr addrspace(1) %in, align 4
|
47 | 69 | %bc = bitcast i32 %tmp1 to <2 x i16>
|
48 | 70 | %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
|
@@ -90,6 +112,27 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add
|
90 | 112 | ; VI-NEXT: v_mov_b32_e32 v1, v0
|
91 | 113 | ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
92 | 114 | ; VI-NEXT: s_endpgm
|
| 115 | +; |
| 116 | +; GFX9-LABEL: scalar_to_vector_v2f32: |
| 117 | +; GFX9: ; %bb.0: |
| 118 | +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 |
| 119 | +; GFX9-NEXT: s_mov_b32 s7, 0xf000 |
| 120 | +; GFX9-NEXT: s_mov_b32 s6, -1 |
| 121 | +; GFX9-NEXT: s_mov_b32 s10, s6 |
| 122 | +; GFX9-NEXT: s_mov_b32 s11, s7 |
| 123 | +; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| 124 | +; GFX9-NEXT: s_mov_b32 s8, s2 |
| 125 | +; GFX9-NEXT: s_mov_b32 s9, s3 |
| 126 | +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 |
| 127 | +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 |
| 128 | +; GFX9-NEXT: s_mov_b32 s4, s0 |
| 129 | +; GFX9-NEXT: s_mov_b32 s5, s1 |
| 130 | +; GFX9-NEXT: s_waitcnt vmcnt(0) |
| 131 | +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 |
| 132 | +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 |
| 133 | +; GFX9-NEXT: v_mov_b32_e32 v1, v0 |
| 134 | +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 |
| 135 | +; GFX9-NEXT: s_endpgm |
93 | 136 | %tmp1 = load float, ptr addrspace(1) %in, align 4
|
94 | 137 | %bc = bitcast float %tmp1 to <2 x i16>
|
95 | 138 | %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
|
@@ -130,6 +173,23 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() {
|
130 | 173 | ; VI-NEXT: v_mov_b32_e32 v1, s0
|
131 | 174 | ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
132 | 175 | ; VI-NEXT: s_endpgm
|
| 176 | +; |
| 177 | +; GFX9-LABEL: scalar_to_vector_v4i16: |
| 178 | +; GFX9: ; %bb.0: ; %bb |
| 179 | +; GFX9-NEXT: s_mov_b32 s3, 0xf000 |
| 180 | +; GFX9-NEXT: s_mov_b32 s2, -1 |
| 181 | +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 |
| 182 | +; GFX9-NEXT: s_waitcnt vmcnt(0) |
| 183 | +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| 184 | +; GFX9-NEXT: s_lshl_b32 s1, s0, 8 |
| 185 | +; GFX9-NEXT: s_or_b32 s0, s0, s1 |
| 186 | +; GFX9-NEXT: s_and_b32 s1, s0, 0xffff |
| 187 | +; GFX9-NEXT: s_lshl_b32 s0, s0, 16 |
| 188 | +; GFX9-NEXT: s_or_b32 s0, s1, s0 |
| 189 | +; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| 190 | +; GFX9-NEXT: v_mov_b32_e32 v1, s0 |
| 191 | +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 |
| 192 | +; GFX9-NEXT: s_endpgm |
133 | 193 | bb:
|
134 | 194 | %tmp = load <2 x i8>, ptr addrspace(1) undef, align 1
|
135 | 195 | %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
@@ -176,6 +236,28 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() {
|
176 | 236 | ; VI-NEXT: v_mov_b32_e32 v1, s1
|
177 | 237 | ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
178 | 238 | ; VI-NEXT: s_endpgm
|
| 239 | +; |
| 240 | +; GFX9-LABEL: scalar_to_vector_v4f16: |
| 241 | +; GFX9: ; %bb.0: ; %bb |
| 242 | +; GFX9-NEXT: s_mov_b32 s3, 0xf000 |
| 243 | +; GFX9-NEXT: s_mov_b32 s2, -1 |
| 244 | +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 |
| 245 | +; GFX9-NEXT: s_waitcnt vmcnt(0) |
| 246 | +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| 247 | +; GFX9-NEXT: s_lshl_b32 s1, s0, 8 |
| 248 | +; GFX9-NEXT: s_or_b32 s0, s1, s0 |
| 249 | +; GFX9-NEXT: s_and_b32 s1, s0, 0xff00 |
| 250 | +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x80008 |
| 251 | +; GFX9-NEXT: s_or_b32 s1, s4, s1 |
| 252 | +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff |
| 253 | +; GFX9-NEXT: s_and_b32 s4, s1, 0xffff |
| 254 | +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 |
| 255 | +; GFX9-NEXT: s_or_b32 s4, s4, s1 |
| 256 | +; GFX9-NEXT: s_or_b32 s0, s0, s1 |
| 257 | +; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| 258 | +; GFX9-NEXT: v_mov_b32_e32 v1, s4 |
| 259 | +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 |
| 260 | +; GFX9-NEXT: s_endpgm |
179 | 261 | bb:
|
180 | 262 | %load = load half, ptr addrspace(1) undef, align 1
|
181 | 263 | %tmp = bitcast half %load to <2 x i8>
|
@@ -235,16 +317,16 @@ define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zero
|
235 | 317 | ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
236 | 318 | ; SI-NEXT: s_endpgm
|
237 | 319 | ;
|
238 |
| -; VI-LABEL: scalar_to_vector_test6: |
239 |
| -; VI: ; %bb.0: |
240 |
| -; VI-NEXT: s_load_dword s6, s[4:5], 0x2c |
241 |
| -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
242 |
| -; VI-NEXT: s_mov_b32 s3, 0xf000 |
243 |
| -; VI-NEXT: s_mov_b32 s2, -1 |
244 |
| -; VI-NEXT: s_waitcnt lgkmcnt(0) |
245 |
| -; VI-NEXT: v_mov_b32_e32 v0, s6 |
246 |
| -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
247 |
| -; VI-NEXT: s_endpgm |
| 320 | +; GFX89-LABEL: scalar_to_vector_test6: |
| 321 | +; GFX89: ; %bb.0: |
| 322 | +; GFX89-NEXT: s_load_dword s6, s[4:5], 0x2c |
| 323 | +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 |
| 324 | +; GFX89-NEXT: s_mov_b32 s3, 0xf000 |
| 325 | +; GFX89-NEXT: s_mov_b32 s2, -1 |
| 326 | +; GFX89-NEXT: s_waitcnt lgkmcnt(0) |
| 327 | +; GFX89-NEXT: v_mov_b32_e32 v0, s6 |
| 328 | +; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| 329 | +; GFX89-NEXT: s_endpgm |
248 | 330 | %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
|
249 | 331 | %bc = bitcast <4 x i8> %newvec0 to <2 x half>
|
250 | 332 | store <2 x half> %bc, ptr addrspace(1) %out
|
|
0 commit comments