-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Auto-generating lit test patterns (NFC) #93837
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
The test CodeGen/AMDGPU/build_vector.ll has lit check patterns that are partially hand-written and partially auto-generated. This mix makes the test awkward to update when future patches change the generated code. This patch auto-generates the entire set of check patterns instead.
@llvm/pr-subscribers-backend-amdgpu Author: Christudasan Devadasan (cdevadas) Changes: The test CodeGen/AMDGPU/build_vector.ll has lit check patterns that are partially hand-written and partially auto-generated. This mix makes the test awkward to update when future patches change the generated code. This patch auto-generates the entire set of check patterns instead. Full diff: https://github.com/llvm/llvm-project/pull/93837.diff 1 Files Affected:
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 99755133f36d6..f23cd0d345104 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -1,95 +1,311 @@
-; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefixes=R600,ALL
-; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=GFX6,GFX678,ALL
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8,GFX678,ALL
-; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10,GFX1011,ALL
-; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX1011,ALL
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s --check-prefixes=GFX940,ALL
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefixes=R600
+; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=GCN
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8
+; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10
+; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s --check-prefixes=GFX940
-; ALL-LABEL: {{^}}build_vector2:
-; R600: MOV
-; R600: MOV
-; R600-NOT: MOV
-; GFX678-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
-; GFX678-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
-; GFX1011-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
-; GFX1011-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
-; GFX678: buffer_store_dwordx2 v[[[X]]:[[Y]]]
-; GFX10: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX11: global_store_b64 v2, v[0:1], s[0:1]
define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
+; R600-LABEL: build_vector2:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.Y, literal.x,
+; R600-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; R600-NEXT: MOV T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 5(7.006492e-45), 2(2.802597e-45)
+;
+; GCN-LABEL: build_vector2:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, 5
+; GCN-NEXT: v_mov_b32_e32 v1, 6
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector2:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, 6
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector2:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 5
+; GFX10-NEXT: v_mov_b32_e32 v1, 6
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_vector2:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, 5
+; GFX11-NEXT: v_mov_b32_e32 v1, 6
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_vector2:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 5
+; GFX940-NEXT: v_mov_b32_e32 v1, 6
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
ret void
}
-; ALL-LABEL: {{^}}build_vector4:
-; R600: MOV
-; R600: MOV
-; R600: MOV
-; R600: MOV
-; R600-NOT: MOV
-; GFX678-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
-; GFX678-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
-; GFX678-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
-; GFX678-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
-; GFX1011-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
-; GFX1011-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
-; GFX1011-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
-; GFX1011-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
-; GFX678: buffer_store_dwordx4 v[[[X]]:[[W]]]
-; GFX10: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX11: global_store_b128 v4, v[0:3], s[0:1]
define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
+; R600-LABEL: build_vector4:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, literal.x,
+; R600-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; R600-NEXT: MOV * T0.Z, literal.x,
+; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
+; R600-NEXT: MOV * T0.Y, literal.x,
+; R600-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; R600-NEXT: MOV T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 5(7.006492e-45), 2(2.802597e-45)
+;
+; GCN-LABEL: build_vector4:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, 5
+; GCN-NEXT: v_mov_b32_e32 v1, 6
+; GCN-NEXT: v_mov_b32_e32 v2, 7
+; GCN-NEXT: v_mov_b32_e32 v3, 8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector4:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, 6
+; GFX8-NEXT: v_mov_b32_e32 v2, 7
+; GFX8-NEXT: v_mov_b32_e32 v3, 8
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector4:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 5
+; GFX10-NEXT: v_mov_b32_e32 v1, 6
+; GFX10-NEXT: v_mov_b32_e32 v2, 7
+; GFX10-NEXT: v_mov_b32_e32 v3, 8
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_vector4:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, 5
+; GFX11-NEXT: v_mov_b32_e32 v1, 6
+; GFX11-NEXT: v_mov_b32_e32 v2, 7
+; GFX11-NEXT: v_mov_b32_e32 v3, 8
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_vector4:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 5
+; GFX940-NEXT: v_mov_b32_e32 v1, 6
+; GFX940-NEXT: v_mov_b32_e32 v2, 7
+; GFX940-NEXT: v_mov_b32_e32 v3, 8
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
ret void
}
-
-; ALL-LABEL: {{^}}build_vector_v2i16:
-; R600: MOV
-; R600-NOT: MOV
-; GFX678: s_mov_b32 s3, 0xf000
-; GFX678: s_mov_b32 s2, -1
-; GFX678: v_mov_b32_e32 v0, 0x60005
-; GFX678: s_waitcnt lgkmcnt(0)
-; GFX678: buffer_store_dword v0, off, s[0:3], 0
-; GFX1011: v_mov_b32_e32 v0, 0
-; GFX1011: v_mov_b32_e32 v1, 0x60005
-; GFX1011: s_waitcnt lgkmcnt(0)
-; GFX10: global_store_dword v0, v1, s[0:1]
-; GFX11: global_store_b32 v0, v1, s[0:1]
define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
+; R600-LABEL: build_vector_v2i16:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV T4.X, literal.x,
+; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; R600-NEXT: 393221(5.510200e-40), 2(2.802597e-45)
+;
+; GCN-LABEL: build_vector_v2i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x60005
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector_v2i16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x60005
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector_v2i16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x60005
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_vector_v2i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0x60005
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_vector_v2i16:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
entry:
store <2 x i16> <i16 5, i16 6>, ptr addrspace(1) %out
ret void
}
-; ALL-LABEL: {{^}}build_vector_v2i16_trunc:
-; R600: LSHR
-; R600: OR_INT
-; R600: LSHR
-; R600-NOT: MOV
-; GFX6: s_mov_b32 s3, 0xf000
-; GFX6: s_waitcnt lgkmcnt(0)
-; GFX6: v_alignbit_b32 v0, 5, s4, 16
-; GFX6: buffer_store_dword v0, off, s[0:3], 0
-; GFX8: s_mov_b32 s3, 0xf000
-; GFX8: s_mov_b32 s2, -1
-; GFX8: s_waitcnt lgkmcnt(0)
-; GFX8: s_lshr_b32 s4, s4, 16
-; GFX8: s_or_b32 s4, s4, 0x50000
-; GFX8: v_mov_b32_e32 v0, s4
-; GFX8: buffer_store_dword v0, off, s[0:3], 0
-; GFX1011: v_mov_b32_e32 v0, 0
-; GFX1011: s_waitcnt lgkmcnt(0)
-; GFX10: s_lshr_b32 s2, s2, 16
-; GFX10: s_pack_ll_b32_b16 s2, s2, 5
-; GFX11: s_pack_hl_b32_b16 s2, s2, 5
-; GFX1011: v_mov_b32_e32 v1, s2
-; GFX10: global_store_dword v0, v1, s[0:1]
-; GFX11: global_store_b32 v0, v1, s[0:1]
define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 %a) {
+; R600-LABEL: build_vector_v2i16_trunc:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
+; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; R600-NEXT: OR_INT T4.X, PV.W, literal.x,
+; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; R600-NEXT: 327680(4.591775e-40), 2(2.802597e-45)
+;
+; GCN-LABEL: build_vector_v2i16_trunc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s4, s[0:1], 0xb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v0, 5, s4, 16
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector_v2i16_trunc:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshr_b32 s4, s4, 16
+; GFX8-NEXT: s_or_b32 s4, s4, 0x50000
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector_v2i16_trunc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x8
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_lshr_b32 s2, s2, 16
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, 5
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_vector_v2i16_trunc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_pack_hl_b32_b16 s2, s2, 5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_vector_v2i16_trunc:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, 5
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
%srl = lshr i32 %a, 16
%trunc = trunc i32 %srl to i16
%ins.0 = insertelement <2 x i16> undef, i16 %trunc, i32 0
@@ -98,6 +314,7 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
ret void
}
+define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) {
; R600-LABEL: build_v2i32_from_v4i16_shuffle:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 0, @10, KC0[], KC1[]
@@ -118,20 +335,20 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; R600-NEXT: 16(2.242078e-44), 2(2.802597e-45)
;
-; GFX6-LABEL: build_v2i32_from_v4i16_shuffle:
-; GFX6: ; %bb.0: ; %entry
-; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_lshl_b32 s2, s2, 16
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX6-NEXT: s_endpgm
+; GCN-LABEL: build_v2i32_from_v4i16_shuffle:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 s3, s3, 16
+; GCN-NEXT: s_lshl_b32 s2, s2, 16
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
;
; GFX8-LABEL: build_v2i32_from_v4i16_shuffle:
; GFX8: ; %bb.0: ; %entry
@@ -185,7 +402,6 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
; GFX940-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_endpgm
-define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) {
entry:
%shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
%zextended = zext <2 x i16> %shuf to <2 x i32>
|
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefixes=R600 | ||
; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=GCN | ||
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8 | ||
; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10 | ||
; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11 | ||
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s --check-prefixes=GFX940 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Test content could probably use some cleanup and regularization too. Might also want to split out r600?
If you are generating checks, there's no reason to keep -mattr=-flat-for-global
Should drop the -verify-machineinstrs. Should also try adding common prefixes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Test content could probably use some cleanup and regularization too. Might also want to split out r600?
I don't know the original intention of the test well enough to do further cleanup. What do you mean by splitting out r600?
If you are generating checks, there's no reason to keep -mattr=-flat-for-global
Why?
Should drop the -verify-machineinstrs. Should also try adding common prefixes?
Why not -verify-machineinstrs?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should also try adding common prefixes?
The common prefixes didn't make any difference. So I removed them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Test content could probably use some cleanup and regularization too. Might also want to split out r600?
I don't know the original intention of the test well enough to do further cleanup. What do you mean by splitting out r600?
I mean have a separate r600 version of the test so we don't have mixed r600 and amdgcn run lines
If you are generating checks, there's no reason to keep -mattr=-flat-for-global
Why?
Because the main reason we have this anywhere is to make the gfx6/7 output more closely match gfx8 when hand-writing checks.
Should drop the -verify-machineinstrs. Should also try adding common prefixes?
Why not -verify-machineinstrs?
It's very expensive. It is on by default in EXPENSIVE_CHECKS builds, so we should only use it in targeted tests for verifier errors rather than blindly adding it to every test, as we've usually done.
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8 | ||
; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10 | ||
; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11 | ||
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s --check-prefixes=GFX940 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
might as well drop the verifies
; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX1011,ALL | ||
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s --check-prefixes=GFX940,ALL | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: llc < %s -mtriple=amdgcn | FileCheck %s --check-prefixes=GCN |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
might as well switch this to using -mcpu=tahiti
The test CodeGen/AMDGPU/build_vector.ll has lit check patterns that are partially hand-written and partially auto-generated. This mix makes the test awkward to update when future patches change the generated code. This patch auto-generates the entire set of check patterns instead.