Skip to content

[NFC][AMDGPU] Auto generate check lines for llvm/test/CodeGen/AMDGPU/packed-fp32.ll #131629

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into main from users/shiltian/autogen-for-packed-fp32
Mar 17, 2025

Conversation

shiltian
Copy link
Contributor

No description provided.

Copy link
Contributor Author

This stack of pull requests is managed by Graphite. Learn more about stacking.

@llvmbot
Copy link
Member

llvmbot commented Mar 17, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Shilei Tian (shiltian)

Changes

Patch is 114.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131629.diff

1 File Affected:

  • (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+1810-193)
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 2004e1eb061bf..28a995e74f7ab 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -1,13 +1,34 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s
-
-; GCN-LABEL: {{^}}fadd_v2_vv:
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; PACKED:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
+
 define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_vv:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT:    s_endpgm
+;
+; PACKED-LABEL: fadd_v2_vv:
+; PACKED:       ; %bb.0:
+; PACKED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[0:1]
+; PACKED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -16,10 +37,30 @@ define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_v2_vs:
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
 define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
+; GFX900-LABEL: fadd_v2_vs:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v1, s3, v1
+; GFX900-NEXT:    v_add_f32_e32 v0, s2, v0
+; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT:    s_endpgm
+;
+; PACKED-LABEL: fadd_v2_vs:
+; PACKED:       ; %bb.0:
+; PACKED-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -28,10 +69,49 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_v4_vs:
-; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
 define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
+; GFX900-LABEL: fadd_v4_vs:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v3, s3, v3
+; GFX900-NEXT:    v_add_f32_e32 v2, s2, v2
+; GFX900-NEXT:    v_add_f32_e32 v1, s1, v1
+; GFX900-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX900-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX900-NEXT:    s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v4_vs:
+; PACKED-SDAG:       ; %bb.0:
+; PACKED-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; PACKED-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[2:3]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[0:1]
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; PACKED-SDAG-NEXT:    s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v4_vs:
+; PACKED-GISEL:       ; %bb.0:
+; PACKED-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; PACKED-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[0:1]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[2:3]
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; PACKED-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <4 x float>, ptr addrspace(1) %gep, align 16
@@ -40,10 +120,163 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_v32_vs:
-; GFX900-COUNT-32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-COUNT-16: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
 define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
+; GFX900-LABEL: fadd_v32_vs:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
+; GFX900-NEXT:    global_load_dwordx4 v[5:8], v0, s[0:1]
+; GFX900-NEXT:    global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
+; GFX900-NEXT:    global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
+; GFX900-NEXT:    global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
+; GFX900-NEXT:    global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
+; GFX900-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; GFX900-NEXT:    global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT:    global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
+; GFX900-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v4, s43, v4
+; GFX900-NEXT:    v_add_f32_e32 v3, s42, v3
+; GFX900-NEXT:    v_add_f32_e32 v2, s41, v2
+; GFX900-NEXT:    v_add_f32_e32 v1, s40, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(6)
+; GFX900-NEXT:    v_add_f32_e32 v8, s39, v8
+; GFX900-NEXT:    v_add_f32_e32 v7, s38, v7
+; GFX900-NEXT:    v_add_f32_e32 v6, s37, v6
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v32, s19, v32
+; GFX900-NEXT:    v_add_f32_e32 v31, s18, v31
+; GFX900-NEXT:    v_add_f32_e32 v30, s17, v30
+; GFX900-NEXT:    v_add_f32_e32 v29, s16, v29
+; GFX900-NEXT:    v_add_f32_e32 v5, s36, v5
+; GFX900-NEXT:    v_add_f32_e32 v12, s51, v12
+; GFX900-NEXT:    v_add_f32_e32 v11, s50, v11
+; GFX900-NEXT:    v_add_f32_e32 v10, s49, v10
+; GFX900-NEXT:    v_add_f32_e32 v9, s48, v9
+; GFX900-NEXT:    v_add_f32_e32 v16, s47, v16
+; GFX900-NEXT:    v_add_f32_e32 v15, s46, v15
+; GFX900-NEXT:    v_add_f32_e32 v14, s45, v14
+; GFX900-NEXT:    v_add_f32_e32 v13, s44, v13
+; GFX900-NEXT:    v_add_f32_e32 v20, s15, v20
+; GFX900-NEXT:    v_add_f32_e32 v19, s14, v19
+; GFX900-NEXT:    v_add_f32_e32 v18, s13, v18
+; GFX900-NEXT:    v_add_f32_e32 v17, s12, v17
+; GFX900-NEXT:    v_add_f32_e32 v24, s11, v24
+; GFX900-NEXT:    v_add_f32_e32 v23, s10, v23
+; GFX900-NEXT:    v_add_f32_e32 v22, s9, v22
+; GFX900-NEXT:    v_add_f32_e32 v21, s8, v21
+; GFX900-NEXT:    v_add_f32_e32 v28, s23, v28
+; GFX900-NEXT:    v_add_f32_e32 v27, s22, v27
+; GFX900-NEXT:    v_add_f32_e32 v26, s21, v26
+; GFX900-NEXT:    v_add_f32_e32 v25, s20, v25
+; GFX900-NEXT:    global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
+; GFX900-NEXT:    global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
+; GFX900-NEXT:    global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
+; GFX900-NEXT:    global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
+; GFX900-NEXT:    global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
+; GFX900-NEXT:    global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
+; GFX900-NEXT:    global_store_dwordx4 v0, v[5:8], s[0:1]
+; GFX900-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
+; GFX900-NEXT:    s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v32_vs:
+; PACKED-SDAG:       ; %bb.0:
+; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; PACKED-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[40:41]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[42:43]
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[6:7], v[6:7], s[38:39]
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[8:9], v[8:9], s[48:49]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[10:11], v[10:11], s[50:51]
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[16:17], v[16:17], s[44:45]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[18:19], v[18:19], s[46:47]
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[28:29], v[28:29], s[16:17]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[30:31], v[30:31], s[18:19]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[20:21], v[20:21], s[12:13]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[22:23], v[22:23], s[14:15]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[14:15], v[14:15], s[10:11]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[24:25], v[24:25], s[20:21]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[26:27], v[26:27], s[22:23]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[4:5], v[4:5], s[36:37]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[12:13], v[12:13], s[8:9]
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; PACKED-SDAG-NEXT:    s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v32_vs:
+; PACKED-GISEL:       ; %bb.0:
+; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; PACKED-GISEL-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[36:37]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[38:39]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(6)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[4:5], v[4:5], s[40:41]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[6:7], v[6:7], s[42:43]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[8:9], v[8:9], s[44:45]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[10:11], v[10:11], s[46:47]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[12:13], v[12:13], s[48:49]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[14:15], v[14:15], s[50:51]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[16:17], v[16:17], s[8:9]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[18:19], v[18:19], s[10:11]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[20:21], v[20:21], s[12:13]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[22:23], v[22:23], s[14:15]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[24:25], v[24:25], s[16:17]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[26:27], v[26:27], s[18:19]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[28:29], v[28:29], s[20:21]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[30:31], v[30:31], s[22:23]
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; PACKED-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <32 x float>, ptr addrspace(1) %gep, align 128
@@ -53,13 +286,45 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 }
 
 ; FIXME: GISel does not use op_sel for splat constants.
-
-; GCN-LABEL: {{^}}fadd_v2_v_imm:
-; PACKED:         s_mov_b32 s[[K:[0-9]+]], 0x42c80000
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
-; PACKED-SDAG:    v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}]{{$}}
 define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_v_imm:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v1, 0x42c80000, v1
+; GFX900-NEXT:    v_add_f32_e32 v0, 0x42c80000, v0
+; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT:    s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v2_v_imm:
+; PACKED-SDAG:       ; %bb.0:
+; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT:    s_mov_b32 s2, 0x42c80000
+; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0]
+; PACKED-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT:    s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v2_v_imm:
+; PACKED-GISEL:       ; %bb.0:
+; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT:    s_mov_b32 s2, 0x42c80000
+; PACKED-GISEL-NEXT:    s_mov_b32 s3, s2
+; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -68,11 +333,43 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_v2_v_v_splat:
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
-; PACKED-SDAG:    v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1]{{$}}
 define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_v_v_splat:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v2, v2, v0
+; GFX900-NEXT:    v_add_f32_e32 v1, v1, v0
+; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[0:1]
+; GFX900-NEXT:    s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v2_v_v_splat:
+; PACKED-SDAG:       ; %bb.0:
+; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
...
[truncated]

Copy link
Contributor

@Sisyph Sisyph left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@shiltian shiltian merged commit e2c43ba into main Mar 17, 2025
8 of 12 checks passed
@shiltian shiltian deleted the users/shiltian/autogen-for-packed-fp32 branch March 17, 2025 15:42
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants