-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU][True16[MC] true16 for v_max3/min3_num_f16 #121510
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
0e3d476
to
a31d29a
Compare
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) ChangesV_MAX3/MIN3_NUM_F16 are alias GFX12 instructions with V_MAX3/MIN3_F16 in GFX11 and they should be updated together. This fix a bug introduced in #113603 such that only V_MAX3/MIN3_F16 are replaced in true16 format. Also added GFX12 runlines for CodeGen test Patch is 169.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121510.diff 10 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 22e457674c07a1..d00c810859e3b8 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1578,8 +1578,8 @@ def : MinimumMaximumByMinimum3Maximum3<fmaximum, f32, V_MAXIMUM3_F32_e64>;
defm V_MIN3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">;
defm V_MAX3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">;
-defm V_MIN3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">;
-defm V_MAX3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">;
+defm V_MIN3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">;
+defm V_MAX3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">;
defm V_MINIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22d>;
defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>;
defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>;
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 4b3f0dbbaea984..fbcdbed338e60c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
; SI-LABEL: test_fmax3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
; GFX11-NEXT: v_pk_max_f16 v0, v2, v0
; GFX11-NEXT: v_pk_max_f16 v0, v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmax3_v2f16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 38b712e044df93..269fd52df5c49f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
; SI-LABEL: test_fmin3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_min3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_min3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_min3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_min3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
; GFX11-NEXT: v_pk_min_f16 v0, v2, v0
; GFX11-NEXT: v_pk_min_f16 v0, v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmin3_v2f16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_min_num_f16 v0, v2, v0
+; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
%min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
@@ -734,6 +868,39 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s6
+; GFX12-NEXT: s_mov_b32 s13, s7
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile double, ptr addrspace(1) %aptr, align 4
%b = load volatile double, ptr addrspace(1) %bptr, align 4
%c = load volatile double, ptr addrspace(1) %cptr, align 4
@@ -877,6 +1044,39 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+...
[truncated]
|
@llvm/pr-subscribers-mc Author: Brox Chen (broxigarchen) ChangesV_MAX3/MIN3_NUM_F16 are alias GFX12 instructions with V_MAX3/MIN3_F16 in GFX11 and they should be updated together. This fix a bug introduced in #113603 such that only V_MAX3/MIN3_F16 are replaced in true16 format. Also added GFX12 runlines for CodeGen test Patch is 169.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121510.diff 10 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 22e457674c07a1..d00c810859e3b8 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1578,8 +1578,8 @@ def : MinimumMaximumByMinimum3Maximum3<fmaximum, f32, V_MAXIMUM3_F32_e64>;
defm V_MIN3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">;
defm V_MAX3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">;
-defm V_MIN3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">;
-defm V_MAX3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">;
+defm V_MIN3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">;
+defm V_MAX3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">;
defm V_MINIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22d>;
defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>;
defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>;
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 4b3f0dbbaea984..fbcdbed338e60c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
; SI-LABEL: test_fmax3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
; GFX11-NEXT: v_pk_max_f16 v0, v2, v0
; GFX11-NEXT: v_pk_max_f16 v0, v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmax3_v2f16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v0, v2, v0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 38b712e044df93..269fd52df5c49f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
; SI-LABEL: test_fmin3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_min3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_min3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile float, ptr addrspace(1) %aptr, align 4
%b = load volatile float, ptr addrspace(1) %bptr, align 4
%c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_min3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_mov_b32 s22, s10
+; GFX12-NEXT: s_mov_b32 s23, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: s_mov_b32 s20, s6
+; GFX12-NEXT: s_mov_b32 s21, s7
+; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_min3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT: buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile half, ptr addrspace(1) %aptr, align 2
%b = load volatile half, ptr addrspace(1) %bptr, align 2
%c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
; GFX11-NEXT: v_pk_min_f16 v0, v2, v0
; GFX11-NEXT: v_pk_min_f16 v0, v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmin3_v2f16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_min_num_f16 v0, v2, v0
+; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
%min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
@@ -734,6 +868,39 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mov_b32 s16, s4
+; GFX12-NEXT: s_mov_b32 s17, s5
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s6
+; GFX12-NEXT: s_mov_b32 s13, s7
+; GFX12-NEXT: s_mov_b32 s8, s0
+; GFX12-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s9, s1
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT: s_endpgm
%a = load volatile double, ptr addrspace(1) %aptr, align 4
%b = load volatile double, ptr addrspace(1) %bptr, align 4
%c = load volatile double, ptr addrspace(1) %cptr, align 4
@@ -877,6 +1044,39 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT: s_mov_b32 s10, -1
+; GFX12-NEXT: s_mov_b32 s11, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s10
+; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_mov_b32 s18, s10
+; GFX12-NEXT: s_mov_b32 s19, s11
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s12, s2
+...
[truncated]
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/92/builds/12090 Here is the relevant piece of the build log for the reference
|
V_MAX3/MIN3_NUM_F16 are alias GFX12 instructions with V_MAX3/MIN3_F16 in GFX11 and they should be updated together.
This fix a bug introduced in #113603 such that only V_MAX3/MIN3_F16 are replaced in true16 format. Also added GFX12 runlines for CodeGen test