[AMDGPU][True16[MC] true16 for v_max3/min3_num_f16 #121510

broxigarchen · 2025-01-02T18:18:43Z

V_MAX3/MIN3_NUM_F16 are alias GFX12 instructions with V_MAX3/MIN3_F16 in GFX11 and they should be updated together.

This fix a bug introduced in #113603 such that only V_MAX3/MIN3_F16 are replaced in true16 format. Also added GFX12 runlines for CodeGen test

llvmbot · 2025-01-02T18:38:08Z

@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes

V_MAX3/MIN3_NUM_F16 are alias GFX12 instructions with V_MAX3/MIN3_F16 in GFX11 and they should be updated together.

This fix a bug introduced in #113603 such that only V_MAX3/MIN3_F16 are replaced in true16 format. Also added GFX12 runlines for CodeGen test

Patch is 169.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121510.diff

10 Files Affected:

(modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+2-2)
(modified) llvm/test/CodeGen/AMDGPU/fmax3.ll (+134)
(modified) llvm/test/CodeGen/AMDGPU/fmin3.ll (+200)
(modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3.s (+84-60)
(modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s (+4-4)
(modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s (+84-84)
(modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s (+76-76)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt (+132-30)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt (+170-30)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt (+170-30)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 22e457674c07a1..d00c810859e3b8 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1578,8 +1578,8 @@ def : MinimumMaximumByMinimum3Maximum3<fmaximum, f32, V_MAXIMUM3_F32_e64>;
 
 defm V_MIN3_NUM_F32       : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">;
 defm V_MAX3_NUM_F32       : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">;
-defm V_MIN3_NUM_F16       : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">;
-defm V_MAX3_NUM_F16       : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">;
+defm V_MIN3_NUM_F16       : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">;
+defm V_MAX3_NUM_F16       : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">;
 defm V_MINIMUM3_F32       : VOP3Only_Realtriple_gfx12<0x22d>;
 defm V_MAXIMUM3_F32       : VOP3Only_Realtriple_gfx12<0x22e>;
 defm V_MINIMUM3_F16       : VOP3Only_Realtriple_t16_gfx12<0x22f>;
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 4b3f0dbbaea984..fbcdbed338e60c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmax3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile  float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; GFX11-NEXT:    v_pk_max_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmax3_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v2, v0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 38b712e044df93..269fd52df5c49f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmin3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; GFX11-NEXT:    v_pk_min_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_min_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmin3_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v2, v0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
   %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
@@ -734,6 +868,39 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s6
+; GFX12-NEXT:    s_mov_b32 s13, s7
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile double, ptr addrspace(1) %aptr, align 4
   %b = load volatile double, ptr addrspace(1) %bptr, align 4
   %c = load volatile double, ptr addrspace(1) %cptr, align 4
@@ -877,6 +1044,39 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+...
[truncated]

llvmbot · 2025-01-02T18:38:08Z

@llvm/pr-subscribers-mc

Author: Brox Chen (broxigarchen)

Changes

V_MAX3/MIN3_NUM_F16 are alias GFX12 instructions with V_MAX3/MIN3_F16 in GFX11 and they should be updated together.

This fix a bug introduced in #113603 such that only V_MAX3/MIN3_F16 are replaced in true16 format. Also added GFX12 runlines for CodeGen test

Patch is 169.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121510.diff

10 Files Affected:

(modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+2-2)
(modified) llvm/test/CodeGen/AMDGPU/fmax3.ll (+134)
(modified) llvm/test/CodeGen/AMDGPU/fmin3.ll (+200)
(modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3.s (+84-60)
(modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s (+4-4)
(modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s (+84-84)
(modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s (+76-76)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt (+132-30)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt (+170-30)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt (+170-30)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 22e457674c07a1..d00c810859e3b8 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1578,8 +1578,8 @@ def : MinimumMaximumByMinimum3Maximum3<fmaximum, f32, V_MAXIMUM3_F32_e64>;
 
 defm V_MIN3_NUM_F32       : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">;
 defm V_MAX3_NUM_F32       : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">;
-defm V_MIN3_NUM_F16       : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">;
-defm V_MAX3_NUM_F16       : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">;
+defm V_MIN3_NUM_F16       : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">;
+defm V_MAX3_NUM_F16       : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">;
 defm V_MINIMUM3_F32       : VOP3Only_Realtriple_gfx12<0x22d>;
 defm V_MAXIMUM3_F32       : VOP3Only_Realtriple_gfx12<0x22e>;
 defm V_MINIMUM3_F16       : VOP3Only_Realtriple_t16_gfx12<0x22f>;
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 4b3f0dbbaea984..fbcdbed338e60c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmax3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile  float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; GFX11-NEXT:    v_pk_max_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmax3_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v2, v0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 38b712e044df93..269fd52df5c49f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmin3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; GFX11-NEXT:    v_pk_min_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_min_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmin3_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v2, v0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
   %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
@@ -734,6 +868,39 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s6
+; GFX12-NEXT:    s_mov_b32 s13, s7
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile double, ptr addrspace(1) %aptr, align 4
   %b = load volatile double, ptr addrspace(1) %bptr, align 4
   %c = load volatile double, ptr addrspace(1) %cptr, align 4
@@ -877,6 +1044,39 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+...
[truncated]

llvm-ci · 2025-01-03T09:03:19Z

LLVM Buildbot has detected a new failure on builder bolt-x86_64-ubuntu-nfc running on bolt-worker while building llvm at step 9 "test-build-bolt-check-large-bolt".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/92/builds/12090

Here is the relevant piece of the build log for the reference

Step 9 (test-build-bolt-check-large-bolt) failure: test (failure)
******************** TEST 'bolt-tests :: X86/clang-nolbr.test' FAILED ********************
Exit Code: 254

Command Output (stderr):
--
RUN: at line 5: mkdir -p /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output
+ mkdir -p /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output
RUN: at line 6: test -f /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang || unzstd /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Inputs/clang.zst -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang
+ test -f /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang
RUN: at line 9: perf record -e cycles:u -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.data --    /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Inputs/bf.cpp -O2 -std=c++11 -c -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.out
+ perf record -e cycles:u -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.data -- /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Inputs/bf.cpp -O2 -std=c++11 -c -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.out
Lowering default frequency rate from 4000 to 2000.
Please consider tweaking /proc/sys/kernel/perf_event_max_sample_rate.
[ perf record: Woken up 2 times to write data ]
[ perf record: Captured and wrote 0.411 MB /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.data (10564 samples) ]
RUN: at line 13: /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/perf2bolt /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang -p /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.data -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.fdata -nl    |& /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/FileCheck /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/clang-nolbr.test -check-prefix=CHECK-P2B
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/perf2bolt /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang -p /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.data -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.fdata -nl
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/FileCheck /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/clang-nolbr.test -check-prefix=CHECK-P2B
RUN: at line 17: /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/llvm-bolt /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp -data /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.fdata     -relocs -reorder-blocks=ext-tsp -split-functions=3 -split-all-cold     -split-eh -icf=1 -reorder-functions=hfsort+ -use-gnu-stack     -jump-tables=move -frame-opt=hot -peepholes=all -dyno-stats    |& /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/FileCheck /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/clang-nolbr.test -check-prefix=CHECK-BOLT
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/FileCheck /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/clang-nolbr.test -check-prefix=CHECK-BOLT
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/llvm-bolt /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp -data /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.fdata -relocs -reorder-blocks=ext-tsp -split-functions=3 -split-all-cold -split-eh -icf=1 -reorder-functions=hfsort+ -use-gnu-stack -jump-tables=move -frame-opt=hot -peepholes=all -dyno-stats
RUN: at line 24: /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Inputs/bf.cpp -O2 -std=c++11 -c -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.out
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Inputs/bf.cpp -O2 -std=c++11 -c -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.out
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN4llvm3sys15PrintStackTraceERNS_11raw_ostreamE+0x1d)[0xb91357]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN4llvm3sys17RunSignalHandlersEv+0x2f)[0xb90173]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0xb903f1]
/lib/x86_64-linux-gnu/libc.so.6(+0x42520)[0x7f4bd6c42520]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0x374e5d4]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0x36850b5]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0x3740d34]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0x349a7d3]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN4llvm13FPPassManager13runOnFunctionERNS_8FunctionE+0x367)[0x3415ca7]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN4llvm13FPPassManager11runOnModuleERNS_6ModuleE+0x33)[0x1ae4143]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN4llvm6legacy15PassManagerImpl3runERNS_6ModuleE+0x185)[0x2219375]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN5clang17EmitBackendOutputERNS_17DiagnosticsEngineERKNS_19HeaderSearchOptionsERKNS_14CodeGenOptionsERKNS_13TargetOptionsERKNS_11LangOptionsERKN4llvm10DataLayoutEPNSE_6ModuleENS_13BackendActionESt10unique_ptrINSE_17raw_pwrite_streamESt14default_deleteISM_EE+0x825)[0x22a3455]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0x2385f1e]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN5clang8ParseASTERNS_4SemaEbb+0x140)[0x1e933b0]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN5clang13CodeGenAction13ExecuteActionEv+0x36)[0x2385986]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN5clang14FrontendAction7ExecuteEv+0x24)[0x2337034]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN5clang16CompilerInstance13ExecuteActionERNS_14FrontendActionE+0x120)[0x231c0e0]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN5clang25ExecuteCompilerInvocationEPNS_16CompilerInstanceE+0x1a7)[0x23433a7]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_Z8cc1_mainN4llvm8ArrayRefIPKcEES2_Pv+0x415)[0x2173c15]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(main+0x4da)[0x1768eea]
/lib/x86_64-linux-gnu/libc.so.6(+0x29d90)[0x7f4bd6c29d90]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x80)[0x7f4bd6c29e40]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0x2172d29]
Stack dump:
0.	Program arguments: /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp -cc1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name bf.cpp -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -momit-leaf-frame-pointer -dwarf-column-info -debugger-tuning=gdb -coverage-notes-file /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.gcno -resource-dir /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/lib/clang/6.0.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/x86_64-linux-gnu/c++/11 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/x86_64-linux-gnu/c++/11 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/backward -internal-isystem /usr/local/include -internal-isystem /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86 -ferror-limit 19 -fmessage-length 0 -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.out -x c++ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Inputs/bf.cpp 
1.	<eof> parser at end of file
2.	Code generation
...
Step 14 (nfc-stat-check) failure: test (failure)
./tools/bolt/test/AArch64/long-jmp-one-stub.s/.bolt -84.56% +0.09%
Geomean -14.80% -0.02%

broxigarchen changed the title ~~true16 for v_max3/min3_num_f16 GFX12~~ [AMDGPU][True16[MC] true16 for v_max3/min3_num_f16 Jan 2, 2025

true16 for v_max3/min3_num_f16 GFX12

a31d29a

broxigarchen force-pushed the fix-min3-f16 branch from 0e3d476 to a31d29a Compare January 2, 2025 18:34

broxigarchen marked this pull request as ready for review January 2, 2025 18:37

llvmbot added backend:AMDGPU mc Machine (object) code labels Jan 2, 2025

broxigarchen requested review from jayfoad, dstutt and kosarev January 2, 2025 20:09

arsenm approved these changes Jan 3, 2025

View reviewed changes

dstutt merged commit 8b23ebb into llvm:main Jan 3, 2025
12 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU][True16[MC] true16 for v_max3/min3_num_f16 #121510

[AMDGPU][True16[MC] true16 for v_max3/min3_num_f16 #121510

Uh oh!

broxigarchen commented Jan 2, 2025 •

edited

Loading

Uh oh!

llvmbot commented Jan 2, 2025

Uh oh!

llvmbot commented Jan 2, 2025

Uh oh!

Uh oh!

llvm-ci commented Jan 3, 2025

Uh oh!

Uh oh!

[AMDGPU][True16[MC] true16 for v_max3/min3_num_f16 #121510

[AMDGPU][True16[MC] true16 for v_max3/min3_num_f16 #121510

Uh oh!

Conversation

broxigarchen commented Jan 2, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Jan 2, 2025

Uh oh!

llvmbot commented Jan 2, 2025

Uh oh!

Uh oh!

llvm-ci commented Jan 3, 2025

Uh oh!

Uh oh!

broxigarchen commented Jan 2, 2025 •

edited

Loading