Skip to content

[AMDGPU][True16[MC] true16 for v_max3/min3_num_f16 #121510

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 3, 2025

Conversation

broxigarchen
Copy link
Contributor

@broxigarchen broxigarchen commented Jan 2, 2025

V_MAX3/MIN3_NUM_F16 are alias GFX12 instructions with V_MAX3/MIN3_F16 in GFX11 and they should be updated together.

This fix a bug introduced in #113603 such that only V_MAX3/MIN3_F16 are replaced in true16 format. Also added GFX12 runlines for CodeGen test

@broxigarchen broxigarchen changed the title true16 for v_max3/min3_num_f16 GFX12 [AMDGPU][True16[MC] true16 for v_max3/min3_num_f16 Jan 2, 2025
@broxigarchen broxigarchen marked this pull request as ready for review January 2, 2025 18:37
@llvmbot llvmbot added backend:AMDGPU mc Machine (object) code labels Jan 2, 2025
@llvmbot
Copy link
Member

llvmbot commented Jan 2, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes

V_MAX3/MIN3_NUM_F16 are alias GFX12 instructions with V_MAX3/MIN3_F16 in GFX11 and they should be updated together.

This fix a bug introduced in #113603 such that only V_MAX3/MIN3_F16 are replaced in true16 format. Also added GFX12 runlines for CodeGen test


Patch is 169.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121510.diff

10 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/fmax3.ll (+134)
  • (modified) llvm/test/CodeGen/AMDGPU/fmin3.ll (+200)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3.s (+84-60)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s (+4-4)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s (+84-84)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s (+76-76)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt (+132-30)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt (+170-30)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt (+170-30)
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 22e457674c07a1..d00c810859e3b8 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1578,8 +1578,8 @@ def : MinimumMaximumByMinimum3Maximum3<fmaximum, f32, V_MAXIMUM3_F32_e64>;
 
 defm V_MIN3_NUM_F32       : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">;
 defm V_MAX3_NUM_F32       : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">;
-defm V_MIN3_NUM_F16       : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">;
-defm V_MAX3_NUM_F16       : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">;
+defm V_MIN3_NUM_F16       : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">;
+defm V_MAX3_NUM_F16       : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">;
 defm V_MINIMUM3_F32       : VOP3Only_Realtriple_gfx12<0x22d>;
 defm V_MAXIMUM3_F32       : VOP3Only_Realtriple_gfx12<0x22e>;
 defm V_MINIMUM3_F16       : VOP3Only_Realtriple_t16_gfx12<0x22f>;
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 4b3f0dbbaea984..fbcdbed338e60c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmax3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile  float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; GFX11-NEXT:    v_pk_max_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmax3_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v2, v0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 38b712e044df93..269fd52df5c49f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmin3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; GFX11-NEXT:    v_pk_min_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_min_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmin3_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v2, v0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
   %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
@@ -734,6 +868,39 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s6
+; GFX12-NEXT:    s_mov_b32 s13, s7
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile double, ptr addrspace(1) %aptr, align 4
   %b = load volatile double, ptr addrspace(1) %bptr, align 4
   %c = load volatile double, ptr addrspace(1) %cptr, align 4
@@ -877,6 +1044,39 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Jan 2, 2025

@llvm/pr-subscribers-mc

Author: Brox Chen (broxigarchen)

Changes

V_MAX3/MIN3_NUM_F16 are alias GFX12 instructions with V_MAX3/MIN3_F16 in GFX11 and they should be updated together.

This fix a bug introduced in #113603 such that only V_MAX3/MIN3_F16 are replaced in true16 format. Also added GFX12 runlines for CodeGen test


Patch is 169.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121510.diff

10 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/fmax3.ll (+134)
  • (modified) llvm/test/CodeGen/AMDGPU/fmin3.ll (+200)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3.s (+84-60)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s (+4-4)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s (+84-84)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s (+76-76)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt (+132-30)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt (+170-30)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt (+170-30)
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 22e457674c07a1..d00c810859e3b8 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1578,8 +1578,8 @@ def : MinimumMaximumByMinimum3Maximum3<fmaximum, f32, V_MAXIMUM3_F32_e64>;
 
 defm V_MIN3_NUM_F32       : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">;
 defm V_MAX3_NUM_F32       : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">;
-defm V_MIN3_NUM_F16       : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">;
-defm V_MAX3_NUM_F16       : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">;
+defm V_MIN3_NUM_F16       : VOP3_Realtriple_t16_and_fake16_gfx12<0x22b, "v_min3_num_f16", "V_MIN3_F16", "v_min3_f16">;
+defm V_MAX3_NUM_F16       : VOP3_Realtriple_t16_and_fake16_gfx12<0x22c, "v_max3_num_f16", "V_MAX3_F16", "v_max3_f16">;
 defm V_MINIMUM3_F32       : VOP3Only_Realtriple_gfx12<0x22d>;
 defm V_MAXIMUM3_F32       : VOP3Only_Realtriple_gfx12<0x22e>;
 defm V_MINIMUM3_F16       : VOP3Only_Realtriple_t16_gfx12<0x22f>;
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 4b3f0dbbaea984..fbcdbed338e60c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmax3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile  float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_0_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_max3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmax3_olt_1_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; GFX11-NEXT:    v_pk_max_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmax3_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v2, v0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 38b712e044df93..269fd52df5c49f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmin3_olt_0_f32:
@@ -124,6 +125,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -254,6 +285,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f32 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f32 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile float, ptr addrspace(1) %aptr, align 4
   %b = load volatile float, ptr addrspace(1) %bptr, align 4
   %c = load volatile float, ptr addrspace(1) %cptr, align 4
@@ -391,6 +452,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -529,6 +620,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min3_f16 v0, v2, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_mov_b32 s22, s10
+; GFX12-NEXT:    s_mov_b32 s23, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    s_mov_b32 s20, s6
+; GFX12-NEXT:    s_mov_b32 s21, s7
+; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_min3_num_f16 v0, v2, v0, v1
+; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -594,6 +715,19 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; GFX11-NEXT:    v_pk_min_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_min_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: no_fmin3_v2f16:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v2, v0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
   %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
@@ -734,6 +868,39 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_0_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+; GFX12-NEXT:    s_mov_b32 s13, s3
+; GFX12-NEXT:    s_mov_b32 s16, s4
+; GFX12-NEXT:    s_mov_b32 s17, s5
+; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s6
+; GFX12-NEXT:    s_mov_b32 s13, s7
+; GFX12-NEXT:    s_mov_b32 s8, s0
+; GFX12-NEXT:    buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s9, s1
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT:    s_endpgm
   %a = load volatile double, ptr addrspace(1) %aptr, align 4
   %b = load volatile double, ptr addrspace(1) %bptr, align 4
   %c = load volatile double, ptr addrspace(1) %cptr, align 4
@@ -877,6 +1044,39 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_fmin3_olt_1_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-NEXT:    s_mov_b32 s10, -1
+; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-NEXT:    s_mov_b32 s14, s10
+; GFX12-NEXT:    s_mov_b32 s15, s11
+; GFX12-NEXT:    s_mov_b32 s18, s10
+; GFX12-NEXT:    s_mov_b32 s19, s11
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s12, s2
+...
[truncated]

@dstutt dstutt merged commit 8b23ebb into llvm:main Jan 3, 2025
12 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Jan 3, 2025

LLVM Buildbot has detected a new failure on builder bolt-x86_64-ubuntu-nfc running on bolt-worker while building llvm at step 9 "test-build-bolt-check-large-bolt".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/92/builds/12090

Here is the relevant piece of the build log for the reference
Step 9 (test-build-bolt-check-large-bolt) failure: test (failure)
******************** TEST 'bolt-tests :: X86/clang-nolbr.test' FAILED ********************
Exit Code: 254

Command Output (stderr):
--
RUN: at line 5: mkdir -p /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output
+ mkdir -p /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output
RUN: at line 6: test -f /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang || unzstd /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Inputs/clang.zst -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang
+ test -f /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang
RUN: at line 9: perf record -e cycles:u -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.data --    /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Inputs/bf.cpp -O2 -std=c++11 -c -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.out
+ perf record -e cycles:u -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.data -- /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Inputs/bf.cpp -O2 -std=c++11 -c -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.out
Lowering default frequency rate from 4000 to 2000.
Please consider tweaking /proc/sys/kernel/perf_event_max_sample_rate.
[ perf record: Woken up 2 times to write data ]
[ perf record: Captured and wrote 0.411 MB /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.data (10564 samples) ]
RUN: at line 13: /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/perf2bolt /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang -p /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.data -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.fdata -nl    |& /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/FileCheck /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/clang-nolbr.test -check-prefix=CHECK-P2B
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/perf2bolt /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang -p /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.data -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.fdata -nl
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/FileCheck /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/clang-nolbr.test -check-prefix=CHECK-P2B
RUN: at line 17: /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/llvm-bolt /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp -data /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.fdata     -relocs -reorder-blocks=ext-tsp -split-functions=3 -split-all-cold     -split-eh -icf=1 -reorder-functions=hfsort+ -use-gnu-stack     -jump-tables=move -frame-opt=hot -peepholes=all -dyno-stats    |& /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/FileCheck /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/clang-nolbr.test -check-prefix=CHECK-BOLT
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/FileCheck /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/clang-nolbr.test -check-prefix=CHECK-BOLT
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/bin/llvm-bolt /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Output/clang -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp -data /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.fdata -relocs -reorder-blocks=ext-tsp -split-functions=3 -split-all-cold -split-eh -icf=1 -reorder-functions=hfsort+ -use-gnu-stack -jump-tables=move -frame-opt=hot -peepholes=all -dyno-stats
RUN: at line 24: /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Inputs/bf.cpp -O2 -std=c++11 -c -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.out
+ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Inputs/bf.cpp -O2 -std=c++11 -c -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.out
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN4llvm3sys15PrintStackTraceERNS_11raw_ostreamE+0x1d)[0xb91357]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN4llvm3sys17RunSignalHandlersEv+0x2f)[0xb90173]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0xb903f1]
/lib/x86_64-linux-gnu/libc.so.6(+0x42520)[0x7f4bd6c42520]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0x374e5d4]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0x36850b5]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0x3740d34]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0x349a7d3]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN4llvm13FPPassManager13runOnFunctionERNS_8FunctionE+0x367)[0x3415ca7]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN4llvm13FPPassManager11runOnModuleERNS_6ModuleE+0x33)[0x1ae4143]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN4llvm6legacy15PassManagerImpl3runERNS_6ModuleE+0x185)[0x2219375]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN5clang17EmitBackendOutputERNS_17DiagnosticsEngineERKNS_19HeaderSearchOptionsERKNS_14CodeGenOptionsERKNS_13TargetOptionsERKNS_11LangOptionsERKN4llvm10DataLayoutEPNSE_6ModuleENS_13BackendActionESt10unique_ptrINSE_17raw_pwrite_streamESt14default_deleteISM_EE+0x825)[0x22a3455]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0x2385f1e]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN5clang8ParseASTERNS_4SemaEbb+0x140)[0x1e933b0]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN5clang13CodeGenAction13ExecuteActionEv+0x36)[0x2385986]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN5clang14FrontendAction7ExecuteEv+0x24)[0x2337034]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN5clang16CompilerInstance13ExecuteActionERNS_14FrontendActionE+0x120)[0x231c0e0]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_ZN5clang25ExecuteCompilerInvocationEPNS_16CompilerInstanceE+0x1a7)[0x23433a7]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(_Z8cc1_mainN4llvm8ArrayRefIPKcEES2_Pv+0x415)[0x2173c15]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp(main+0x4da)[0x1768eea]
/lib/x86_64-linux-gnu/libc.so.6(+0x29d90)[0x7f4bd6c29d90]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x80)[0x7f4bd6c29e40]
/home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp[0x2172d29]
Stack dump:
0.	Program arguments: /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp -cc1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name bf.cpp -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -momit-leaf-frame-pointer -dwarf-column-info -debugger-tuning=gdb -coverage-notes-file /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.gcno -resource-dir /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/lib/clang/6.0.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/x86_64-linux-gnu/c++/11 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/x86_64-linux-gnu/c++/11 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/backward -internal-isystem /usr/local/include -internal-isystem /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/lib/clang/6.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86 -ferror-limit 19 -fmessage-length 0 -fobjc-runtime=gcc -fcxx-exceptions -fexceptions -fdiagnostics-show-option -vectorize-loops -vectorize-slp -o /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/build/tools/bolttests/X86/Output/clang-nolbr.test.tmp.out -x c++ /home/worker/bolt-worker2/bolt-x86_64-ubuntu-nfc/bolt-tests/test/X86/Inputs/bf.cpp 
1.	<eof> parser at end of file
2.	Code generation
...
Step 14 (nfc-stat-check) failure: test (failure)
./tools/bolt/test/AArch64/long-jmp-one-stub.s/.bolt -84.56% +0.09%
Geomean -14.80% -0.02%

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU mc Machine (object) code
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants