-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU][True16][MC] true16 for v_minmax/maxmin_f16 and v_minmax/maxmin_num_f16 #120617
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16][MC] true16 for v_minmax/maxmin_f16 and v_minmax/maxmin_num_f16 #120617
Conversation
7aa8756
to
d748d7e
Compare
d748d7e
to
b6bf0a1
Compare
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) ChangesTrue16 support for v_minmax/maxmin_f16(GFX11) and v_minmax/maxmin_num_f16(GFX12). These insts are updated at the same time since we are replacing the Added a GFX12 runline in minmax.ll in fake16 flow test Patch is 316.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/120617.diff 17 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b1f93a447a7b82..532df39e82a75a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3708,12 +3708,15 @@ def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>;
def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>;
def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
-def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
-def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
-def : FPMinCanonMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
-def : FPMinCanonMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+}
+
+let True16Predicate = UseFakeTrue16Insts in {
+def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
}
let OtherPredicates = [isGFX9Plus] in {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 22e457674c07a1..576efc22b64ece 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1374,8 +1374,8 @@ class VOP3_DOT_Profile_fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR
let SubtargetPredicate = isGFX11Plus in {
defm V_MAXMIN_F32 : VOP3Inst<"v_maxmin_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
defm V_MINMAX_F32 : VOP3Inst<"v_minmax_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
- defm V_MAXMIN_F16 : VOP3Inst<"v_maxmin_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
- defm V_MINMAX_F16 : VOP3Inst<"v_minmax_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
+ defm V_MAXMIN_F16 : VOP3Inst_t16<"v_maxmin_f16", VOP_F16_F16_F16_F16>;
+ defm V_MINMAX_F16 : VOP3Inst_t16<"v_minmax_f16", VOP_F16_F16_F16_F16>;
defm V_MAXMIN_U32 : VOP3Inst<"v_maxmin_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
defm V_MINMAX_U32 : VOP3Inst<"v_minmax_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
defm V_MAXMIN_I32 : VOP3Inst<"v_maxmin_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -1588,8 +1588,8 @@ defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32",
defm V_MED3_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">;
defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">;
defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">;
-defm V_MINMAX_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26a, "V_MINMAX_F16", "v_minmax_num_f16">;
-defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26b, "V_MAXMIN_F16", "v_maxmin_num_f16">;
+defm V_MINMAX_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26a, "v_minmax_num_f16", "V_MINMAX_F16", "v_minmax_f16">;
+defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_t16_and_fake16_gfx12<0x26b, "v_maxmin_num_f16", "V_MAXMIN_F16", "v_maxmin_f16">;
defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>;
defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>;
defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>;
@@ -1730,8 +1730,8 @@ defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25b>;
defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25c>;
defm V_MAXMIN_F32 : VOP3_Realtriple_gfx11<0x25e>;
defm V_MINMAX_F32 : VOP3_Realtriple_gfx11<0x25f>;
-defm V_MAXMIN_F16 : VOP3_Realtriple_gfx11<0x260>;
-defm V_MINMAX_F16 : VOP3_Realtriple_gfx11<0x261>;
+defm V_MAXMIN_F16 : VOP3_Realtriple_t16_and_fake16_gfx11<0x260, "v_maxmin_f16">;
+defm V_MINMAX_F16 : VOP3_Realtriple_t16_and_fake16_gfx11<0x261, "v_minmax_f16">;
defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11_gfx12<0x262>;
defm V_MINMAX_U32 : VOP3_Realtriple_gfx11_gfx12<0x263>;
defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11_gfx12<0x264>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index d236907b0eec52..930ed9a5e2d0b3 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1909,8 +1909,8 @@ multiclass VOP3_Realtriple_t16_gfx11<bits<10> op, string asmName, string opName
multiclass VOP3_Realtriple_t16_and_fake16_gfx11<bits<10> op, string asmName, string opName = NAME,
string pseudo_mnemonic = "", bit isSingle = 0> {
- defm _t16: VOP3_Realtriple_t16_gfx11<op, opName#"_t16", asmName, pseudo_mnemonic, isSingle>;
- defm _fake16: VOP3_Realtriple_t16_gfx11<op, opName#"_fake16", asmName, pseudo_mnemonic, isSingle>;
+ defm _t16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
+ defm _fake16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
}
multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName,
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 73f3d4c037ad51..e0e26dd63bc5b7 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL %s
define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_minmax_i32:
@@ -8,6 +10,16 @@ define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_maxmin_i32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maxmin_i32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
%sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
ret i32 %sminmax
@@ -45,6 +57,16 @@ define i32 @test_minmax_commuted_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_maxmin_i32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_commuted_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maxmin_i32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
%sminmax = call i32 @llvm.smin.i32(i32 %c, i32 %smax)
ret i32 %sminmax
@@ -56,6 +78,16 @@ define i32 @test_maxmin_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_minmax_i32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minmax_i32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
%smaxmin = call i32 @llvm.smax.i32(i32 %smin, i32 %c)
ret i32 %smaxmin
@@ -67,6 +99,16 @@ define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_minmax_i32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_commuted_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minmax_i32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
%smaxmin = call i32 @llvm.smax.i32(i32 %c, i32 %smin)
ret i32 %smaxmin
@@ -79,6 +121,17 @@ define void @test_smed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
; GFX11-NEXT: v_med3_i32 v2, v2, v3, v4
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_smed3_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_med3_i32 v2, v2, v3, v4
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp0 = call i32 @llvm.smin.i32(i32 %x, i32 %y)
%tmp1 = call i32 @llvm.smax.i32(i32 %x, i32 %y)
%tmp2 = call i32 @llvm.smin.i32(i32 %tmp1, i32 %z)
@@ -93,6 +146,16 @@ define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_maxmin_u32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_u32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maxmin_u32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
%uminmax = call i32 @llvm.umin.i32(i32 %umax, i32 %c)
ret i32 %uminmax
@@ -130,6 +193,16 @@ define i32 @test_minmax_commuted_u32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_maxmin_u32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_commuted_u32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maxmin_u32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
%uminmax = call i32 @llvm.umin.i32(i32 %c, i32 %umax)
ret i32 %uminmax
@@ -141,6 +214,16 @@ define i32 @test_maxmin_u32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_minmax_u32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_u32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minmax_u32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
%umaxmin = call i32 @llvm.umax.i32(i32 %umin, i32 %c)
ret i32 %umaxmin
@@ -152,6 +235,16 @@ define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_minmax_u32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_commuted_u32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minmax_u32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
%umaxmin = call i32 @llvm.umax.i32(i32 %c, i32 %umin)
ret i32 %umaxmin
@@ -164,6 +257,17 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
; GFX11-NEXT: v_med3_u32 v2, v2, v3, v4
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_umed3_i32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_med3_u32 v2, v2, v3, v4
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp0 = call i32 @llvm.umin.i32(i32 %x, i32 %y)
%tmp1 = call i32 @llvm.umax.i32(i32 %x, i32 %y)
%tmp2 = call i32 @llvm.umin.i32(i32 %tmp1, i32 %z)
@@ -173,44 +277,12 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
}
define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) {
-; SDAG-LABEL: test_minmax_f32_ieee_true:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; SDAG-NEXT: v_max_f32_e32 v2, v2, v2
-; SDAG-NEXT: v_maxmin_f32 v0, v0, v1, v2
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_minmax_f32_ieee_true:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GISEL-NEXT: v_max_f32_e32 v2, v2, v2
-; GISEL-NEXT: v_maxmin_f32 v0, v0, v1, v2
-; GISEL-NEXT: s_setpc_b64 s[30:31]
%max = call float @llvm.maxnum.f32(float %a, float %b)
%minmax = call float @llvm.minnum.f32(float %max, float %c)
ret float %minmax
}
define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg %b, float inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-LABEL: s_test_minmax_f32_ieee_false:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-NEXT: s_mov_b32 s5, s4
-; SDAG-NEXT: s_mov_b32 s4, s3
-; SDAG-NEXT: v_maxmin_f32 v0, s0, s1, v0
-; SDAG-NEXT: global_store_b32 v1, v0, s[4:5]
-; SDAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: s_test_minmax_f32_ieee_false:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GISEL-NEXT: s_mov_b32 s6, s3
-; GISEL-NEXT: s_mov_b32 s7, s4
-; GISEL-NEXT: v_maxmin_f32 v0, s0, s1, v0
-; GISEL-NEXT: global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT: s_endpgm
%smax = call float @llvm.maxnum.f32(float %a, float %b)
%sminmax = call float @llvm.minnum.f32(float %smax, float %c)
store float %sminmax, ptr addrspace(1) %out
@@ -222,27 +294,17 @@ define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b,
; GFX11: ; %bb.0:
; GFX11-NEXT: v_maxmin_f32 v0, v0, v1, v2
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_minmax_commuted_f32_ieee_false:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2
+; GFX12-NEXT: ; return to shader part epilog
%max = call float @llvm.maxnum.f32(float %a, float %b)
%minmax = call float @llvm.minnum.f32(float %c, float %max)
ret float %minmax
}
define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) {
-; SDAG-LABEL: test_maxmin_f32_ieee_true:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; SDAG-NEXT: v_max_f32_e32 v2, v2, v2
-; SDAG-NEXT: v_minmax_f32 v0, v0, v1, v2
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_maxmin_f32_ieee_true:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GISEL-NEXT: v_max_f32_e32 v2, v2, v2
-; GISEL-NEXT: v_minmax_f32 v0, v0, v1, v2
-; GISEL-NEXT: s_setpc_b64 s[30:31]
%min = call float @llvm.minnum.f32(float %a, float %b)
%maxmin = call float @llvm.maxnum.f32(float %min, float %c)
ret float %maxmin
@@ -253,6 +315,11 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b,
; GFX11: ; %bb.0:
; GFX11-NEXT: v_minmax_f32 v0, v0, v1, v2
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_maxmin_commuted_f32_ieee_false:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2
+; GFX12-NEXT: ; return to shader part epilog
%min = call float @llvm.minnum.f32(float %a, float %b)
%maxmin = call float @llvm.maxnum.f32(float %c, float %min)
ret float %maxmin
@@ -265,6 +332,17 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
; GFX11-NEXT: v_med3_f32 v2, v2, v3, v4
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_med3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp0 = call float @llvm.minnum.f32(float %x, float %y)
%tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
%tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
@@ -278,29 +356,17 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_maxmin_f16 v0, v0, v1, v2
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX12-LABEL: test_minmax_f16_ieee_false:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; GFX12-NEXT: ; return to shader part epilog
%max = call half @llvm.maxnum.f16(half %a, half %b)
%minmax = call half @llvm.minnum.f16(half %max, half %c)
ret half %minmax
}
define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-LABEL: s_test_minmax_f16_ieee_false:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-NEXT: s_mov_b32 s5, s4
-; SDAG-NEXT: s_mov_b32 s4, s3
-; SDAG-NEXT: v_maxmin_f16 v0, s0, s1, v0
-; SDAG-NEXT: global_store_b16 v1, v0, s[4:5]
-; SDAG-NEXT: s_endpgm
-;
-; GISEL-LABEL: s_test_minmax_f16_ieee_false:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GISEL-NEXT: s_mov_b32 s6, s3
-; GISEL-NEXT: s_mov_b32 s7, s4
-; GISEL-NEXT: v_maxmin_f16 v0, s0, s1, v0
-; GISEL-NEXT: global_store_b16 v1, v0, s[6:7]
-; GISEL-NEXT: s_endpgm
%smax = call half @llvm.maxnum.f16(half %a, half %b)
%sminmax = call half @llvm.minnum.f16(half %smax, half %c)
store half %sminmax, ptr addrspace(1) %out
@@ -308,23 +374,6 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
}
define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-LABEL: test_minmax_commuted_f16_ieee_true:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_max_f16_e32 v1, v1, v1
-; SDAG-NEXT: v_max_f16_e32 v0, v0, v0
-; SDAG-NEXT: v_max_f16_e32 v2, v2, v2
-; SDAG-NEXT: v_maxmin_f16 v0, v0, v1, v2
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_minmax_commuted_f16_ieee_true:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_max_f16_e32 v0, v0, v0
-; GISEL-NEXT: v_max_f16_e32 v1, v1, v1
-; GISEL...
[truncated]
|
b6bf0a1
to
56c834c
Compare
def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; | ||
def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; | ||
def : FPMinCanonMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; | ||
def : FPMinCanonMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not this patch's fault, but there's no reason these Pats need a separate parameter for the one-use variant. The final use can always wrap the input node with the hasOneUse predicate
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Matt. Do you want me to create another patch to address this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure
True16 support for v_minmax/maxmin_f16(GFX11) and v_minmax/maxmin_num_f16(GFX12).
These insts are updated at the same time since we are replacing the
v_minmax/maxmin_f16
tov_minmax/maxmin_fake16_f16
whilev_minmax/maxmin_num_f16
are alias insts and share the same CodeGen pattern.Added a GFX12 runline in minmax.ll in fake16 flow