Skip to content

AMDGPU: Form v2f16 minimum3/maximum3 on gfx950 #128123

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 21, 2025

Conversation

arsenm
Copy link
Contributor

@arsenm arsenm commented Feb 21, 2025

No description provided.

Copy link
Contributor Author

arsenm commented Feb 21, 2025

@llvmbot
Copy link
Member

llvmbot commented Feb 21, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Patch is 47.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128123.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+2-1)
  • (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/fmaximum3.ll (+54-121)
  • (modified) llvm/test/CodeGen/AMDGPU/fminimum3.ll (+54-121)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0b13a53a0c989..6ed09253c51e1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13515,7 +13515,8 @@ static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
   case ISD::FMINIMUM:
   case ISD::FMAXIMUM:
     return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
-           (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16());
+           (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
+           (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
   case ISD::SMAX:
   case ISD::SMIN:
   case ISD::UMAX:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index d5c6e8af109f4..85c047167f1e1 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -145,8 +145,8 @@ def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
 } // End SubtargetPredicate = HasVOP3PInsts
 
 let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in {
-defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
-defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
+defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfminimum3>;
+defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmaximum3>;
 }
 
 // TODO: Make sure we're doing the right thing with denormals. Note
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index f228824ff750e..2a372dffce650 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1269,9 +1269,7 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) {
 ; GFX950-LABEL: v_fmaximum3_f16:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1306,9 +1304,7 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
 ; GFX950-LABEL: v_fmaximum3_f16_commute:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v2, v0, v0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v2, v0, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %c, half %max0)
@@ -1346,10 +1342,9 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %
 ;
 ; GFX950-LABEL: s_fmaximum3_f16:
 ; GFX950:       ; %bb.0:
-; GFX950-NEXT:    v_mov_b32_e32 v0, s0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s1, s1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s2, s2
+; GFX950-NEXT:    v_mov_b32_e32 v0, s1
+; GFX950-NEXT:    v_mov_b32_e32 v1, s2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, s0, v0, v1
 ; GFX950-NEXT:    s_nop 0
 ; GFX950-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX950-NEXT:    s_nop 0
@@ -1392,9 +1387,7 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b)
@@ -1431,9 +1424,7 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %max0 = call half @llvm.maximum.f16(half %a, half %b.fabs)
@@ -1470,9 +1461,7 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call half @llvm.fabs.f16(half %c)
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
@@ -1511,9 +1500,7 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
 ; GFX950-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX950-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX950-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %b.fabs = call half @llvm.fabs.f16(half %b)
@@ -1554,9 +1541,7 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
 ; GFX950-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX950-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX950-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
   %b.fneg = fneg half %b
@@ -1597,9 +1582,7 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
 ; GFX950-NEXT:    v_or_b32_e32 v0, 0x8000, v0
 ; GFX950-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; GFX950-NEXT:    v_or_b32_e32 v2, 0x8000, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %b.fabs = call half @llvm.fabs.f16(half %b)
@@ -1641,9 +1624,7 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
   %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b)
@@ -1680,9 +1661,7 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg half %b
   %max0 = call half @llvm.maximum.f16(half %a, half %b.fneg)
@@ -1719,9 +1698,7 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg half %c
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
@@ -1758,9 +1735,7 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_movk_i32 s0, 0x4800
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s0, s0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s0, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half 8.0, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1795,9 +1770,8 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) {
 ; GFX950-LABEL: v_fmaximum3_f16__const2:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
 ; GFX950-NEXT:    s_movk_i32 s0, 0x4800
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, s0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half 8.0)
@@ -1832,9 +1806,7 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
 ; GFX950-LABEL: v_fmaximum3_f16_inlineimm0:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, 4.0, 4.0
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, 4.0, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half 4.0, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1869,9 +1841,7 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
 ; GFX950-LABEL: v_fmaximum3_f16__inlineimm:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, 4.0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half 4.0)
@@ -1909,9 +1879,8 @@ define half @v_fmaximum3_f16_const1_const2(half %a) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    s_movk_i32 s0, 0x4800
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s0, s0
-; GFX950-NEXT:    s_movk_i32 s0, 0x4c00
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX950-NEXT:    v_mov_b32_e32 v1, 0x4c00
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, s0, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half 8.0)
   %max1 = call half @llvm.maximum.f16(half %max0, half 16.0)
@@ -1959,9 +1928,7 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
 ; GFX950-LABEL: v_fmaximum3_v2f16:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v2, v0, v0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v2, v0, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0)
@@ -2009,9 +1976,7 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
 ; GFX950-LABEL: v_fmaximum3_v2f16_commute:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
@@ -2067,9 +2032,7 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX950-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX950-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
 ; GFX950-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
   %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
@@ -2120,9 +2083,7 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX950-LABEL: v_fmaximum3_v2f16__fneg_all:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2 neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v2 neg_lo:[1,1,1] neg_hi:[1,1,1]
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <2 x half> %a
   %b.fneg = fneg <2 x half> %b
@@ -2173,9 +2134,7 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
 ; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm1:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0]
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
   %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
@@ -2223,9 +2182,7 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
 ; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm2:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0]
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, 4.0 op_sel_hi:[1,1,0]
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>)
@@ -2287,10 +2244,8 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
 ; GFX950-LABEL: v_fmaximum3_v3f16:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v5, v1, v1
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v4, v0, v0
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v4, v0, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v1, v5, v1, v3
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max0)
@@ -2352,10 +2307,8 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
 ; GFX950-LABEL: v_fmaximum3_v3f16_commute:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v5, v5
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v4, v4
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v4
+; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v5
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
@@ -2428,16 +2381,14 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
 ; GFX950-LABEL: v_fmaximum3_v3f16__fabs_all:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX950-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
-; GFX950-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX950-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX950-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX950-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX950-NEXT:    v_and_b32_e32 v5, 0x7fff7fff, v5
 ; GFX950-NEXT:    v_and_b32_e32 v4, 0x7fff7fff, v4
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v5, v5
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v4, v4
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v4
+; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v5
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
   %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
@@ -2502,10 +2453,8 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
 ; GFX950-LABEL: v_fmaximum3_v3f16__fneg_all:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1]
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v4 neg_lo:[1,1,1] neg_hi:[1,1,1]
+; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v5 neg_lo:[1,1,1] neg_hi:[1,1,1]
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <3 x half> %a
   %b.fneg = fneg <3 x half> %b
@@ -2567,10 +2516,8 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
 ; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm1:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, 2.0, 2.0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0]
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1]
+; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, 2.0, v3
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
   %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
@@ -2632,10 +2579,8 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
 ; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm2:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, 4.0, 4.0
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0]
+; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, 4.0 op_sel_hi:[1,1,0]
+; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, 4.0
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>)
@@ -2703,10 +2648,8 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
 ; GFX950-LABEL: v_fmaximum3_v4f16:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v5, v1, v1
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v4, v0, v...
[truncated]

@arsenm arsenm added this to the LLVM 20.X Release milestone Feb 21, 2025
@arsenm
Copy link
Contributor Author

arsenm commented Feb 21, 2025

Missed in initial gfx950 upstreaming

Copy link
Contributor Author

arsenm commented Feb 21, 2025

Merge activity

  • Feb 21, 12:02 AM EST: A user started a stack merge that includes this pull request via Graphite.
  • Feb 21, 12:09 AM EST: Graphite rebased this pull request as part of a merge.
  • Feb 21, 12:11 AM EST: A user merged this pull request with Graphite.

@arsenm arsenm force-pushed the users/arsenm/gfx950/widen-f16-fminimum-fmaximum branch 2 times, most recently from 8b17104 to e77fd5b Compare February 21, 2025 05:05
Base automatically changed from users/arsenm/gfx950/widen-f16-fminimum-fmaximum to main February 21, 2025 05:08
@arsenm arsenm force-pushed the users/arsenm/gfx950/form-v2f16-minimum3-maximum3 branch from da446d3 to 3263aaf Compare February 21, 2025 05:09
@arsenm arsenm merged commit cc46d00 into main Feb 21, 2025
6 of 10 checks passed
@arsenm arsenm deleted the users/arsenm/gfx950/form-v2f16-minimum3-maximum3 branch February 21, 2025 05:11
@llvm-ci
Copy link
Collaborator

llvm-ci commented Feb 21, 2025

LLVM Buildbot has detected a new failure on builder clangd-ubuntu-tsan running on clangd-ubuntu-clang while building llvm at step 6 "test-build-clangd-clangd-index-server-clangd-in...".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/134/builds/13798

Here is the relevant piece of the build log for the reference
Step 6 (test-build-clangd-clangd-index-server-clangd-in...) failure: test (failure)
******************** TEST 'Clangd :: target_info.test' FAILED ********************
Exit Code: 66

Command Output (stderr):
--
RUN: at line 5: rm -rf /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir && mkdir -p /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir
+ rm -rf /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir
+ mkdir -p /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir
RUN: at line 7: echo '[{"directory": "/vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir", "command": "/vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir/armv7-clang -x c++ the-file.cpp -v", "file": "the-file.cpp"}]' > /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir/compile_commands.json
+ echo '[{"directory": "/vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir", "command": "/vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir/armv7-clang -x c++ the-file.cpp -v", "file": "the-file.cpp"}]'
RUN: at line 9: sed -e "s|INPUT_DIR|/vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir|g" /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/llvm-project/clang-tools-extra/clangd/test/target_info.test > /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test.1
+ sed -e 's|INPUT_DIR|/vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.dir|g' /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/llvm-project/clang-tools-extra/clangd/test/target_info.test
RUN: at line 12: sed -E -e 's|"file://([A-Z]):/|"file:///\1:/|g' /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test.1 > /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test
+ sed -E -e 's|"file://([A-Z]):/|"file:///\1:/|g' /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test.1
RUN: at line 14: clangd -lit-test < /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test 2>&1 | /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/bin/FileCheck -strict-whitespace /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test
+ clangd -lit-test
+ /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/bin/FileCheck -strict-whitespace /vol/worker/clangd-ubuntu-clang/clangd-ubuntu-tsan/build/tools/clang/tools/extra/clangd/test/Output/target_info.test.tmp.test

--

********************


searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Mar 13, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
Development

Successfully merging this pull request may close these issues.

4 participants