-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Prevent FMINIMUM and FMAXIMUM being fully scalarized #91378
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This is the same logic as with FMINNUM_IEEE/FMAXNUM_IEEE.
@llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) ChangesThis is the same logic as with FMINNUM_IEEE/FMAXNUM_IEEE. Full diff: https://github.com/llvm/llvm-project/pull/91378.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ed41c10b50d323..33bdd6195a0408 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -854,9 +854,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasPrefetch())
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
- if (Subtarget->hasIEEEMinMax())
+ if (Subtarget->hasIEEEMinMax()) {
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
{MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
+ setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+ Custom);
+ }
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
@@ -5821,6 +5825,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
case ISD::UADDSAT:
case ISD::USUBSAT:
case ISD::SADDSAT:
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index dd685a6169d843..87ac95a1cd7390 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -148,23 +148,35 @@ define amdgpu_ps <2 x half> @test_fmaximum_v2f16_ss(<2 x half> inreg %a, <2 x ha
}
define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
-; GCN-LABEL: test_fmaximum_v3f16_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GCN-NEXT: v_maximum_f16 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: test_fmaximum_v3f16_vv:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-SDAG-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: test_fmaximum_v3f16_vv:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-GISEL-NEXT: v_maximum_f16 v1, v1, v3
+; GFX12-GISEL-NEXT: ; return to shader part epilog
%val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
ret <3 x half> %val
}
define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
-; GCN-LABEL: test_fmaximum_v3f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_maximum_f16 v0, s0, s2
-; GCN-NEXT: s_maximum_f16 s0, s1, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: test_fmaximum_v3f16_ss:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, s0, s2
+; GFX12-SDAG-NEXT: v_pk_maximum_f16 v1, s1, s3
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: test_fmaximum_v3f16_ss:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_pk_maximum_f16 v0, s0, s2
+; GFX12-GISEL-NEXT: s_maximum_f16 s0, s1, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
%val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
ret <3 x half> %val
}
@@ -306,6 +318,3 @@ declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
declare double @llvm.maximum.f64(double, double)
declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX12-GISEL: {{.*}}
-; GFX12-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index 2b3cc4fd73858c..45f6bff10f45ee 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -148,23 +148,35 @@ define amdgpu_ps <2 x half> @test_fminimum_v2f16_ss(<2 x half> inreg %a, <2 x ha
}
define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
-; GCN-LABEL: test_fminimum_v3f16_vv:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GCN-NEXT: v_minimum_f16 v1, v1, v3
-; GCN-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: test_fminimum_v3f16_vv:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-SDAG-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: test_fminimum_v3f16_vv:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-GISEL-NEXT: v_minimum_f16 v1, v1, v3
+; GFX12-GISEL-NEXT: ; return to shader part epilog
%val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
ret <3 x half> %val
}
define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
-; GCN-LABEL: test_fminimum_v3f16_ss:
-; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_minimum_f16 v0, s0, s2
-; GCN-NEXT: s_minimum_f16 s0, s1, s3
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: test_fminimum_v3f16_ss:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, s0, s2
+; GFX12-SDAG-NEXT: v_pk_minimum_f16 v1, s1, s3
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: test_fminimum_v3f16_ss:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: v_pk_minimum_f16 v0, s0, s2
+; GFX12-GISEL-NEXT: s_minimum_f16 s0, s1, s3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
%val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
ret <3 x half> %val
}
@@ -306,6 +318,3 @@ declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
declare double @llvm.minimum.f64(double, double)
declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX12-GISEL: {{.*}}
-; GFX12-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index c49e6a9a9f25cf..c476208ed8f4e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -1794,7 +1794,7 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GFX12-NEXT: v_maximum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -2013,7 +2013,7 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GFX12-NEXT: v_maximum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -2163,7 +2163,7 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GFX12-NEXT: v_maximum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -2260,7 +2260,7 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GFX12-NEXT: v_maximum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 7281c3fd64d466..66f3a48b13ee58 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -1461,7 +1461,7 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GFX12-NEXT: v_minimum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1635,7 +1635,7 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GFX12-NEXT: v_minimum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1740,7 +1740,7 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GFX12-NEXT: v_minimum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1792,7 +1792,7 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GFX12-NEXT: v_minimum_f16 v1, v1, v3
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lgtm, but we're not getting the undef-high-element-to-scalar-operation optimization
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v1, v1, v3
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We must be missing the combine to eliminate the undef high half of the vector operation too
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, although that is a separate optimization unrelated to a specific opcode. I think what we need is a general optimization that checks whether a v2 operation has an unused high element and, if the same scalar operation is legal, splits it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we already have this but I don't immediately see where it is
This is the same logic as with FMINNUM_IEEE/FMAXNUM_IEEE.