Skip to content

Commit 2a3903f

Browse files
authored
[AMDGPU] Prevent FMINIMUM and FMAXIMUM from being fully scalarized (#91378)
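This applies the same logic already used for FMINNUM_IEEE/FMAXNUM_IEEE: on subtargets with IEEE min/max support, FMINIMUM and FMAXIMUM on the wider f16 vector types (v4f16, v8f16, v16f16, v32f16) are marked Custom and handled in LowerOperation alongside those opcodes, rather than being fully scalarized.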
1 parent bc8a427 commit 2a3903f

File tree

5 files changed

+63
-39
lines changed

5 files changed

+63
-39
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -854,9 +854,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
854854
if (Subtarget->hasPrefetch())
855855
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
856856

857-
if (Subtarget->hasIEEEMinMax())
857+
if (Subtarget->hasIEEEMinMax()) {
858858
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
859859
{MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
860+
setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
861+
{MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
862+
Custom);
863+
}
860864

861865
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
862866
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
@@ -5821,6 +5825,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
58215825
case ISD::FMUL:
58225826
case ISD::FMINNUM_IEEE:
58235827
case ISD::FMAXNUM_IEEE:
5828+
case ISD::FMINIMUM:
5829+
case ISD::FMAXIMUM:
58245830
case ISD::UADDSAT:
58255831
case ISD::USUBSAT:
58265832
case ISD::SADDSAT:

llvm/test/CodeGen/AMDGPU/fmaximum.ll

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -148,23 +148,35 @@ define amdgpu_ps <2 x half> @test_fmaximum_v2f16_ss(<2 x half> inreg %a, <2 x ha
148148
}
149149

150150
define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
151-
; GCN-LABEL: test_fmaximum_v3f16_vv:
152-
; GCN: ; %bb.0:
153-
; GCN-NEXT: v_pk_maximum_f16 v0, v0, v2
154-
; GCN-NEXT: v_maximum_f16 v1, v1, v3
155-
; GCN-NEXT: ; return to shader part epilog
151+
; GFX12-SDAG-LABEL: test_fmaximum_v3f16_vv:
152+
; GFX12-SDAG: ; %bb.0:
153+
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, v0, v2
154+
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v1, v1, v3
155+
; GFX12-SDAG-NEXT: ; return to shader part epilog
156+
;
157+
; GFX12-GISEL-LABEL: test_fmaximum_v3f16_vv:
158+
; GFX12-GISEL: ; %bb.0:
159+
; GFX12-GISEL-NEXT: v_pk_maximum_f16 v0, v0, v2
160+
; GFX12-GISEL-NEXT: v_maximum_f16 v1, v1, v3
161+
; GFX12-GISEL-NEXT: ; return to shader part epilog
156162
%val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
157163
ret <3 x half> %val
158164
}
159165

160166
define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
161-
; GCN-LABEL: test_fmaximum_v3f16_ss:
162-
; GCN: ; %bb.0:
163-
; GCN-NEXT: v_pk_maximum_f16 v0, s0, s2
164-
; GCN-NEXT: s_maximum_f16 s0, s1, s3
165-
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
166-
; GCN-NEXT: v_mov_b32_e32 v1, s0
167-
; GCN-NEXT: ; return to shader part epilog
167+
; GFX12-SDAG-LABEL: test_fmaximum_v3f16_ss:
168+
; GFX12-SDAG: ; %bb.0:
169+
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v0, s0, s2
170+
; GFX12-SDAG-NEXT: v_pk_maximum_f16 v1, s1, s3
171+
; GFX12-SDAG-NEXT: ; return to shader part epilog
172+
;
173+
; GFX12-GISEL-LABEL: test_fmaximum_v3f16_ss:
174+
; GFX12-GISEL: ; %bb.0:
175+
; GFX12-GISEL-NEXT: v_pk_maximum_f16 v0, s0, s2
176+
; GFX12-GISEL-NEXT: s_maximum_f16 s0, s1, s3
177+
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
178+
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
179+
; GFX12-GISEL-NEXT: ; return to shader part epilog
168180
%val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
169181
ret <3 x half> %val
170182
}
@@ -306,6 +318,3 @@ declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
306318
declare double @llvm.maximum.f64(double, double)
307319
declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
308320
declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
309-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
310-
; GFX12-GISEL: {{.*}}
311-
; GFX12-SDAG: {{.*}}

llvm/test/CodeGen/AMDGPU/fminimum.ll

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -148,23 +148,35 @@ define amdgpu_ps <2 x half> @test_fminimum_v2f16_ss(<2 x half> inreg %a, <2 x ha
148148
}
149149

150150
define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b) {
151-
; GCN-LABEL: test_fminimum_v3f16_vv:
152-
; GCN: ; %bb.0:
153-
; GCN-NEXT: v_pk_minimum_f16 v0, v0, v2
154-
; GCN-NEXT: v_minimum_f16 v1, v1, v3
155-
; GCN-NEXT: ; return to shader part epilog
151+
; GFX12-SDAG-LABEL: test_fminimum_v3f16_vv:
152+
; GFX12-SDAG: ; %bb.0:
153+
; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, v0, v2
154+
; GFX12-SDAG-NEXT: v_pk_minimum_f16 v1, v1, v3
155+
; GFX12-SDAG-NEXT: ; return to shader part epilog
156+
;
157+
; GFX12-GISEL-LABEL: test_fminimum_v3f16_vv:
158+
; GFX12-GISEL: ; %bb.0:
159+
; GFX12-GISEL-NEXT: v_pk_minimum_f16 v0, v0, v2
160+
; GFX12-GISEL-NEXT: v_minimum_f16 v1, v1, v3
161+
; GFX12-GISEL-NEXT: ; return to shader part epilog
156162
%val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
157163
ret <3 x half> %val
158164
}
159165

160166
define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x half> inreg %b) {
161-
; GCN-LABEL: test_fminimum_v3f16_ss:
162-
; GCN: ; %bb.0:
163-
; GCN-NEXT: v_pk_minimum_f16 v0, s0, s2
164-
; GCN-NEXT: s_minimum_f16 s0, s1, s3
165-
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
166-
; GCN-NEXT: v_mov_b32_e32 v1, s0
167-
; GCN-NEXT: ; return to shader part epilog
167+
; GFX12-SDAG-LABEL: test_fminimum_v3f16_ss:
168+
; GFX12-SDAG: ; %bb.0:
169+
; GFX12-SDAG-NEXT: v_pk_minimum_f16 v0, s0, s2
170+
; GFX12-SDAG-NEXT: v_pk_minimum_f16 v1, s1, s3
171+
; GFX12-SDAG-NEXT: ; return to shader part epilog
172+
;
173+
; GFX12-GISEL-LABEL: test_fminimum_v3f16_ss:
174+
; GFX12-GISEL: ; %bb.0:
175+
; GFX12-GISEL-NEXT: v_pk_minimum_f16 v0, s0, s2
176+
; GFX12-GISEL-NEXT: s_minimum_f16 s0, s1, s3
177+
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
178+
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
179+
; GFX12-GISEL-NEXT: ; return to shader part epilog
168180
%val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
169181
ret <3 x half> %val
170182
}
@@ -306,6 +318,3 @@ declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
306318
declare double @llvm.minimum.f64(double, double)
307319
declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
308320
declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
309-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
310-
; GFX12-GISEL: {{.*}}
311-
; GFX12-SDAG: {{.*}}

llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1794,7 +1794,7 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) {
17941794
; GFX12-NEXT: s_wait_bvhcnt 0x0
17951795
; GFX12-NEXT: s_wait_kmcnt 0x0
17961796
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
1797-
; GFX12-NEXT: v_maximum_f16 v1, v1, v3
1797+
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
17981798
; GFX12-NEXT: s_setpc_b64 s[30:31]
17991799
%op = call <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
18001800
ret <3 x half> %op
@@ -2013,7 +2013,7 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
20132013
; GFX12-NEXT: s_wait_bvhcnt 0x0
20142014
; GFX12-NEXT: s_wait_kmcnt 0x0
20152015
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
2016-
; GFX12-NEXT: v_maximum_f16 v1, v1, v3
2016+
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
20172017
; GFX12-NEXT: s_setpc_b64 s[30:31]
20182018
%op = call nnan <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
20192019
ret <3 x half> %op
@@ -2163,7 +2163,7 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
21632163
; GFX12-NEXT: s_wait_bvhcnt 0x0
21642164
; GFX12-NEXT: s_wait_kmcnt 0x0
21652165
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
2166-
; GFX12-NEXT: v_maximum_f16 v1, v1, v3
2166+
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
21672167
; GFX12-NEXT: s_setpc_b64 s[30:31]
21682168
%op = call nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
21692169
ret <3 x half> %op
@@ -2260,7 +2260,7 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
22602260
; GFX12-NEXT: s_wait_bvhcnt 0x0
22612261
; GFX12-NEXT: s_wait_kmcnt 0x0
22622262
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
2263-
; GFX12-NEXT: v_maximum_f16 v1, v1, v3
2263+
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
22642264
; GFX12-NEXT: s_setpc_b64 s[30:31]
22652265
%op = call nnan nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
22662266
ret <3 x half> %op

llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1461,7 +1461,7 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) {
14611461
; GFX12-NEXT: s_wait_bvhcnt 0x0
14621462
; GFX12-NEXT: s_wait_kmcnt 0x0
14631463
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
1464-
; GFX12-NEXT: v_minimum_f16 v1, v1, v3
1464+
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
14651465
; GFX12-NEXT: s_setpc_b64 s[30:31]
14661466
%op = call <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
14671467
ret <3 x half> %op
@@ -1635,7 +1635,7 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
16351635
; GFX12-NEXT: s_wait_bvhcnt 0x0
16361636
; GFX12-NEXT: s_wait_kmcnt 0x0
16371637
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
1638-
; GFX12-NEXT: v_minimum_f16 v1, v1, v3
1638+
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
16391639
; GFX12-NEXT: s_setpc_b64 s[30:31]
16401640
%op = call nnan <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
16411641
ret <3 x half> %op
@@ -1740,7 +1740,7 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
17401740
; GFX12-NEXT: s_wait_bvhcnt 0x0
17411741
; GFX12-NEXT: s_wait_kmcnt 0x0
17421742
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
1743-
; GFX12-NEXT: v_minimum_f16 v1, v1, v3
1743+
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
17441744
; GFX12-NEXT: s_setpc_b64 s[30:31]
17451745
%op = call nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
17461746
ret <3 x half> %op
@@ -1792,7 +1792,7 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
17921792
; GFX12-NEXT: s_wait_bvhcnt 0x0
17931793
; GFX12-NEXT: s_wait_kmcnt 0x0
17941794
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
1795-
; GFX12-NEXT: v_minimum_f16 v1, v1, v3
1795+
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
17961796
; GFX12-NEXT: s_setpc_b64 s[30:31]
17971797
%op = call nnan nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
17981798
ret <3 x half> %op

0 commit comments

Comments
 (0)