Skip to content

Commit 8725b24

Browse files
committed
[AArch64] Legalize horizontal fmax/fmin reductions on f16 vectors
Expand horizontal fmax/fmin reductions on f16 vectors during the instruction selection phase, but only if the target doesn't support the full fp16 instruction set.

Fixes https://bugs.llvm.org/show_bug.cgi?id=49401

Reviewed By: aemerson

Differential Revision: https://reviews.llvm.org/D97840
1 parent 5fedf30 commit 8725b24

File tree

3 files changed

+126
-5
lines changed

3 files changed

+126
-5
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1016,11 +1016,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
10161016
// Vector reductions
10171017
for (MVT VT : { MVT::v4f16, MVT::v2f32,
10181018
MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1019-
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1020-
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1019+
if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1020+
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1021+
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
10211022

1022-
if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())
10231023
setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1024+
}
10241025
}
10251026
for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
10261027
MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {

llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll

Lines changed: 61 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,13 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
2+
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP
3+
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon,+fullfp16 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
34

45
declare half @llvm.vector.reduce.fmax.v1f16(<1 x half> %a)
56
declare float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
67
declare double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
78
declare fp128 @llvm.vector.reduce.fmax.v1f128(<1 x fp128> %a)
89

10+
declare half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
911
declare float @llvm.vector.reduce.fmax.v3f32(<3 x float> %a)
1012
declare fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a)
1113
declare float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a)
@@ -44,6 +46,64 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
4446
ret fp128 %b
4547
}
4648

49+
define half @test_v4f16(<4 x half> %a) nounwind {
50+
; CHECK-NOFP-LABEL: test_v4f16:
51+
; CHECK-NOFP: // %bb.0:
52+
; CHECK-NOFP-NEXT: // kill: def $d0 killed $d0 def $q0
53+
; CHECK-NOFP-NEXT: mov h3, v0.h[1]
54+
; CHECK-NOFP-NEXT: mov h1, v0.h[3]
55+
; CHECK-NOFP-NEXT: mov h2, v0.h[2]
56+
; CHECK-NOFP-NEXT: fcvt s0, h0
57+
; CHECK-NOFP-NEXT: fcvt s3, h3
58+
; CHECK-NOFP-NEXT: fmaxnm s0, s0, s3
59+
; CHECK-NOFP-NEXT: fcvt h0, s0
60+
; CHECK-NOFP-NEXT: fcvt s2, h2
61+
; CHECK-NOFP-NEXT: fcvt s0, h0
62+
; CHECK-NOFP-NEXT: fmaxnm s0, s0, s2
63+
; CHECK-NOFP-NEXT: fcvt h0, s0
64+
; CHECK-NOFP-NEXT: fcvt s0, h0
65+
; CHECK-NOFP-NEXT: fcvt s1, h1
66+
; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
67+
; CHECK-NOFP-NEXT: fcvt h0, s0
68+
; CHECK-NOFP-NEXT: ret
69+
;
70+
; CHECK-FP-LABEL: test_v4f16:
71+
; CHECK-FP: // %bb.0:
72+
; CHECK-FP-NEXT: fmaxnmv h0, v0.4h
73+
; CHECK-FP-NEXT: ret
74+
%b = call nnan half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
75+
ret half %b
76+
}
77+
78+
define half @test_v4f16_ninf(<4 x half> %a) nounwind {
79+
; CHECK-NOFP-LABEL: test_v4f16_ninf:
80+
; CHECK-NOFP: // %bb.0:
81+
; CHECK-NOFP-NEXT: // kill: def $d0 killed $d0 def $q0
82+
; CHECK-NOFP-NEXT: mov h3, v0.h[1]
83+
; CHECK-NOFP-NEXT: mov h1, v0.h[3]
84+
; CHECK-NOFP-NEXT: mov h2, v0.h[2]
85+
; CHECK-NOFP-NEXT: fcvt s0, h0
86+
; CHECK-NOFP-NEXT: fcvt s3, h3
87+
; CHECK-NOFP-NEXT: fmaxnm s0, s0, s3
88+
; CHECK-NOFP-NEXT: fcvt h0, s0
89+
; CHECK-NOFP-NEXT: fcvt s2, h2
90+
; CHECK-NOFP-NEXT: fcvt s0, h0
91+
; CHECK-NOFP-NEXT: fmaxnm s0, s0, s2
92+
; CHECK-NOFP-NEXT: fcvt h0, s0
93+
; CHECK-NOFP-NEXT: fcvt s0, h0
94+
; CHECK-NOFP-NEXT: fcvt s1, h1
95+
; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
96+
; CHECK-NOFP-NEXT: fcvt h0, s0
97+
; CHECK-NOFP-NEXT: ret
98+
;
99+
; CHECK-FP-LABEL: test_v4f16_ninf:
100+
; CHECK-FP: // %bb.0:
101+
; CHECK-FP-NEXT: fmaxnmv h0, v0.4h
102+
; CHECK-FP-NEXT: ret
103+
%b = call nnan ninf half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
104+
ret half %b
105+
}
106+
47107
define float @test_v3f32(<3 x float> %a) nounwind {
48108
; CHECK-LABEL: test_v3f32:
49109
; CHECK: // %bb.0:

llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll

Lines changed: 61 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,13 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
2+
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP
3+
; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon,+fullfp16 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
34

45
declare half @llvm.vector.reduce.fmin.v1f16(<1 x half> %a)
56
declare float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a)
67
declare double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
78
declare fp128 @llvm.vector.reduce.fmin.v1f128(<1 x fp128> %a)
89

10+
declare half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
911
declare float @llvm.vector.reduce.fmin.v3f32(<3 x float> %a)
1012
declare fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a)
1113
declare float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a)
@@ -44,6 +46,64 @@ define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
4446
ret fp128 %b
4547
}
4648

49+
define half @test_v4f16(<4 x half> %a) nounwind {
50+
; CHECK-NOFP-LABEL: test_v4f16:
51+
; CHECK-NOFP: // %bb.0:
52+
; CHECK-NOFP-NEXT: // kill: def $d0 killed $d0 def $q0
53+
; CHECK-NOFP-NEXT: mov h3, v0.h[1]
54+
; CHECK-NOFP-NEXT: mov h1, v0.h[3]
55+
; CHECK-NOFP-NEXT: mov h2, v0.h[2]
56+
; CHECK-NOFP-NEXT: fcvt s0, h0
57+
; CHECK-NOFP-NEXT: fcvt s3, h3
58+
; CHECK-NOFP-NEXT: fminnm s0, s0, s3
59+
; CHECK-NOFP-NEXT: fcvt h0, s0
60+
; CHECK-NOFP-NEXT: fcvt s2, h2
61+
; CHECK-NOFP-NEXT: fcvt s0, h0
62+
; CHECK-NOFP-NEXT: fminnm s0, s0, s2
63+
; CHECK-NOFP-NEXT: fcvt h0, s0
64+
; CHECK-NOFP-NEXT: fcvt s0, h0
65+
; CHECK-NOFP-NEXT: fcvt s1, h1
66+
; CHECK-NOFP-NEXT: fminnm s0, s0, s1
67+
; CHECK-NOFP-NEXT: fcvt h0, s0
68+
; CHECK-NOFP-NEXT: ret
69+
;
70+
; CHECK-FP-LABEL: test_v4f16:
71+
; CHECK-FP: // %bb.0:
72+
; CHECK-FP-NEXT: fminnmv h0, v0.4h
73+
; CHECK-FP-NEXT: ret
74+
%b = call nnan half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
75+
ret half %b
76+
}
77+
78+
define half @test_v4f16_ninf(<4 x half> %a) nounwind {
79+
; CHECK-NOFP-LABEL: test_v4f16_ninf:
80+
; CHECK-NOFP: // %bb.0:
81+
; CHECK-NOFP-NEXT: // kill: def $d0 killed $d0 def $q0
82+
; CHECK-NOFP-NEXT: mov h3, v0.h[1]
83+
; CHECK-NOFP-NEXT: mov h1, v0.h[3]
84+
; CHECK-NOFP-NEXT: mov h2, v0.h[2]
85+
; CHECK-NOFP-NEXT: fcvt s0, h0
86+
; CHECK-NOFP-NEXT: fcvt s3, h3
87+
; CHECK-NOFP-NEXT: fminnm s0, s0, s3
88+
; CHECK-NOFP-NEXT: fcvt h0, s0
89+
; CHECK-NOFP-NEXT: fcvt s2, h2
90+
; CHECK-NOFP-NEXT: fcvt s0, h0
91+
; CHECK-NOFP-NEXT: fminnm s0, s0, s2
92+
; CHECK-NOFP-NEXT: fcvt h0, s0
93+
; CHECK-NOFP-NEXT: fcvt s0, h0
94+
; CHECK-NOFP-NEXT: fcvt s1, h1
95+
; CHECK-NOFP-NEXT: fminnm s0, s0, s1
96+
; CHECK-NOFP-NEXT: fcvt h0, s0
97+
; CHECK-NOFP-NEXT: ret
98+
;
99+
; CHECK-FP-LABEL: test_v4f16_ninf:
100+
; CHECK-FP: // %bb.0:
101+
; CHECK-FP-NEXT: fminnmv h0, v0.4h
102+
; CHECK-FP-NEXT: ret
103+
%b = call nnan ninf half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
104+
ret half %b
105+
}
106+
47107
define float @test_v3f32(<3 x float> %a) nounwind {
48108
; CHECK-LABEL: test_v3f32:
49109
; CHECK: // %bb.0:

0 commit comments

Comments
 (0)