Skip to content

Commit 900c898

Browse files
committed
[AArch64] Lower fpto*i.sat intrinsics.
AArch64's fcvt* instructions implement the saturating behaviour that the fpto*i.sat intrinsics require, in cases where the destination width matches the saturation width. Lowering them removes a lot of unnecessary generated code. Only scalar lowerings are supported for now. Differential Revision: https://reviews.llvm.org/D102353
1 parent 5b614eb commit 900c898

File tree

9 files changed

+1320
-1837
lines changed

9 files changed

+1320
-1837
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
471471
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
472472
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
473473

474+
setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
475+
setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
476+
setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
477+
setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
478+
474479
// Variable arguments.
475480
setOperationAction(ISD::VASTART, MVT::Other, Custom);
476481
setOperationAction(ISD::VAARG, MVT::Other, Custom);
@@ -876,6 +881,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
876881
setTargetDAGCombine(ISD::SINT_TO_FP);
877882
setTargetDAGCombine(ISD::UINT_TO_FP);
878883

884+
// TODO: Do the same for FP_TO_*INT_SAT.
879885
setTargetDAGCombine(ISD::FP_TO_SINT);
880886
setTargetDAGCombine(ISD::FP_TO_UINT);
881887
setTargetDAGCombine(ISD::FDIV);
@@ -3292,6 +3298,44 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
32923298
return SDValue();
32933299
}
32943300

3301+
SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
3302+
SelectionDAG &DAG) const {
3303+
// AArch64 FP-to-int conversions saturate to the destination register size, so
3304+
// we can lower common saturating conversions to simple instructions.
3305+
SDValue SrcVal = Op.getOperand(0);
3306+
3307+
EVT SrcVT = SrcVal.getValueType();
3308+
EVT DstVT = Op.getValueType();
3309+
3310+
EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3311+
uint64_t SatWidth = SatVT.getScalarSizeInBits();
3312+
uint64_t DstWidth = DstVT.getScalarSizeInBits();
3313+
assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3314+
3315+
// TODO: Support lowering of NEON and SVE conversions.
3316+
if (SrcVT.isVector())
3317+
return SDValue();
3318+
3319+
// TODO: Saturate to SatWidth explicitly.
3320+
if (SatWidth != DstWidth)
3321+
return SDValue();
3322+
3323+
// In the absence of FP16 support, promote f16 to f32, like LowerFP_TO_INT().
3324+
if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16())
3325+
return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
3326+
DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal),
3327+
Op.getOperand(1));
3328+
3329+
// Cases that we can emit directly.
3330+
if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
3331+
(SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
3332+
(DstVT == MVT::i64 || DstVT == MVT::i32))
3333+
return Op;
3334+
3335+
// For all other cases, fall back on the expanded form.
3336+
return SDValue();
3337+
}
3338+
32953339
SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
32963340
SelectionDAG &DAG) const {
32973341
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
@@ -4553,6 +4597,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
45534597
case ISD::STRICT_FP_TO_SINT:
45544598
case ISD::STRICT_FP_TO_UINT:
45554599
return LowerFP_TO_INT(Op, DAG);
4600+
case ISD::FP_TO_SINT_SAT:
4601+
case ISD::FP_TO_UINT_SAT:
4602+
return LowerFP_TO_INT_SAT(Op, DAG);
45564603
case ISD::FSINCOS:
45574604
return LowerFSINCOS(Op, DAG);
45584605
case ISD::FLT_ROUNDS_:

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,7 @@ class AArch64TargetLowering : public TargetLowering {
960960
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
961961
SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
962962
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
963+
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
963964
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
964965
SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
965966
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3692,6 +3692,25 @@ defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
36923692
defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>;
36933693
defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
36943694

3695+
// AArch64's FCVT instructions saturate when out of range.
3696+
multiclass FPToIntegerSatPats<SDNode to_int_sat, string INST> {
3697+
def : Pat<(i32 (to_int_sat f16:$Rn, i32)),
3698+
(!cast<Instruction>(INST # UWHr) f16:$Rn)>;
3699+
def : Pat<(i32 (to_int_sat f32:$Rn, i32)),
3700+
(!cast<Instruction>(INST # UWSr) f32:$Rn)>;
3701+
def : Pat<(i32 (to_int_sat f64:$Rn, i32)),
3702+
(!cast<Instruction>(INST # UWDr) f64:$Rn)>;
3703+
def : Pat<(i64 (to_int_sat f16:$Rn, i64)),
3704+
(!cast<Instruction>(INST # UXHr) f16:$Rn)>;
3705+
def : Pat<(i64 (to_int_sat f32:$Rn, i64)),
3706+
(!cast<Instruction>(INST # UXSr) f32:$Rn)>;
3707+
def : Pat<(i64 (to_int_sat f64:$Rn, i64)),
3708+
(!cast<Instruction>(INST # UXDr) f64:$Rn)>;
3709+
}
3710+
3711+
defm : FPToIntegerSatPats<fp_to_sint_sat, "FCVTZS">;
3712+
defm : FPToIntegerSatPats<fp_to_uint_sat, "FCVTZU">;
3713+
36953714
multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
36963715
def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>;
36973716
def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>;
@@ -3717,7 +3736,7 @@ multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
37173736
defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;
37183737
defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">;
37193738

3720-
multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
3739+
multiclass FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode round, string INST> {
37213740
def : Pat<(i32 (to_int (round f32:$Rn))),
37223741
(!cast<Instruction>(INST # UWSr) f32:$Rn)>;
37233742
def : Pat<(i64 (to_int (round f32:$Rn))),
@@ -3726,16 +3745,32 @@ multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
37263745
(!cast<Instruction>(INST # UWDr) f64:$Rn)>;
37273746
def : Pat<(i64 (to_int (round f64:$Rn))),
37283747
(!cast<Instruction>(INST # UXDr) f64:$Rn)>;
3748+
3749+
// These instructions saturate like fp_to_[su]int_sat.
3750+
def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)),
3751+
(!cast<Instruction>(INST # UWHr) f16:$Rn)>;
3752+
def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)),
3753+
(!cast<Instruction>(INST # UXHr) f16:$Rn)>;
3754+
def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)),
3755+
(!cast<Instruction>(INST # UWSr) f32:$Rn)>;
3756+
def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)),
3757+
(!cast<Instruction>(INST # UXSr) f32:$Rn)>;
3758+
def : Pat<(i32 (to_int_sat (round f64:$Rn), i32)),
3759+
(!cast<Instruction>(INST # UWDr) f64:$Rn)>;
3760+
def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)),
3761+
(!cast<Instruction>(INST # UXDr) f64:$Rn)>;
37293762
}
37303763

3731-
defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">;
3732-
defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">;
3733-
defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">;
3734-
defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">;
3735-
defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">;
3736-
defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
3737-
defm : FPToIntegerPats<fp_to_sint, fround, "FCVTAS">;
3738-
defm : FPToIntegerPats<fp_to_uint, fround, "FCVTAU">;
3764+
defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fceil, "FCVTPS">;
3765+
defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fceil, "FCVTPU">;
3766+
defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ffloor, "FCVTMS">;
3767+
defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ffloor, "FCVTMU">;
3768+
defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ftrunc, "FCVTZS">;
3769+
defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ftrunc, "FCVTZU">;
3770+
defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fround, "FCVTAS">;
3771+
defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fround, "FCVTAU">;
3772+
3773+
37393774

37403775
let Predicates = [HasFullFP16] in {
37413776
def : Pat<(i32 (lround f16:$Rn)),

llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll

Lines changed: 26 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
2+
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
3+
; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP16
34

45
;
56
; 32-bit float to signed integer
@@ -106,19 +107,7 @@ define i19 @test_signed_i19_f32(float %f) nounwind {
106107
define i32 @test_signed_i32_f32(float %f) nounwind {
107108
; CHECK-LABEL: test_signed_i32_f32:
108109
; CHECK: // %bb.0:
109-
; CHECK-NEXT: mov w9, #-822083584
110-
; CHECK-NEXT: mov w11, #1325400063
111-
; CHECK-NEXT: fmov s1, w9
112-
; CHECK-NEXT: fcvtzs w8, s0
113-
; CHECK-NEXT: mov w10, #-2147483648
114-
; CHECK-NEXT: fcmp s0, s1
115-
; CHECK-NEXT: fmov s1, w11
116-
; CHECK-NEXT: mov w12, #2147483647
117-
; CHECK-NEXT: csel w8, w10, w8, lt
118-
; CHECK-NEXT: fcmp s0, s1
119-
; CHECK-NEXT: csel w8, w12, w8, gt
120-
; CHECK-NEXT: fcmp s0, s0
121-
; CHECK-NEXT: csel w0, wzr, w8, vs
110+
; CHECK-NEXT: fcvtzs w0, s0
122111
; CHECK-NEXT: ret
123112
%x = call i32 @llvm.fptosi.sat.i32.f32(float %f)
124113
ret i32 %x
@@ -148,19 +137,7 @@ define i50 @test_signed_i50_f32(float %f) nounwind {
148137
define i64 @test_signed_i64_f32(float %f) nounwind {
149138
; CHECK-LABEL: test_signed_i64_f32:
150139
; CHECK: // %bb.0:
151-
; CHECK-NEXT: mov w9, #-553648128
152-
; CHECK-NEXT: mov w11, #1593835519
153-
; CHECK-NEXT: fmov s1, w9
154-
; CHECK-NEXT: fcvtzs x8, s0
155-
; CHECK-NEXT: mov x10, #-9223372036854775808
156-
; CHECK-NEXT: fcmp s0, s1
157-
; CHECK-NEXT: fmov s1, w11
158-
; CHECK-NEXT: mov x12, #9223372036854775807
159-
; CHECK-NEXT: csel x8, x10, x8, lt
160-
; CHECK-NEXT: fcmp s0, s1
161-
; CHECK-NEXT: csel x8, x12, x8, gt
162-
; CHECK-NEXT: fcmp s0, s0
163-
; CHECK-NEXT: csel x0, xzr, x8, vs
140+
; CHECK-NEXT: fcvtzs x0, s0
164141
; CHECK-NEXT: ret
165142
%x = call i64 @llvm.fptosi.sat.i64.f32(float %f)
166143
ret i64 %x
@@ -330,16 +307,7 @@ define i19 @test_signed_i19_f64(double %f) nounwind {
330307
define i32 @test_signed_i32_f64(double %f) nounwind {
331308
; CHECK-LABEL: test_signed_i32_f64:
332309
; CHECK: // %bb.0:
333-
; CHECK-NEXT: mov x8, #-4476578029606273024
334-
; CHECK-NEXT: mov x9, #281474972516352
335-
; CHECK-NEXT: movk x9, #16863, lsl #48
336-
; CHECK-NEXT: fmov d1, x8
337-
; CHECK-NEXT: fmaxnm d1, d0, d1
338-
; CHECK-NEXT: fmov d2, x9
339-
; CHECK-NEXT: fminnm d1, d1, d2
340-
; CHECK-NEXT: fcvtzs w8, d1
341-
; CHECK-NEXT: fcmp d0, d0
342-
; CHECK-NEXT: csel w0, wzr, w8, vs
310+
; CHECK-NEXT: fcvtzs w0, d0
343311
; CHECK-NEXT: ret
344312
%x = call i32 @llvm.fptosi.sat.i32.f64(double %f)
345313
ret i32 %x
@@ -366,19 +334,7 @@ define i50 @test_signed_i50_f64(double %f) nounwind {
366334
define i64 @test_signed_i64_f64(double %f) nounwind {
367335
; CHECK-LABEL: test_signed_i64_f64:
368336
; CHECK: // %bb.0:
369-
; CHECK-NEXT: mov x9, #-4332462841530417152
370-
; CHECK-NEXT: mov x11, #4890909195324358655
371-
; CHECK-NEXT: fmov d1, x9
372-
; CHECK-NEXT: fcvtzs x8, d0
373-
; CHECK-NEXT: mov x10, #-9223372036854775808
374-
; CHECK-NEXT: fcmp d0, d1
375-
; CHECK-NEXT: fmov d1, x11
376-
; CHECK-NEXT: mov x12, #9223372036854775807
377-
; CHECK-NEXT: csel x8, x10, x8, lt
378-
; CHECK-NEXT: fcmp d0, d1
379-
; CHECK-NEXT: csel x8, x12, x8, gt
380-
; CHECK-NEXT: fcmp d0, d0
381-
; CHECK-NEXT: csel x0, xzr, x8, vs
337+
; CHECK-NEXT: fcvtzs x0, d0
382338
; CHECK-NEXT: ret
383339
%x = call i64 @llvm.fptosi.sat.i64.f64(double %f)
384340
ret i64 %x
@@ -550,23 +506,16 @@ define i19 @test_signed_i19_f16(half %f) nounwind {
550506
}
551507

552508
define i32 @test_signed_i32_f16(half %f) nounwind {
553-
; CHECK-LABEL: test_signed_i32_f16:
554-
; CHECK: // %bb.0:
555-
; CHECK-NEXT: mov w8, #-822083584
556-
; CHECK-NEXT: fcvt s0, h0
557-
; CHECK-NEXT: fmov s1, w8
558-
; CHECK-NEXT: mov w8, #1325400063
559-
; CHECK-NEXT: mov w9, #-2147483648
560-
; CHECK-NEXT: fcmp s0, s1
561-
; CHECK-NEXT: fmov s1, w8
562-
; CHECK-NEXT: fcvtzs w8, s0
563-
; CHECK-NEXT: csel w8, w9, w8, lt
564-
; CHECK-NEXT: mov w9, #2147483647
565-
; CHECK-NEXT: fcmp s0, s1
566-
; CHECK-NEXT: csel w8, w9, w8, gt
567-
; CHECK-NEXT: fcmp s0, s0
568-
; CHECK-NEXT: csel w0, wzr, w8, vs
569-
; CHECK-NEXT: ret
509+
; CHECK-CVT-LABEL: test_signed_i32_f16:
510+
; CHECK-CVT: // %bb.0:
511+
; CHECK-CVT-NEXT: fcvt s0, h0
512+
; CHECK-CVT-NEXT: fcvtzs w0, s0
513+
; CHECK-CVT-NEXT: ret
514+
;
515+
; CHECK-FP16-LABEL: test_signed_i32_f16:
516+
; CHECK-FP16: // %bb.0:
517+
; CHECK-FP16-NEXT: fcvtzs w0, h0
518+
; CHECK-FP16-NEXT: ret
570519
%x = call i32 @llvm.fptosi.sat.i32.f16(half %f)
571520
ret i32 %x
572521
}
@@ -594,23 +543,16 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
594543
}
595544

596545
define i64 @test_signed_i64_f16(half %f) nounwind {
597-
; CHECK-LABEL: test_signed_i64_f16:
598-
; CHECK: // %bb.0:
599-
; CHECK-NEXT: mov w8, #-553648128
600-
; CHECK-NEXT: fcvt s0, h0
601-
; CHECK-NEXT: fmov s1, w8
602-
; CHECK-NEXT: mov w8, #1593835519
603-
; CHECK-NEXT: mov x9, #-9223372036854775808
604-
; CHECK-NEXT: fcmp s0, s1
605-
; CHECK-NEXT: fmov s1, w8
606-
; CHECK-NEXT: fcvtzs x8, s0
607-
; CHECK-NEXT: csel x8, x9, x8, lt
608-
; CHECK-NEXT: mov x9, #9223372036854775807
609-
; CHECK-NEXT: fcmp s0, s1
610-
; CHECK-NEXT: csel x8, x9, x8, gt
611-
; CHECK-NEXT: fcmp s0, s0
612-
; CHECK-NEXT: csel x0, xzr, x8, vs
613-
; CHECK-NEXT: ret
546+
; CHECK-CVT-LABEL: test_signed_i64_f16:
547+
; CHECK-CVT: // %bb.0:
548+
; CHECK-CVT-NEXT: fcvt s0, h0
549+
; CHECK-CVT-NEXT: fcvtzs x0, s0
550+
; CHECK-CVT-NEXT: ret
551+
;
552+
; CHECK-FP16-LABEL: test_signed_i64_f16:
553+
; CHECK-FP16: // %bb.0:
554+
; CHECK-FP16-NEXT: fcvtzs x0, h0
555+
; CHECK-FP16-NEXT: ret
614556
%x = call i64 @llvm.fptosi.sat.i64.f16(half %f)
615557
ret i64 %x
616558
}

0 commit comments

Comments
 (0)