Skip to content

Commit ba461f8

Browse files
authored
[AArch64][GlobalISel] Legalize fp128 types as libcalls for G_FCMP (#98452)
- Generate a libcall for each predicate that has a directly supported soft-float comparison routine.
- Generate unsupported predicates as combinations of supported predicates.
- Vectors are scalarized; however, some cases such as `v3f128_fp128` still fail because G_OR cannot yet be legalized for these types.

GISel now generates the same code as SDAG; note, however, the difference in the `one` case.
1 parent 6f37d42 commit ba461f8

File tree

6 files changed

+1153
-255
lines changed

6 files changed

+1153
-255
lines changed

llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,9 @@ class LegalizerHelper {
280280
LegalizeResult createResetStateLibcall(MachineIRBuilder &MIRBuilder,
281281
MachineInstr &MI,
282282
LostDebugLocObserver &LocObserver);
283+
/// Lower the G_FCMP instruction \p MI to soft-float comparison libcalls,
/// each followed by an integer compare of the libcall result against zero.
/// Only 128-bit scalar operands are handled; any other operand type, or a
/// predicate with no libcall mapping, yields UnableToLegalize.
/// \p MI itself is not erased here; the caller removes it on success.
LegalizeResult createFCMPLibcall(MachineIRBuilder &MIRBuilder,
284+
MachineInstr &MI,
285+
LostDebugLocObserver &LocObserver);
283286

284287
MachineInstrBuilder
285288
getNeutralElementForVecReduce(unsigned Opcode, MachineIRBuilder &MIRBuilder,

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

Lines changed: 152 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -735,8 +735,7 @@ static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
735735
if (MemType.isVector())
736736
return RTLIB::UNKNOWN_LIBCALL;
737737

738-
#define LCALLS(A, B) \
739-
{ A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
738+
#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
740739
#define LCALL5(A) \
741740
LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
742741
switch (Opc) {
@@ -992,6 +991,150 @@ LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
992991
LocObserver, nullptr);
993992
}
994993

994+
/// Returns the corresponding libcall for the given Pred and
995+
/// the ICMP predicate that should be generated to compare with #0
996+
/// after the libcall.
997+
static std::pair<RTLIB::Libcall, CmpInst::Predicate>
998+
getFCMPLibcallDesc(const CmpInst::Predicate Pred) {
999+
1000+
switch (Pred) {
1001+
case CmpInst::FCMP_OEQ:
1002+
return {RTLIB::OEQ_F128, CmpInst::ICMP_EQ};
1003+
case CmpInst::FCMP_UNE:
1004+
return {RTLIB::UNE_F128, CmpInst::ICMP_NE};
1005+
case CmpInst::FCMP_OGE:
1006+
return {RTLIB::OGE_F128, CmpInst::ICMP_SGE};
1007+
case CmpInst::FCMP_OLT:
1008+
return {RTLIB::OLT_F128, CmpInst::ICMP_SLT};
1009+
case CmpInst::FCMP_OLE:
1010+
return {RTLIB::OLE_F128, CmpInst::ICMP_SLE};
1011+
case CmpInst::FCMP_OGT:
1012+
return {RTLIB::OGT_F128, CmpInst::ICMP_SGT};
1013+
case CmpInst::FCMP_UNO:
1014+
return {RTLIB::UO_F128, CmpInst::ICMP_NE};
1015+
default:
1016+
return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
1017+
}
1018+
}
1019+
1020+
LegalizerHelper::LegalizeResult
1021+
LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
1022+
MachineInstr &MI,
1023+
LostDebugLocObserver &LocObserver) {
1024+
auto &MF = MIRBuilder.getMF();
1025+
auto &Ctx = MF.getFunction().getContext();
1026+
const GFCmp *Cmp = cast<GFCmp>(&MI);
1027+
1028+
LLT OpLLT = MRI.getType(Cmp->getLHSReg());
1029+
if (OpLLT != LLT::scalar(128) || OpLLT != MRI.getType(Cmp->getRHSReg()))
1030+
return UnableToLegalize;
1031+
1032+
Type *OpType = getFloatTypeForLLT(Ctx, OpLLT);
1033+
1034+
// DstReg type is s32
1035+
const Register DstReg = Cmp->getReg(0);
1036+
const auto Cond = Cmp->getCond();
1037+
1038+
// Reference:
1039+
// https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
1040+
// Generates a libcall followed by ICMP.
1041+
const auto BuildLibcall =
1042+
[&](const RTLIB::Libcall Libcall, const CmpInst::Predicate ICmpPred,
1043+
const DstOp &Res = LLT::scalar(32)) -> Register {
1044+
// FCMP libcall always returns an i32, and needs an ICMP with #0.
1045+
constexpr LLT TempLLT = LLT::scalar(32);
1046+
Register Temp = MRI.createGenericVirtualRegister(TempLLT);
1047+
// Generate libcall, holding result in Temp
1048+
const auto Status = createLibcall(
1049+
MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
1050+
{{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
1051+
LocObserver, &MI);
1052+
if (!Status)
1053+
return {};
1054+
1055+
// Compare temp with #0 to get the final result.
1056+
return MIRBuilder
1057+
.buildICmp(ICmpPred, Res, Temp, MIRBuilder.buildConstant(TempLLT, 0))
1058+
.getReg(0);
1059+
};
1060+
1061+
// Simple case if we have a direct mapping from predicate to libcall
1062+
if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Cond);
1063+
Libcall != RTLIB::UNKNOWN_LIBCALL &&
1064+
ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
1065+
if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
1066+
return Legalized;
1067+
}
1068+
return UnableToLegalize;
1069+
}
1070+
1071+
// No direct mapping found, should be generated as combination of libcalls.
1072+
1073+
switch (Cond) {
1074+
case CmpInst::FCMP_UEQ: {
1075+
// FCMP_UEQ: unordered or equal
1076+
// Convert into (FCMP_OEQ || FCMP_UNO).
1077+
1078+
const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
1079+
const auto Oeq = BuildLibcall(OeqLibcall, OeqPred);
1080+
1081+
const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
1082+
const auto Uno = BuildLibcall(UnoLibcall, UnoPred);
1083+
if (Oeq && Uno)
1084+
MIRBuilder.buildOr(DstReg, Oeq, Uno);
1085+
else
1086+
return UnableToLegalize;
1087+
1088+
break;
1089+
}
1090+
case CmpInst::FCMP_ONE: {
1091+
// FCMP_ONE: ordered and operands are unequal
1092+
// Convert into (!FCMP_OEQ && !FCMP_UNO).
1093+
1094+
// We inverse the predicate instead of generating a NOT
1095+
// to save one instruction.
1096+
// On AArch64 isel can even select two cmp into a single ccmp.
1097+
const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
1098+
const auto NotOeq =
1099+
BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(OeqPred));
1100+
1101+
const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
1102+
const auto NotUno =
1103+
BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(UnoPred));
1104+
1105+
if (NotOeq && NotUno)
1106+
MIRBuilder.buildAnd(DstReg, NotOeq, NotUno);
1107+
else
1108+
return UnableToLegalize;
1109+
1110+
break;
1111+
}
1112+
case CmpInst::FCMP_ULT:
1113+
case CmpInst::FCMP_UGE:
1114+
case CmpInst::FCMP_UGT:
1115+
case CmpInst::FCMP_ULE:
1116+
case CmpInst::FCMP_ORD: {
1117+
// Convert into: !(inverse(Pred))
1118+
// E.g. FCMP_ULT becomes !FCMP_OGE
1119+
// This is equivalent to the following, but saves some instructions.
1120+
// MIRBuilder.buildNot(
1121+
// PredTy,
1122+
// MIRBuilder.buildFCmp(CmpInst::getInversePredicate(Pred), PredTy,
1123+
// Op1, Op2));
1124+
const auto [InversedLibcall, InversedPred] =
1125+
getFCMPLibcallDesc(CmpInst::getInversePredicate(Cond));
1126+
if (!BuildLibcall(InversedLibcall,
1127+
CmpInst::getInversePredicate(InversedPred), DstReg))
1128+
return UnableToLegalize;
1129+
break;
1130+
}
1131+
default:
1132+
return UnableToLegalize;
1133+
}
1134+
1135+
return Legalized;
1136+
}
1137+
9951138
// The function is used to legalize operations that set default environment
9961139
// state. In C library a call like `fesetmode(FE_DFL_MODE)` is used for that.
9971140
// On most targets supported in glibc FE_DFL_MODE is defined as
@@ -1138,6 +1281,13 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
11381281
return Status;
11391282
break;
11401283
}
1284+
case TargetOpcode::G_FCMP: {
1285+
LegalizeResult Status = createFCMPLibcall(MIRBuilder, MI, LocObserver);
1286+
if (Status != Legalized)
1287+
return Status;
1288+
MI.eraseFromParent();
1289+
return Status;
1290+
}
11411291
case TargetOpcode::G_FPTOSI:
11421292
case TargetOpcode::G_FPTOUI: {
11431293
// FIXME: Support other types

llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
561561
})
562562
.widenScalarOrEltToNextPow2(1)
563563
.clampScalar(0, s32, s32)
564-
.clampScalarOrElt(1, MinFPScalar, s64)
564+
.minScalarOrElt(1, MinFPScalar)
565+
.scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
565566
.minScalarEltSameAsIf(
566567
[=](const LegalityQuery &Query) {
567568
const LLT &Ty = Query.Types[0];
@@ -573,7 +574,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
573574
.clampNumElements(1, v4s16, v8s16)
574575
.clampNumElements(1, v2s32, v4s32)
575576
.clampMaxNumElements(1, s64, 2)
576-
.moreElementsToNextPow2(1);
577+
.moreElementsToNextPow2(1)
578+
.libcallFor({{s32, s128}});
577579

578580
// Extensions
579581
auto ExtLegalFunc = [=](const LegalityQuery &Query) {

llvm/test/CodeGen/AArch64/arm64-ccmp.ll

Lines changed: 46 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp | FileCheck %s --check-prefixes=CHECK,SDISEL
3-
; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=CHECK,GISEL
3+
; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel | FileCheck %s --check-prefixes=CHECK,GISEL
44
target triple = "arm64-apple-ios"
55

66
define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
@@ -950,29 +950,51 @@ define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32
950950
; Also verify that we don't try to generate f128 FCCMPs, using RT calls instead.
951951

952952
define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 %b) #0 {
953-
; CHECK-LABEL: f128_select_and_olt_oge:
954-
; CHECK: ; %bb.0:
955-
; CHECK-NEXT: sub sp, sp, #80
956-
; CHECK-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
957-
; CHECK-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
958-
; CHECK-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
959-
; CHECK-NEXT: mov x19, x1
960-
; CHECK-NEXT: mov x20, x0
961-
; CHECK-NEXT: stp q2, q3, [sp] ; 32-byte Folded Spill
962-
; CHECK-NEXT: bl ___lttf2
963-
; CHECK-NEXT: cmp w0, #0
964-
; CHECK-NEXT: cset w21, lt
965-
; CHECK-NEXT: ldp q0, q1, [sp] ; 32-byte Folded Reload
966-
; CHECK-NEXT: bl ___getf2
967-
; CHECK-NEXT: cmp w0, #0
968-
; CHECK-NEXT: cset w8, ge
969-
; CHECK-NEXT: tst w8, w21
970-
; CHECK-NEXT: csel w0, w20, w19, ne
971-
; CHECK-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
972-
; CHECK-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
973-
; CHECK-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
974-
; CHECK-NEXT: add sp, sp, #80
975-
; CHECK-NEXT: ret
953+
; SDISEL-LABEL: f128_select_and_olt_oge:
954+
; SDISEL: ; %bb.0:
955+
; SDISEL-NEXT: sub sp, sp, #80
956+
; SDISEL-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
957+
; SDISEL-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
958+
; SDISEL-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
959+
; SDISEL-NEXT: mov x19, x1
960+
; SDISEL-NEXT: mov x20, x0
961+
; SDISEL-NEXT: stp q2, q3, [sp] ; 32-byte Folded Spill
962+
; SDISEL-NEXT: bl ___lttf2
963+
; SDISEL-NEXT: cmp w0, #0
964+
; SDISEL-NEXT: cset w21, lt
965+
; SDISEL-NEXT: ldp q0, q1, [sp] ; 32-byte Folded Reload
966+
; SDISEL-NEXT: bl ___getf2
967+
; SDISEL-NEXT: cmp w0, #0
968+
; SDISEL-NEXT: cset w8, ge
969+
; SDISEL-NEXT: tst w8, w21
970+
; SDISEL-NEXT: csel w0, w20, w19, ne
971+
; SDISEL-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
972+
; SDISEL-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
973+
; SDISEL-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
974+
; SDISEL-NEXT: add sp, sp, #80
975+
; SDISEL-NEXT: ret
976+
;
977+
; GISEL-LABEL: f128_select_and_olt_oge:
978+
; GISEL: ; %bb.0:
979+
; GISEL-NEXT: sub sp, sp, #80
980+
; GISEL-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
981+
; GISEL-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
982+
; GISEL-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
983+
; GISEL-NEXT: stp q3, q2, [sp] ; 32-byte Folded Spill
984+
; GISEL-NEXT: mov x19, x0
985+
; GISEL-NEXT: mov x20, x1
986+
; GISEL-NEXT: bl ___lttf2
987+
; GISEL-NEXT: mov x21, x0
988+
; GISEL-NEXT: ldp q1, q0, [sp] ; 32-byte Folded Reload
989+
; GISEL-NEXT: bl ___getf2
990+
; GISEL-NEXT: cmp w21, #0
991+
; GISEL-NEXT: ccmp w0, #0, #8, lt
992+
; GISEL-NEXT: csel w0, w19, w20, ge
993+
; GISEL-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
994+
; GISEL-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
995+
; GISEL-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
996+
; GISEL-NEXT: add sp, sp, #80
997+
; GISEL-NEXT: ret
976998
%c0 = fcmp olt fp128 %v0, %v1
977999
%c1 = fcmp oge fp128 %v2, %v3
9781000
%cr = and i1 %c1, %c0

0 commit comments

Comments
 (0)