Skip to content

[AArch64][GlobalISel] Legalize fp128 types as libcalls for G_FCMP #98452

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,9 @@ class LegalizerHelper {
LegalizeResult createResetStateLibcall(MachineIRBuilder &MIRBuilder,
MachineInstr &MI,
LostDebugLocObserver &LocObserver);
/// Lowers the G_FCMP in \p MI to soft-float comparison libcall(s), each
/// followed by an integer compare of the libcall result against zero.
/// Only compares whose operands are both s128 are handled; anything else, or
/// a predicate with no libcall mapping, yields UnableToLegalize.
/// \p MI itself is left in place; the G_FCMP case in libcall() erases it
/// once this returns Legalized.
LegalizeResult createFCMPLibcall(MachineIRBuilder &MIRBuilder,
MachineInstr &MI,
LostDebugLocObserver &LocObserver);

MachineInstrBuilder
getNeutralElementForVecReduce(unsigned Opcode, MachineIRBuilder &MIRBuilder,
Expand Down
154 changes: 152 additions & 2 deletions llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -723,8 +723,7 @@ static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
if (MemType.isVector())
return RTLIB::UNKNOWN_LIBCALL;

#define LCALLS(A, B) \
{ A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
#define LCALL5(A) \
LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
switch (Opc) {
Expand Down Expand Up @@ -980,6 +979,150 @@ LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
LocObserver, nullptr);
}

/// Looks up the f128 comparison libcall that directly implements \p Pred,
/// together with the integer predicate used to test the libcall's result
/// against #0 afterwards.
///
/// Predicates without a direct libcall counterpart map to
/// {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE}.
static std::pair<RTLIB::Libcall, CmpInst::Predicate>
getFCMPLibcallDesc(const CmpInst::Predicate Pred) {
  struct LibcallDesc {
    CmpInst::Predicate FCmpPred; // Floating-point predicate being legalized.
    RTLIB::Libcall Call;         // Runtime routine implementing it.
    CmpInst::Predicate ICmpPred; // Test applied to the routine's result.
  };

  static const LibcallDesc DirectMappings[] = {
      {CmpInst::FCMP_OEQ, RTLIB::OEQ_F128, CmpInst::ICMP_EQ},
      {CmpInst::FCMP_UNE, RTLIB::UNE_F128, CmpInst::ICMP_NE},
      {CmpInst::FCMP_OGE, RTLIB::OGE_F128, CmpInst::ICMP_SGE},
      {CmpInst::FCMP_OLT, RTLIB::OLT_F128, CmpInst::ICMP_SLT},
      {CmpInst::FCMP_OLE, RTLIB::OLE_F128, CmpInst::ICMP_SLE},
      {CmpInst::FCMP_OGT, RTLIB::OGT_F128, CmpInst::ICMP_SGT},
      {CmpInst::FCMP_UNO, RTLIB::UO_F128, CmpInst::ICMP_NE},
  };

  for (const LibcallDesc &Desc : DirectMappings)
    if (Desc.FCmpPred == Pred)
      return {Desc.Call, Desc.ICmpPred};

  // No single libcall implements this predicate.
  return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
}

/// Legalizes a G_FCMP by lowering it to soft-float comparison libcalls.
///
/// Each libcall returns an i32 that is then compared against #0 with an
/// integer predicate to produce the boolean result. Predicates with a direct
/// libcall mapping use a single call; FCMP_UEQ and FCMP_ONE are built from two
/// calls (OEQ and UNO) combined with OR / AND; the remaining unordered
/// predicates and FCMP_ORD are built as the negation of their inverse
/// predicate's libcall.
///
/// \param MIRBuilder  Builder used to emit the replacement instructions.
/// \param MI          The G_FCMP to legalize. It is not erased here.
/// \param LocObserver Debug-location observer forwarded to createLibcall.
/// \returns Legalized on success; UnableToLegalize if the operands are not
///          both s128, the predicate is unsupported, or a libcall could not
///          be created.
LegalizerHelper::LegalizeResult
LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
                                   MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();
  // Cast to GFCmp to get typed access to the compare's operands and predicate
  // (as suggested in review by @tschuett).
  const GFCmp *Cmp = cast<GFCmp>(&MI);

  // Only fp128 compares are handled: both operands must be s128.
  LLT OpLLT = MRI.getType(Cmp->getLHSReg());
  if (OpLLT != LLT::scalar(128) || OpLLT != MRI.getType(Cmp->getRHSReg()))
    return UnableToLegalize;

  Type *OpType = getFloatTypeForLLT(Ctx, OpLLT);

  // DstReg type is s32
  const Register DstReg = Cmp->getReg(0);
  const auto Cond = Cmp->getCond();

  // Reference:
  // https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
  // Generates a libcall followed by ICMP.
  // Returns the register holding the ICMP result, or an invalid (default)
  // Register if the libcall could not be created.
  const auto BuildLibcall =
      [&](const RTLIB::Libcall Libcall, const CmpInst::Predicate ICmpPred,
          const DstOp &Res = LLT::scalar(32)) -> Register {
    // FCMP libcall always returns an i32, and needs an ICMP with #0.
    constexpr LLT TempLLT = LLT::scalar(32);
    Register Temp = MRI.createGenericVirtualRegister(TempLLT);
    // Generate libcall, holding result in Temp
    const auto Status = createLibcall(
        MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
        {{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
        LocObserver, &MI);
    // LegalizeResult is an enum whose failure value is not the zero
    // enumerator, so `!Status` would not detect UnableToLegalize. Compare
    // against Legalized explicitly, as the G_FCMP case in libcall() does.
    if (Status != Legalized)
      return {};

    // Compare temp with #0 to get the final result.
    return MIRBuilder
        .buildICmp(ICmpPred, Res, Temp, MIRBuilder.buildConstant(TempLLT, 0))
        .getReg(0);
  };

  // Simple case if we have a direct mapping from predicate to libcall
  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Cond);
      Libcall != RTLIB::UNKNOWN_LIBCALL &&
      ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
    if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
      return Legalized;
    }
    return UnableToLegalize;
  }

  // No direct mapping found, should be generated as combination of libcalls.

  switch (Cond) {
  case CmpInst::FCMP_UEQ: {
    // FCMP_UEQ: unordered or equal
    // Convert into (FCMP_OEQ || FCMP_UNO).

    const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
    const auto Oeq = BuildLibcall(OeqLibcall, OeqPred);

    const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
    const auto Uno = BuildLibcall(UnoLibcall, UnoPred);
    if (!Oeq || !Uno)
      return UnableToLegalize;

    MIRBuilder.buildOr(DstReg, Oeq, Uno);
    break;
  }
  case CmpInst::FCMP_ONE: {
    // FCMP_ONE: ordered and operands are unequal
    // Convert into (!FCMP_OEQ && !FCMP_UNO).

    // We inverse the predicate instead of generating a NOT
    // to save one instruction.
    // On AArch64 isel can even select two cmp into a single ccmp.
    const auto [OeqLibcall, OeqPred] = getFCMPLibcallDesc(CmpInst::FCMP_OEQ);
    const auto NotOeq =
        BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(OeqPred));

    const auto [UnoLibcall, UnoPred] = getFCMPLibcallDesc(CmpInst::FCMP_UNO);
    const auto NotUno =
        BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(UnoPred));

    if (!NotOeq || !NotUno)
      return UnableToLegalize;

    MIRBuilder.buildAnd(DstReg, NotOeq, NotUno);
    break;
  }
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT:
  case CmpInst::FCMP_ULE:
  case CmpInst::FCMP_ORD: {
    // Convert into: !(inverse(Cond))
    // E.g. FCMP_ULT becomes !FCMP_OGE
    // This is equivalent to building the inverse G_FCMP and wrapping it in a
    // NOT, but saves some instructions: the negation is folded into the
    // integer predicate applied to the libcall result.
    const auto [InversedLibcall, InversedPred] =
        getFCMPLibcallDesc(CmpInst::getInversePredicate(Cond));
    if (!BuildLibcall(InversedLibcall,
                      CmpInst::getInversePredicate(InversedPred), DstReg))
      return UnableToLegalize;
    break;
  }
  default:
    return UnableToLegalize;
  }

  return Legalized;
}

// The function is used to legalize operations that set default environment
// state. In C library a call like `fesetmode(FE_DFL_MODE)` is used for that.
// On most targets supported in glibc FE_DFL_MODE is defined as
Expand Down Expand Up @@ -1120,6 +1263,13 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
return Status;
break;
}
case TargetOpcode::G_FCMP: {
LegalizeResult Status = createFCMPLibcall(MIRBuilder, MI, LocObserver);
if (Status != Legalized)
return Status;
MI.eraseFromParent();
return Status;
}
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI: {
// FIXME: Support other types
Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
})
.widenScalarOrEltToNextPow2(1)
.clampScalar(0, s32, s32)
.clampScalarOrElt(1, MinFPScalar, s64)
.minScalarOrElt(1, MinFPScalar)
.scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
.minScalarEltSameAsIf(
[=](const LegalityQuery &Query) {
const LLT &Ty = Query.Types[0];
Expand All @@ -572,7 +573,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampNumElements(1, v4s16, v8s16)
.clampNumElements(1, v2s32, v4s32)
.clampMaxNumElements(1, s64, 2)
.moreElementsToNextPow2(1);
.moreElementsToNextPow2(1)
.libcallFor({{s32, s128}});

// Extensions
auto ExtLegalFunc = [=](const LegalityQuery &Query) {
Expand Down
70 changes: 46 additions & 24 deletions llvm/test/CodeGen/AArch64/arm64-ccmp.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp | FileCheck %s --check-prefixes=CHECK,SDISEL
; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=CHECK,GISEL
; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel | FileCheck %s --check-prefixes=CHECK,GISEL
target triple = "arm64-apple-ios"

define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
Expand Down Expand Up @@ -950,29 +950,51 @@ define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32
; Also verify that we don't try to generate f128 FCCMPs, using RT calls instead.

define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 %b) #0 {
; CHECK-LABEL: f128_select_and_olt_oge:
; CHECK: ; %bb.0:
; CHECK-NEXT: sub sp, sp, #80
; CHECK-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
; CHECK-NEXT: mov x19, x1
; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: stp q2, q3, [sp] ; 32-byte Folded Spill
; CHECK-NEXT: bl ___lttf2
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: cset w21, lt
; CHECK-NEXT: ldp q0, q1, [sp] ; 32-byte Folded Reload
; CHECK-NEXT: bl ___getf2
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: cset w8, ge
; CHECK-NEXT: tst w8, w21
; CHECK-NEXT: csel w0, w20, w19, ne
; CHECK-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
; CHECK-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
; SDISEL-LABEL: f128_select_and_olt_oge:
; SDISEL: ; %bb.0:
; SDISEL-NEXT: sub sp, sp, #80
; SDISEL-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
; SDISEL-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
; SDISEL-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
; SDISEL-NEXT: mov x19, x1
; SDISEL-NEXT: mov x20, x0
; SDISEL-NEXT: stp q2, q3, [sp] ; 32-byte Folded Spill
; SDISEL-NEXT: bl ___lttf2
; SDISEL-NEXT: cmp w0, #0
; SDISEL-NEXT: cset w21, lt
; SDISEL-NEXT: ldp q0, q1, [sp] ; 32-byte Folded Reload
; SDISEL-NEXT: bl ___getf2
; SDISEL-NEXT: cmp w0, #0
; SDISEL-NEXT: cset w8, ge
; SDISEL-NEXT: tst w8, w21
; SDISEL-NEXT: csel w0, w20, w19, ne
; SDISEL-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
; SDISEL-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
; SDISEL-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
; SDISEL-NEXT: add sp, sp, #80
; SDISEL-NEXT: ret
;
; GISEL-LABEL: f128_select_and_olt_oge:
; GISEL: ; %bb.0:
; GISEL-NEXT: sub sp, sp, #80
; GISEL-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
; GISEL-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
; GISEL-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
; GISEL-NEXT: stp q3, q2, [sp] ; 32-byte Folded Spill
; GISEL-NEXT: mov x19, x0
; GISEL-NEXT: mov x20, x1
; GISEL-NEXT: bl ___lttf2
; GISEL-NEXT: mov x21, x0
; GISEL-NEXT: ldp q1, q0, [sp] ; 32-byte Folded Reload
; GISEL-NEXT: bl ___getf2
; GISEL-NEXT: cmp w21, #0
; GISEL-NEXT: ccmp w0, #0, #8, lt
; GISEL-NEXT: csel w0, w19, w20, ge
; GISEL-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
; GISEL-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
; GISEL-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
; GISEL-NEXT: add sp, sp, #80
; GISEL-NEXT: ret
%c0 = fcmp olt fp128 %v0, %v1
%c1 = fcmp oge fp128 %v2, %v3
%cr = and i1 %c1, %c0
Expand Down
Loading
Loading