-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AArch64][GlobalISel] Legalize 128-bit types for FABS #104753
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64 Author: Him188 (Him188) ChangesGenerate This approach is different than what SDAG is doing. SDAG stores the value onto stack, clears the sign bit in the most significant byte, and loads the value back into register. This involves multiple memory ops and sounds slower. Full diff: https://github.com/llvm/llvm-project/pull/104753.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index d42d5511a82422..cae0ce1f4567f2 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -241,9 +241,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.widenScalarToNextPow2(0);
getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FNEG,
- G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM,
- G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR,
- G_FRINT, G_FNEARBYINT, G_INTRINSIC_TRUNC,
+ G_FSQRT, G_FMAXNUM, G_FMINNUM, G_FMAXIMUM,
+ G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT,
+ G_FNEARBYINT, G_INTRINSIC_TRUNC,
G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
.legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
.legalIf([=](const LegalityQuery &Query) {
@@ -257,6 +257,20 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampNumElements(0, v2s64, v2s64)
.moreElementsToNextPow2(0);
+ getActionDefinitionsBuilder(G_FABS)
+ .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
+ .legalIf([=](const LegalityQuery &Query) {
+ const auto &Ty = Query.Types[0];
+ return (Ty == v8s16 || Ty == v4s16) && HasFP16;
+ })
+ .customFor({s128})
+ .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
+ .minScalarOrElt(0, MinFPScalar)
+ .clampNumElements(0, v4s16, v8s16)
+ .clampNumElements(0, v2s32, v4s32)
+ .clampNumElements(0, v2s64, v2s64)
+ .moreElementsToNextPow2(0);
+
getActionDefinitionsBuilder(G_FREM)
.libcallFor({s32, s64})
.minScalar(0, s32)
@@ -1336,6 +1350,8 @@ bool AArch64LegalizerInfo::legalizeCustom(
return legalizePrefetch(MI, Helper);
case TargetOpcode::G_ABS:
return Helper.lowerAbsToCNeg(MI);
+ case TargetOpcode::G_FABS:
+ return legalizeFABS(MI, MRI, MIRBuilder);
case TargetOpcode::G_ICMP:
return legalizeICMP(MI, MRI, MIRBuilder);
}
@@ -1396,6 +1412,25 @@ bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
return true;
}
+bool AArch64LegalizerInfo::legalizeFABS(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+
+ constexpr LLT S128 = LLT::scalar(128);
+ if (MRI.getType(SrcReg) != S128 || MRI.getType(DstReg) != S128)
+ return false;
+
+ MIRBuilder.buildAnd(
+ DstReg, SrcReg,
+ MIRBuilder.buildConstant(
+ S128, APInt::getSignedMaxValue(128)));
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 00d85a36e4b2ca..8bf642d1745aa9 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -50,6 +50,8 @@ class AArch64LegalizerInfo : public LegalizerInfo {
LegalizerHelper &Helper) const;
bool legalizeRotate(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;
+ bool legalizeFABS(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
bool legalizeICMP(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const;
bool legalizeFunnelShift(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/test/CodeGen/AArch64/fabs-fp128.ll b/llvm/test/CodeGen/AArch64/fabs-fp128.ll
new file mode 100644
index 00000000000000..131af5e0a3281e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fabs-fp128.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define fp128 @fabs_f128(fp128 %a) {
+; CHECK-SD-LABEL: fabs_f128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: str q0, [sp, #-16]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: ldrb w8, [sp, #15]
+; CHECK-SD-NEXT: and w8, w8, #0x7f
+; CHECK-SD-NEXT: strb w8, [sp, #15]
+; CHECK-SD-NEXT: ldr q0, [sp], #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fabs_f128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov x8, v0.d[1]
+; CHECK-GI-NEXT: mov v0.d[0], v0.d[0]
+; CHECK-GI-NEXT: and x8, x8, #0x7fffffffffffffff
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: ret
+entry:
+ %c = call fp128 @llvm.fabs.f128(fp128 %a)
+ ret fp128 %c
+}
+
+define <1 x fp128> @fabs_v1f128(<1 x fp128> %a) {
+; CHECK-SD-LABEL: fabs_v1f128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: str q0, [sp, #-16]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: ldrb w8, [sp, #15]
+; CHECK-SD-NEXT: and w8, w8, #0x7f
+; CHECK-SD-NEXT: strb w8, [sp, #15]
+; CHECK-SD-NEXT: ldr q0, [sp], #16
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fabs_v1f128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov x8, v0.d[1]
+; CHECK-GI-NEXT: mov v0.d[0], v0.d[0]
+; CHECK-GI-NEXT: and x8, x8, #0x7fffffffffffffff
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: ret
+entry:
+ %c = call <1 x fp128> @llvm.fabs.v1f128(<1 x fp128> %a)
+ ret <1 x fp128> %c
+}
+
+define <2 x fp128> @fabs_v2f128(<2 x fp128> %a) {
+; CHECK-SD-LABEL: fabs_v2f128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT: ldrb w8, [sp, #15]
+; CHECK-SD-NEXT: and w8, w8, #0x7f
+; CHECK-SD-NEXT: strb w8, [sp, #15]
+; CHECK-SD-NEXT: ldrb w8, [sp, #31]
+; CHECK-SD-NEXT: and w8, w8, #0x7f
+; CHECK-SD-NEXT: strb w8, [sp, #31]
+; CHECK-SD-NEXT: ldp q0, q1, [sp], #32
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fabs_v2f128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov x8, v0.d[1]
+; CHECK-GI-NEXT: mov x9, v1.d[1]
+; CHECK-GI-NEXT: mov v0.d[0], v0.d[0]
+; CHECK-GI-NEXT: mov v1.d[0], v1.d[0]
+; CHECK-GI-NEXT: and x8, x8, #0x7fffffffffffffff
+; CHECK-GI-NEXT: and x9, x9, #0x7fffffffffffffff
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: mov v1.d[1], x9
+; CHECK-GI-NEXT: ret
+entry:
+ %c = call <2 x fp128> @llvm.fabs.v2f128(<2 x fp128> %a)
+ ret <2 x fp128> %c
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
const auto &Ty = Query.Types[0]; | ||
return (Ty == v8s16 || Ty == v4s16) && HasFP16; | ||
}) | ||
.customFor({s128}) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it possible to make it lowerIf and have the generic LegalizerHelper::lower handle the expansion into an AND? It should be very similar code I hope (maybe generalized to more types), but could be useful to other targets / fp types too. FP16 without +fullfp16 might be able to use it, for example.
MIRBuilder.buildAnd( | ||
DstReg, SrcReg, | ||
MIRBuilder.buildConstant( | ||
S128, APInt::getSignedMaxValue(128))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should already be the default lower action. This shouldn't require custom lowering
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I failed to find such a handling. lowerFor({128})
also failed without adding a lower action. So I think there isn't a default lower action for G_FABS.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So you should put this in the generic LegalizerHelper. As a rule, if your custom lowering only emits generic instructions, it should not be in a specific target
if (MRI.getType(SrcReg) != Ty) | ||
return UnableToLegalize; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove this check, the IR is malformed if this happens
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed
if (!Ty.isScalar()) | ||
return UnableToLegalize; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't need this, this code should handle vectors just fine
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lgtm with nit
MIRBuilder.buildConstant( | ||
Ty, APInt::getSignedMaxValue(Ty.getSizeInBits()))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not 100% sure this works for ppc_fp128 but we don't support that yet anyway
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ty.getScalarSizeInBits()
to support vectors? And vector support is not testet.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@tschuett How do I test vectors? Is there a way to unit-test the helper or do I need to find a target which does use the helper for vectors (AArch64 does not).
But if we remove the scalarizeIf
in AArch64 then AArch64 will use the helper and hence this can be tested
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Correct.
getActionDefinitionsBuilder(G_FABS) | ||
.legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64}) | ||
.legalIf([=](const LegalityQuery &Query) { | ||
const auto &Ty = Query.Types[0]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no auto, and no reference. this is just LLT. You can also express this as a conditional legalFor, or swap out the legalFor list based on the HasFP16 feature
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was actually copied from above
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but that could also be better. These predicates can be called many times so they ideally should be as simple as possible to get to the legal case
return (Ty == v8s16 || Ty == v4s16) && HasFP16; | ||
}) | ||
.lowerFor({s128}) | ||
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you need the scalarizeIf
when lowerFAbs
supports vectors?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
G_AND does not support s128 so we still need this. Left a TODO.
LLVM ERROR: unable to legalize instruction: %3:_(<2 x s128>) = G_AND %0:_, %7:_ (in function: fabs_v2f128)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I happened to be adding i128 support for add/sub recently, and and/or/xor was caught up in them. There might not be a lot of benefit at the moment choosing one method vs the other as we will just scalarize in either case. That could be improved in the future if we change how they lower.
; CHECK-GI-NEXT: mov v1.d[1], x9 | ||
; CHECK-GI-NEXT: ret | ||
entry: | ||
%c = call <2 x fp128> @llvm.fabs.v2f128(<2 x fp128> %a) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
While at it <4 x fp128>
would be nice (low priority).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One moment later, non-power of two would be even nicer, e.g., <3 x fp128>
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since we scalarize vectors so I assumed the tests are not necessary, but yes let me add them to be safer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we do not scalarize in the G_FABS legalizer, some other component will do the scalarization for us.
// A legality predicate that returns true if the subtarget has FP16 support. | ||
// To be used in combination with other predicates, e.g: | ||
// .legalIf(all(hasFP16(), typeInSet(0, {v8s16, v4s16}))) | ||
const auto hasFP16 = [=]() -> LegalityPredicate { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This feels like it has more layers than it needs.
But it might be worth keeping as it was before. I feel like we will have a lot of conditions like these, and we should have something like .legalFor(hasFP16, {v4f16, v8f16})
return (Ty == v8s16 || Ty == v4s16) && HasFP16; | ||
}) | ||
.lowerFor({s128}) | ||
.scalarizeIf(scalarOrEltWiderThan(0, 64), 0) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I happened to be adding i128 support for add/sub recently, and and/or/xor was caught up in them. There might not be a lot of benefit at the moment choosing one method vs the other as we will just scalarize in either case. That could be improved in the future if we change how they lower.
; RUN: llc -mtriple=aarch64-unknown-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD | ||
; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't need -verify-machineinstrs. Also should use -global-isel=0 if you're explicitly trying to handle both selectors
- Generate AND to clear sign bit for s128 - Vectors are scalarized
- Support vectors in lowerFAbs - Simplify legality predicates
f2cff01
to
dd106e1
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. LGTM
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/2230 Here is the relevant piece of the build log for the reference
|
This patch adds a common lower action for
G_FABS
, which generatesand x8, x8, #0x7fffffffffffffff
to reset the sign bit. The action does not support vectors sinceG_AND
does not support fp128.This approach is different than what SDAG is doing. SDAG stores the value onto stack, clears the sign bit in the most significant byte, and loads the value back into register. This involves multiple memory ops and sounds slower.