[AArch64] optimise SVE cmp intrinsics with no active lanes #104779
@@ -1160,6 +1160,10 @@ static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();

  // Replace by zero constant when all lanes are inactive
  if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
    return II_NA;

  // Check that the predicate is all active
  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
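The body of instCombineSVENoActiveUnaryZero is not shown in this hunk. As a rough illustration only, not the patch's actual implementation, a fold of this kind could look like the sketch below; the helper name is made up, and it assumes the governing predicate is operand 0, as it is for the SVE compare intrinsics handled here.

```cpp
// Hypothetical sketch of an "all lanes inactive" fold; the real helper in
// AArch64TargetTransformInfo.cpp may differ in detail.
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

static std::optional<Instruction *>
foldSVEIntrinsicAllInactive(InstCombiner &IC, IntrinsicInst &II) {
  // Assumes the governing predicate is operand 0 (true for the SVE compare
  // intrinsics this patch handles).
  if (!match(II.getOperand(0), m_ZeroInt()))
    return std::nullopt;
  // No lane is active, so every result lane is false: replace the call with
  // a zero constant of the result type.
  return IC.replaceInstUsesWith(II, ConstantAggregateZero::get(II.getType()));
}
```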
@@ -2131,6 +2135,27 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
  case Intrinsic::aarch64_sve_st4:
  case Intrinsic::aarch64_sve_st4q:
    return instCombineSVENoActiveUnaryErase(IC, II, 4);
  case Intrinsic::aarch64_sve_cmpeq:

Review comment: This is a good point. AAPCS says this:

I think an intrinsic call is a public interface, but I am not sure. @paulwalker-arm could you help clarify this?

Reply: The compare intrinsics (and C builtins) have no side effects, so the fact that some of them may code-generate to instructions that set the condition codes is not relevant. From the ACLE's point of view, the result of a compare builtin should be tested via the …
||
case Intrinsic::aarch64_sve_cmpeq_wide: | ||
case Intrinsic::aarch64_sve_cmpge: | ||
case Intrinsic::aarch64_sve_cmpge_wide: | ||
case Intrinsic::aarch64_sve_cmpgt: | ||
case Intrinsic::aarch64_sve_cmpgt_wide: | ||
case Intrinsic::aarch64_sve_cmphi: | ||
case Intrinsic::aarch64_sve_cmphi_wide: | ||
case Intrinsic::aarch64_sve_cmphs: | ||
case Intrinsic::aarch64_sve_cmphs_wide: | ||
case Intrinsic::aarch64_sve_cmple_wide: | ||
case Intrinsic::aarch64_sve_cmplo_wide: | ||
case Intrinsic::aarch64_sve_cmpls_wide: | ||
case Intrinsic::aarch64_sve_cmplt_wide: | ||
case Intrinsic::aarch64_sve_facge: | ||
case Intrinsic::aarch64_sve_facgt: | ||
case Intrinsic::aarch64_sve_fcmpeq: | ||
case Intrinsic::aarch64_sve_fcmpge: | ||
case Intrinsic::aarch64_sve_fcmpgt: | ||
case Intrinsic::aarch64_sve_fcmpne: | ||
case Intrinsic::aarch64_sve_fcmpuo: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we missing floating point absolute comparisons here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added |
||
case Intrinsic::aarch64_sve_ld1_gather: | ||
case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: | ||
case Intrinsic::aarch64_sve_ld1_gather_sxtw: | ||
|
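The review exchange above comes down to the compare builtins being pure: their only observable effect is the returned predicate, which ACLE code tests explicitly rather than by reading condition codes. A minimal, hypothetical ACLE-level example of that usage pattern follows (the function name is made up; requires arm_sve.h and an SVE-enabled target, e.g. -march=armv8-a+sve):

```cpp
// Hypothetical illustration of how a compare builtin's result is consumed.
#include <arm_sve.h>

bool any_lane_equal(svint8_t a, svint8_t b) {
  svbool_t pg = svptrue_b8();          // all-active governing predicate
  svbool_t eq = svcmpeq_s8(pg, a, b);  // per-lane compare, no side effects
  // The result is tested via the returned predicate, not via condition
  // codes. With an all-false pg, eq would be all-false, so the compare call
  // can be folded to a zero constant, which is what this patch implements.
  return svptest_any(pg, eq);
}
```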
@@ -0,0 +1,245 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=instcombine < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"

define <vscale x 16 x i1> @test_cmpeq(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){

Review comment: Is there any specific reason why the tests are not covering all allowed element types?

Reply: All of the different element types map to the same intrinsic in the end, so if this optimisation works for one type it will work for all of them.

; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpeq(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  ret <vscale x 16 x i1> %0
}
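On the review point above about element-type coverage: every element type resolves to the same intrinsic ID, and only the type mangling of the declaration differs, so the instCombine switch reaches them all through one case label. The small standalone demo below is hypothetical and not part of the test file; it uses the LLVM C++ API names current at the time of this patch and needs a build with the AArch64 target enabled.

```cpp
// Hypothetical demo: two type overloads of llvm.aarch64.sve.cmpeq share one ID.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  auto *NxV16I8 = ScalableVectorType::get(Type::getInt8Ty(Ctx), 16);
  auto *NxV4I32 = ScalableVectorType::get(Type::getInt32Ty(Ctx), 4);
  // The mangled declaration names differ (.nxv16i8 vs .nxv4i32), but both
  // carry the single intrinsic ID that the instCombine switch dispatches on.
  Function *CmpI8 =
      Intrinsic::getDeclaration(&M, Intrinsic::aarch64_sve_cmpeq, {NxV16I8});
  Function *CmpI32 =
      Intrinsic::getDeclaration(&M, Intrinsic::aarch64_sve_cmpeq, {NxV4I32});
  outs() << CmpI8->getName() << " and " << CmpI32->getName()
         << " share one intrinsic ID: "
         << (CmpI8->getIntrinsicID() == CmpI32->getIntrinsicID()) << "\n";
  return 0;
}
```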

define <vscale x 16 x i1> @test_cmpeq_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpeq_wide(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpeq.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmpge(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpge(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmpge_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpge_wide(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmpgt(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpgt(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpgt.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmpgt_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpgt_wide(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpgt.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmphi(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmphi(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmphi.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmphi_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmphi_wide(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmphi.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmphs(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmphs(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmphs.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmphs_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmphs_wide(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmphs.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmple_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmple_wide(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmple.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmplo_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmplo_wide(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmplo.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmpls_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpls_wide(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpls.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmplt_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmplt_wide(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmplt.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmpne(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpne(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 16 x i1> @test_cmpne_wide(<vscale x 16 x i8> %a, <vscale x 2 x i64> %b){
; CHECK-LABEL: define <vscale x 16 x i1> @test_cmpne_wide(
; CHECK-SAME: <vscale x 16 x i8> [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 16 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpne.wide.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
  ret <vscale x 16 x i1> %0
}

define <vscale x 8 x i1> @test_facge(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
; CHECK-LABEL: define <vscale x 8 x i1> @test_facge(
; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.facge.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
  ret <vscale x 8 x i1> %0
}

define <vscale x 8 x i1> @test_facgt(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
; CHECK-LABEL: define <vscale x 8 x i1> @test_facgt(
; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.facgt.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
  ret <vscale x 8 x i1> %0
}

define <vscale x 8 x i1> @test_fcmpeq(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpeq(
; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpeq.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
  ret <vscale x 8 x i1> %0
}

define <vscale x 8 x i1> @test_fcmpge(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpge(
; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpge.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
  ret <vscale x 8 x i1> %0
}

define <vscale x 8 x i1> @test_fcmpgt(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpgt(
; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpgt.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
  ret <vscale x 8 x i1> %0
}

define <vscale x 8 x i1> @test_fcmpne(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpne(
; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpne.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
  ret <vscale x 8 x i1> %0
}

define <vscale x 8 x i1> @test_fcmpuo(<vscale x 8 x half> %a, <vscale x 8 x half> %b){
; CHECK-LABEL: define <vscale x 8 x i1> @test_fcmpuo(
; CHECK-SAME: <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: ret <vscale x 8 x i1> zeroinitializer
;
entry:
  %0 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.fcmpuo.nxv8f16(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
  ret <vscale x 8 x i1> %0
}

Review comment: Not relevant to this PR, but instCombineSVENoActiveUnaryZero seems like a bad name given the context in which the function is now used. Do you mind committing an NFC rename patch to something like instCombineSVENoActiveLanes?

Reply: Okay, I will try to come up with a better naming scheme for this. It cannot be just instCombineSVENoActiveLanes, as there are multiple different things that can happen when all lanes are inactive, but using it as a base can work.