[LLVM][SVE] Improve code generation for i1 based int_to_fp operations. #129229
Conversation
Rather than extending the predicate, we can simply use it to select between the two possible results.
@llvm/pr-subscribers-backend-aarch64
Author: Paul Walker (paulwalker-arm)
Changes: Rather than extending the predicate we can use it directly to select between the two possible results.
Full diff: https://github.com/llvm/llvm-project/pull/129229.diff
4 Files Affected:
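For illustration only (not part of the patch): the lowering below relies on the fact that a 1-bit integer input has exactly two possible values, so the converted result is one of exactly two floating-point constants and a predicated select is sufficient. A minimal scalar C++ sketch of that equivalence, with hypothetical helper names chosen here for clarity, is:

// Illustrative sketch, not LLVM code: scalar model of the i1 -> FP conversion.
#include <cstdio>

// Signed i1: a set bit represents the value -1, so sitofp yields -1.0.
static double sitofp_i1(bool pred) { return pred ? -1.0 : 0.0; }

// Unsigned i1: a set bit represents the value 1, so uitofp yields 1.0.
static double uitofp_i1(bool pred) { return pred ? 1.0 : 0.0; }

int main() {
  std::printf("%f %f\n", sitofp_i1(true), uitofp_i1(true));   // -1.000000 1.000000
  std::printf("%f %f\n", sitofp_i1(false), uitofp_i1(false)); //  0.000000 0.000000
  return 0;
}

Vectorised over an SVE predicate, this becomes a VSELECT between splatted constants, which the new TableGen patterns then match to a predicated FMOV immediate, as the diff below shows.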
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7a471662ea075..110a592df2d2d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5034,11 +5034,10 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
if (VT.isScalableVector()) {
if (InVT.getVectorElementType() == MVT::i1) {
- // We can't directly extend an SVE predicate; extend it first.
- unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- EVT CastVT = getPromotedVTForPredicate(InVT);
- In = DAG.getNode(CastOpc, dl, CastVT, In);
- return DAG.getNode(Opc, dl, VT, In);
+ SDValue FalseVal = DAG.getConstantFP(0.0, dl, VT);
+ SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, dl, VT)
+ : DAG.getConstantFP(1.0, dl, VT);
+ return DAG.getNode(ISD::VSELECT, dl, VT, In, TrueVal, FalseVal);
}
unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 8255b267bd7e9..8d2e7f4a8ed10 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5452,6 +5452,19 @@ multiclass sve_int_dup_fpimm_pred<string asm> {
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, fpimm32:$imm8), 1>;
def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, fpimm64:$imm8), 1>;
+
+ def : Pat<(nxv8f16 (vselect nxv8i1:$pg, (splat_vector fpimm16:$imm8), nxv8f16:$zd)),
+ (!cast<Instruction>(NAME # _H) $zd, $pg, fpimm16:$imm8)>;
+ def : Pat<(nxv4f16 (vselect nxv4i1:$pg, (splat_vector fpimm16:$imm8), nxv4f16:$zd)),
+ (!cast<Instruction>(NAME # _H) $zd, $pg, fpimm16:$imm8)>;
+ def : Pat<(nxv2f16 (vselect nxv2i1:$pg, (splat_vector fpimm16:$imm8), nxv2f16:$zd)),
+ (!cast<Instruction>(NAME # _H) $zd, $pg, fpimm16:$imm8)>;
+ def : Pat<(nxv4f32 (vselect nxv4i1:$pg, (splat_vector fpimm32:$imm8), nxv4f32:$zd)),
+ (!cast<Instruction>(NAME # _S) $zd, $pg, fpimm32:$imm8)>;
+ def : Pat<(nxv2f32 (vselect nxv2i1:$pg, (splat_vector fpimm32:$imm8), nxv2f32:$zd)),
+ (!cast<Instruction>(NAME # _S) $zd, $pg, fpimm32:$imm8)>;
+ def : Pat<(nxv2f64 (vselect nxv2i1:$pg, (splat_vector fpimm64:$imm8), nxv2f64:$zd)),
+ (!cast<Instruction>(NAME # _D) $zd, $pg, fpimm64:$imm8)>;
}
class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
diff --git a/llvm/test/CodeGen/AArch64/sve-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-fcvt.ll
index fc5128fffad36..a6749984af427 100644
--- a/llvm/test/CodeGen/AArch64/sve-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fcvt.ll
@@ -454,9 +454,8 @@ define <vscale x 2 x i64> @fcvtzu_d_nxv2f64(<vscale x 2 x double> %a) {
define <vscale x 2 x half> @scvtf_h_nxv2i1(<vscale x 2 x i1> %a) {
; CHECK-LABEL: scvtf_h_nxv2i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #-1.00000000
; CHECK-NEXT: ret
%res = sitofp <vscale x 2 x i1> %a to <vscale x 2 x half>
ret <vscale x 2 x half> %res
@@ -495,9 +494,8 @@ define <vscale x 2 x half> @scvtf_h_nxv2i64(<vscale x 2 x i64> %a) {
define <vscale x 3 x half> @scvtf_h_nxv3i1(<vscale x 3 x i1> %a) {
; CHECK-LABEL: scvtf_h_nxv3i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #-1.00000000
; CHECK-NEXT: ret
%res = sitofp <vscale x 3 x i1> %a to <vscale x 3 x half>
ret <vscale x 3 x half> %res
@@ -516,9 +514,8 @@ define <vscale x 3 x half> @scvtf_h_nxv3i16(<vscale x 3 x i16> %a) {
define <vscale x 4 x half> @scvtf_h_nxv4i1(<vscale x 4 x i1> %a) {
; CHECK-LABEL: scvtf_h_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #-1.00000000
; CHECK-NEXT: ret
%res = sitofp <vscale x 4 x i1> %a to <vscale x 4 x half>
ret <vscale x 4 x half> %res
@@ -547,9 +544,8 @@ define <vscale x 4 x half> @scvtf_h_nxv4i32(<vscale x 4 x i32> %a) {
define <vscale x 7 x half> @scvtf_h_nxv7i1(<vscale x 7 x i1> %a) {
; CHECK-LABEL: scvtf_h_nxv7i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #-1.00000000
; CHECK-NEXT: ret
%res = sitofp <vscale x 7 x i1> %a to <vscale x 7 x half>
ret <vscale x 7 x half> %res
@@ -568,9 +564,8 @@ define <vscale x 7 x half> @scvtf_h_nxv7i16(<vscale x 7 x i16> %a) {
define <vscale x 8 x half> @scvtf_h_nxv8i1(<vscale x 8 x i1> %a) {
; CHECK-LABEL: scvtf_h_nxv8i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #-1.00000000
; CHECK-NEXT: ret
%res = sitofp <vscale x 8 x i1> %a to <vscale x 8 x half>
ret <vscale x 8 x half> %res
@@ -589,9 +584,8 @@ define <vscale x 8 x half> @scvtf_h_nxv8i16(<vscale x 8 x i16> %a) {
define <vscale x 2 x float> @scvtf_s_nxv2i1(<vscale x 2 x i1> %a) {
; CHECK-LABEL: scvtf_s_nxv2i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: fmov z0.s, p0/m, #-1.00000000
; CHECK-NEXT: ret
%res = sitofp <vscale x 2 x i1> %a to <vscale x 2 x float>
ret <vscale x 2 x float> %res
@@ -620,9 +614,8 @@ define <vscale x 2 x float> @scvtf_s_nxv2i64(<vscale x 2 x i64> %a) {
define <vscale x 3 x float> @scvtf_s_nxv3i1(<vscale x 3 x i1> %a) {
; CHECK-LABEL: scvtf_s_nxv3i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: fmov z0.s, p0/m, #-1.00000000
; CHECK-NEXT: ret
%res = sitofp <vscale x 3 x i1> %a to <vscale x 3 x float>
ret <vscale x 3 x float> %res
@@ -641,9 +634,8 @@ define <vscale x 3 x float> @scvtf_s_nxv3i32(<vscale x 3 x i32> %a) {
define <vscale x 4 x float> @scvtf_s_nxv4i1(<vscale x 4 x i1> %a) {
; CHECK-LABEL: scvtf_s_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: fmov z0.s, p0/m, #-1.00000000
; CHECK-NEXT: ret
%res = sitofp <vscale x 4 x i1> %a to <vscale x 4 x float>
ret <vscale x 4 x float> %res
@@ -662,9 +654,8 @@ define <vscale x 4 x float> @scvtf_s_nxv4i32(<vscale x 4 x i32> %a) {
define <vscale x 2 x double> @scvtf_d_nxv2i1(<vscale x 2 x i1> %a) {
; CHECK-LABEL: scvtf_d_nxv2i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: mov z0.d, #0 // =0x0
+; CHECK-NEXT: fmov z0.d, p0/m, #-1.00000000
; CHECK-NEXT: ret
%res = sitofp <vscale x 2 x i1> %a to <vscale x 2 x double>
ret <vscale x 2 x double> %res
@@ -695,9 +686,8 @@ define <vscale x 2 x double> @scvtf_d_nxv2i64(<vscale x 2 x i64> %a) {
define <vscale x 2 x half> @ucvtf_h_nxv2i1(<vscale x 2 x i1> %a) {
; CHECK-LABEL: ucvtf_h_nxv2i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #1.00000000
; CHECK-NEXT: ret
%res = uitofp <vscale x 2 x i1> %a to <vscale x 2 x half>
ret <vscale x 2 x half> %res
@@ -736,9 +726,8 @@ define <vscale x 2 x half> @ucvtf_h_nxv2i64(<vscale x 2 x i64> %a) {
define <vscale x 3 x half> @ucvtf_h_nxv3i1(<vscale x 3 x i1> %a) {
; CHECK-LABEL: ucvtf_h_nxv3i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #1.00000000
; CHECK-NEXT: ret
%res = uitofp <vscale x 3 x i1> %a to <vscale x 3 x half>
ret <vscale x 3 x half> %res
@@ -767,9 +756,8 @@ define <vscale x 3 x half> @ucvtf_h_nxv3i32(<vscale x 3 x i32> %a) {
define <vscale x 4 x half> @ucvtf_h_nxv4i1(<vscale x 4 x i1> %a) {
; CHECK-LABEL: ucvtf_h_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #1.00000000
; CHECK-NEXT: ret
%res = uitofp <vscale x 4 x i1> %a to <vscale x 4 x half>
ret <vscale x 4 x half> %res
@@ -798,9 +786,8 @@ define <vscale x 4 x half> @ucvtf_h_nxv4i32(<vscale x 4 x i32> %a) {
define <vscale x 8 x half> @ucvtf_h_nxv8i1(<vscale x 8 x i1> %a) {
; CHECK-LABEL: ucvtf_h_nxv8i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: fmov z0.h, p0/m, #1.00000000
; CHECK-NEXT: ret
%res = uitofp <vscale x 8 x i1> %a to <vscale x 8 x half>
ret <vscale x 8 x half> %res
@@ -819,9 +806,8 @@ define <vscale x 8 x half> @ucvtf_h_nxv8i16(<vscale x 8 x i16> %a) {
define <vscale x 2 x float> @ucvtf_s_nxv2i1(<vscale x 2 x i1> %a) {
; CHECK-LABEL: ucvtf_s_nxv2i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: fmov z0.s, p0/m, #1.00000000
; CHECK-NEXT: ret
%res = uitofp <vscale x 2 x i1> %a to <vscale x 2 x float>
ret <vscale x 2 x float> %res
@@ -850,9 +836,8 @@ define <vscale x 2 x float> @ucvtf_s_nxv2i64(<vscale x 2 x i64> %a) {
define <vscale x 4 x float> @ucvtf_s_nxv4i1(<vscale x 4 x i1> %a) {
; CHECK-LABEL: ucvtf_s_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: fmov z0.s, p0/m, #1.00000000
; CHECK-NEXT: ret
%res = uitofp <vscale x 4 x i1> %a to <vscale x 4 x float>
ret <vscale x 4 x float> %res
@@ -871,9 +856,8 @@ define <vscale x 4 x float> @ucvtf_s_nxv4i32(<vscale x 4 x i32> %a) {
define <vscale x 2 x double> @ucvtf_d_nxv2i1(<vscale x 2 x i1> %a) {
; CHECK-LABEL: ucvtf_d_nxv2i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: mov z0.d, #0 // =0x0
+; CHECK-NEXT: fmov z0.d, p0/m, #1.00000000
; CHECK-NEXT: ret
%res = uitofp <vscale x 2 x i1> %a to <vscale x 2 x double>
ret <vscale x 2 x double> %res
diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
index bc015116917d8..5c84551432909 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
@@ -331,13 +331,12 @@ define <vscale x 4 x double> @scvtf_d_nxv4i32(<vscale x 4 x i32> %a) {
define <vscale x 4 x double> @scvtf_d_nxv4i1(<vscale x 4 x i1> %a) {
; CHECK-LABEL: scvtf_d_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: mov z1.d, #0 // =0x0
+; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
-; CHECK-NEXT: mov z0.d, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: scvtf z0.d, p1/m, z0.d
-; CHECK-NEXT: scvtf z1.d, p1/m, z1.d
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: fmov z1.d, p0/m, #-1.00000000
+; CHECK-NEXT: fmov z0.d, p1/m, #-1.00000000
; CHECK-NEXT: ret
%res = sitofp <vscale x 4 x i1> %a to <vscale x 4 x double>
ret <vscale x 4 x double> %res
@@ -393,13 +392,12 @@ define <vscale x 4 x double> @ucvtf_d_nxv4i32(<vscale x 4 x i32> %a) {
define <vscale x 4 x double> @ucvtf_d_nxv4i1(<vscale x 4 x i1> %a) {
; CHECK-LABEL: ucvtf_d_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: mov z1.d, #0 // =0x0
+; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
-; CHECK-NEXT: mov z0.d, p2/z, #1 // =0x1
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1
-; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d
-; CHECK-NEXT: ucvtf z1.d, p1/m, z1.d
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: fmov z1.d, p0/m, #1.00000000
+; CHECK-NEXT: fmov z0.d, p1/m, #1.00000000
; CHECK-NEXT: ret
%res = uitofp <vscale x 4 x i1> %a to <vscale x 4 x double>
ret <vscale x 4 x double> %res
; CHECK-NEXT: mov z0.h, #0 // =0x0
; CHECK-NEXT: fmov z0.h, p0/m, #-1.00000000
We can do even better here by removing the explicit zeroing. I would prefer to handle this as a follow-up PR because it involves more TableGen, and my intent with this PR is to clean up the C++ lowering code to make it easier to add the missing bfloat support.
LGTM! It brings tears of joy to my eyes.