Skip to content

[LLVM][SVE] Improve code generation for i1 based int_to_fp operations. #129229

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 4, 2025

Conversation

paulwalker-arm
Copy link
Collaborator

Rather than extending the predicate we can use it directly to select between the two possible results.

Rather than extending the predicate we can simply use it to select
between the two possible results.
@llvmbot
Copy link
Member

llvmbot commented Feb 28, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Paul Walker (paulwalker-arm)

Changes

Rather than extending the predicate we can use it directly to select between the two possible results.


Full diff: https://github.com/llvm/llvm-project/pull/129229.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+4-5)
  • (modified) llvm/lib/Target/AArch64/SVEInstrFormats.td (+13)
  • (modified) llvm/test/CodeGen/AArch64/sve-fcvt.ll (+32-48)
  • (modified) llvm/test/CodeGen/AArch64/sve-split-fcvt.ll (+10-12)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7a471662ea075..110a592df2d2d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5034,11 +5034,10 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
 
   if (VT.isScalableVector()) {
     if (InVT.getVectorElementType() == MVT::i1) {
-      // We can't directly extend an SVE predicate; extend it first.
-      unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
-      EVT CastVT = getPromotedVTForPredicate(InVT);
-      In = DAG.getNode(CastOpc, dl, CastVT, In);
-      return DAG.getNode(Opc, dl, VT, In);
+      SDValue FalseVal = DAG.getConstantFP(0.0, dl, VT);
+      SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, dl, VT)
+                                 : DAG.getConstantFP(1.0, dl, VT);
+      return DAG.getNode(ISD::VSELECT, dl, VT, In, TrueVal, FalseVal);
     }
 
     unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 8255b267bd7e9..8d2e7f4a8ed10 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5452,6 +5452,19 @@ multiclass sve_int_dup_fpimm_pred<string asm> {
                   (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, fpimm32:$imm8), 1>;
   def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
                   (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, fpimm64:$imm8), 1>;
+
+  def : Pat<(nxv8f16 (vselect nxv8i1:$pg, (splat_vector fpimm16:$imm8), nxv8f16:$zd)),
+            (!cast<Instruction>(NAME # _H) $zd, $pg, fpimm16:$imm8)>;
+  def : Pat<(nxv4f16 (vselect nxv4i1:$pg, (splat_vector fpimm16:$imm8), nxv4f16:$zd)),
+            (!cast<Instruction>(NAME # _H) $zd, $pg, fpimm16:$imm8)>;
+  def : Pat<(nxv2f16 (vselect nxv2i1:$pg, (splat_vector fpimm16:$imm8), nxv2f16:$zd)),
+            (!cast<Instruction>(NAME # _H) $zd, $pg, fpimm16:$imm8)>;
+  def : Pat<(nxv4f32 (vselect nxv4i1:$pg, (splat_vector fpimm32:$imm8), nxv4f32:$zd)),
+            (!cast<Instruction>(NAME # _S) $zd, $pg, fpimm32:$imm8)>;
+  def : Pat<(nxv2f32 (vselect nxv2i1:$pg, (splat_vector fpimm32:$imm8), nxv2f32:$zd)),
+            (!cast<Instruction>(NAME # _S) $zd, $pg, fpimm32:$imm8)>;
+  def : Pat<(nxv2f64 (vselect nxv2i1:$pg, (splat_vector fpimm64:$imm8), nxv2f64:$zd)),
+            (!cast<Instruction>(NAME # _D) $zd, $pg, fpimm64:$imm8)>;
 }
 
 class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
diff --git a/llvm/test/CodeGen/AArch64/sve-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-fcvt.ll
index fc5128fffad36..a6749984af427 100644
--- a/llvm/test/CodeGen/AArch64/sve-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fcvt.ll
@@ -454,9 +454,8 @@ define <vscale x 2 x i64> @fcvtzu_d_nxv2f64(<vscale x 2 x double> %a) {
 define <vscale x 2 x half> @scvtf_h_nxv2i1(<vscale x 2 x i1> %a) {
 ; CHECK-LABEL: scvtf_h_nxv2i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    scvtf z0.h, p0/m, z0.d
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #-1.00000000
 ; CHECK-NEXT:    ret
   %res = sitofp <vscale x 2 x i1> %a to <vscale x 2 x half>
   ret <vscale x 2 x half> %res
@@ -495,9 +494,8 @@ define <vscale x 2 x half> @scvtf_h_nxv2i64(<vscale x 2 x i64> %a) {
 define <vscale x 3 x half> @scvtf_h_nxv3i1(<vscale x 3 x i1> %a) {
 ; CHECK-LABEL: scvtf_h_nxv3i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #-1.00000000
 ; CHECK-NEXT:    ret
   %res = sitofp <vscale x 3 x i1> %a to <vscale x 3 x half>
   ret <vscale x 3 x half> %res
@@ -516,9 +514,8 @@ define <vscale x 3 x half> @scvtf_h_nxv3i16(<vscale x 3 x i16> %a) {
 define <vscale x 4 x half> @scvtf_h_nxv4i1(<vscale x 4 x i1> %a) {
 ; CHECK-LABEL: scvtf_h_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #-1.00000000
 ; CHECK-NEXT:    ret
   %res = sitofp <vscale x 4 x i1> %a to <vscale x 4 x half>
   ret <vscale x 4 x half> %res
@@ -547,9 +544,8 @@ define <vscale x 4 x half> @scvtf_h_nxv4i32(<vscale x 4 x i32> %a) {
 define <vscale x 7 x half> @scvtf_h_nxv7i1(<vscale x 7 x i1> %a) {
 ; CHECK-LABEL: scvtf_h_nxv7i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #-1.00000000
 ; CHECK-NEXT:    ret
   %res = sitofp <vscale x 7 x i1> %a to <vscale x 7 x half>
   ret <vscale x 7 x half> %res
@@ -568,9 +564,8 @@ define <vscale x 7 x half> @scvtf_h_nxv7i16(<vscale x 7 x i16> %a) {
 define <vscale x 8 x half> @scvtf_h_nxv8i1(<vscale x 8 x i1> %a) {
 ; CHECK-LABEL: scvtf_h_nxv8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #-1.00000000
 ; CHECK-NEXT:    ret
   %res = sitofp <vscale x 8 x i1> %a to <vscale x 8 x half>
   ret <vscale x 8 x half> %res
@@ -589,9 +584,8 @@ define <vscale x 8 x half> @scvtf_h_nxv8i16(<vscale x 8 x i16> %a) {
 define <vscale x 2 x float> @scvtf_s_nxv2i1(<vscale x 2 x i1> %a) {
 ; CHECK-LABEL: scvtf_s_nxv2i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    fmov z0.s, p0/m, #-1.00000000
 ; CHECK-NEXT:    ret
   %res = sitofp <vscale x 2 x i1> %a to <vscale x 2 x float>
   ret <vscale x 2 x float> %res
@@ -620,9 +614,8 @@ define <vscale x 2 x float> @scvtf_s_nxv2i64(<vscale x 2 x i64> %a) {
 define <vscale x 3 x float> @scvtf_s_nxv3i1(<vscale x 3 x i1> %a) {
 ; CHECK-LABEL: scvtf_s_nxv3i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    fmov z0.s, p0/m, #-1.00000000
 ; CHECK-NEXT:    ret
   %res = sitofp <vscale x 3 x i1> %a to <vscale x 3 x float>
   ret <vscale x 3 x float> %res
@@ -641,9 +634,8 @@ define <vscale x 3 x float> @scvtf_s_nxv3i32(<vscale x 3 x i32> %a) {
 define <vscale x 4 x float> @scvtf_s_nxv4i1(<vscale x 4 x i1> %a) {
 ; CHECK-LABEL: scvtf_s_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    fmov z0.s, p0/m, #-1.00000000
 ; CHECK-NEXT:    ret
   %res = sitofp <vscale x 4 x i1> %a to <vscale x 4 x float>
   ret <vscale x 4 x float> %res
@@ -662,9 +654,8 @@ define <vscale x 4 x float> @scvtf_s_nxv4i32(<vscale x 4 x i32> %a) {
 define <vscale x 2 x double> @scvtf_d_nxv2i1(<vscale x 2 x i1> %a) {
 ; CHECK-LABEL: scvtf_d_nxv2i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    mov z0.d, #0 // =0x0
+; CHECK-NEXT:    fmov z0.d, p0/m, #-1.00000000
 ; CHECK-NEXT:    ret
   %res = sitofp <vscale x 2 x i1> %a to <vscale x 2 x double>
   ret <vscale x 2 x double> %res
@@ -695,9 +686,8 @@ define <vscale x 2 x double> @scvtf_d_nxv2i64(<vscale x 2 x i64> %a) {
 define <vscale x 2 x half> @ucvtf_h_nxv2i1(<vscale x 2 x i1> %a) {
 ; CHECK-LABEL: ucvtf_h_nxv2i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.d
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #1.00000000
 ; CHECK-NEXT:    ret
   %res = uitofp <vscale x 2 x i1> %a to <vscale x 2 x half>
   ret <vscale x 2 x half> %res
@@ -736,9 +726,8 @@ define <vscale x 2 x half> @ucvtf_h_nxv2i64(<vscale x 2 x i64> %a) {
 define <vscale x 3 x half> @ucvtf_h_nxv3i1(<vscale x 3 x i1> %a) {
 ; CHECK-LABEL: ucvtf_h_nxv3i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #1.00000000
 ; CHECK-NEXT:    ret
   %res = uitofp <vscale x 3 x i1> %a to <vscale x 3 x half>
   ret <vscale x 3 x half> %res
@@ -767,9 +756,8 @@ define <vscale x 3 x half> @ucvtf_h_nxv3i32(<vscale x 3 x i32> %a) {
 define <vscale x 4 x half> @ucvtf_h_nxv4i1(<vscale x 4 x i1> %a) {
 ; CHECK-LABEL: ucvtf_h_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #1.00000000
 ; CHECK-NEXT:    ret
   %res = uitofp <vscale x 4 x i1> %a to <vscale x 4 x half>
   ret <vscale x 4 x half> %res
@@ -798,9 +786,8 @@ define <vscale x 4 x half> @ucvtf_h_nxv4i32(<vscale x 4 x i32> %a) {
 define <vscale x 8 x half> @ucvtf_h_nxv8i1(<vscale x 8 x i1> %a) {
 ; CHECK-LABEL: ucvtf_h_nxv8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.h, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #1.00000000
 ; CHECK-NEXT:    ret
   %res = uitofp <vscale x 8 x i1> %a to <vscale x 8 x half>
   ret <vscale x 8 x half> %res
@@ -819,9 +806,8 @@ define <vscale x 8 x half> @ucvtf_h_nxv8i16(<vscale x 8 x i16> %a) {
 define <vscale x 2 x float> @ucvtf_s_nxv2i1(<vscale x 2 x i1> %a) {
 ; CHECK-LABEL: ucvtf_s_nxv2i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    fmov z0.s, p0/m, #1.00000000
 ; CHECK-NEXT:    ret
   %res = uitofp <vscale x 2 x i1> %a to <vscale x 2 x float>
   ret <vscale x 2 x float> %res
@@ -850,9 +836,8 @@ define <vscale x 2 x float> @ucvtf_s_nxv2i64(<vscale x 2 x i64> %a) {
 define <vscale x 4 x float> @ucvtf_s_nxv4i1(<vscale x 4 x i1> %a) {
 ; CHECK-LABEL: ucvtf_s_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    fmov z0.s, p0/m, #1.00000000
 ; CHECK-NEXT:    ret
   %res = uitofp <vscale x 4 x i1> %a to <vscale x 4 x float>
   ret <vscale x 4 x float> %res
@@ -871,9 +856,8 @@ define <vscale x 4 x float> @ucvtf_s_nxv4i32(<vscale x 4 x i32> %a) {
 define <vscale x 2 x double> @ucvtf_d_nxv2i1(<vscale x 2 x i1> %a) {
 ; CHECK-LABEL: ucvtf_d_nxv2i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    mov z0.d, #0 // =0x0
+; CHECK-NEXT:    fmov z0.d, p0/m, #1.00000000
 ; CHECK-NEXT:    ret
   %res = uitofp <vscale x 2 x i1> %a to <vscale x 2 x double>
   ret <vscale x 2 x double> %res
diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
index bc015116917d8..5c84551432909 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
@@ -331,13 +331,12 @@ define <vscale x 4 x double> @scvtf_d_nxv4i32(<vscale x 4 x i32> %a) {
 define <vscale x 4 x double> @scvtf_d_nxv4i1(<vscale x 4 x i1> %a) {
 ; CHECK-LABEL: scvtf_d_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    punpklo p1.h, p0.b
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
-; CHECK-NEXT:    mov z0.d, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    scvtf z0.d, p1/m, z0.d
-; CHECK-NEXT:    scvtf z1.d, p1/m, z1.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmov z1.d, p0/m, #-1.00000000
+; CHECK-NEXT:    fmov z0.d, p1/m, #-1.00000000
 ; CHECK-NEXT:    ret
   %res = sitofp <vscale x 4 x i1> %a to <vscale x 4 x double>
   ret <vscale x 4 x double> %res
@@ -393,13 +392,12 @@ define <vscale x 4 x double> @ucvtf_d_nxv4i32(<vscale x 4 x i32> %a) {
 define <vscale x 4 x double> @ucvtf_d_nxv4i1(<vscale x 4 x i1> %a) {
 ; CHECK-LABEL: ucvtf_d_nxv4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    punpklo p1.h, p0.b
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
-; CHECK-NEXT:    mov z0.d, p2/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
-; CHECK-NEXT:    ucvtf z0.d, p1/m, z0.d
-; CHECK-NEXT:    ucvtf z1.d, p1/m, z1.d
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    fmov z1.d, p0/m, #1.00000000
+; CHECK-NEXT:    fmov z0.d, p1/m, #1.00000000
 ; CHECK-NEXT:    ret
   %res = uitofp <vscale x 4 x i1> %a to <vscale x 4 x double>
   ret <vscale x 4 x double> %res

Comment on lines +457 to +458
; CHECK-NEXT: mov z0.h, #0 // =0x0
; CHECK-NEXT: fmov z0.h, p0/m, #-1.00000000
Copy link
Collaborator Author

@paulwalker-arm paulwalker-arm Feb 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can do even better here by removing the explicit zeroing. I would prefer to handle this as a follow up PR because it involves more tablegen and my intent with this PR is to clean up the C++ lowering code to make it easier to add the missing bfloat support.

Copy link
Contributor

@david-arm david-arm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM! It brings tears of joy to my eyes.

@paulwalker-arm paulwalker-arm merged commit 1e1781b into llvm:main Mar 4, 2025
13 checks passed
@paulwalker-arm paulwalker-arm deleted the sve-itof-i1 branch March 4, 2025 11:22
jph-13 pushed a commit to jph-13/llvm-project that referenced this pull request Mar 21, 2025
(llvm#129229)

Rather than extending the predicate we can use it directly to select
between the two possible results.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants