[LLVM][SVE] Implement isel for bfloat fptoi and itofp operations. #129713


Merged: 1 commit into llvm:main on Mar 19, 2025

Conversation

paulwalker-arm (Collaborator)

NOTE: This PR only considers scalable vectors because SVE VLS does not support bfloat (see useSVEForFixedLengthVectorVT()).

llvmbot (Member) commented Mar 4, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Paul Walker (paulwalker-arm)

Changes

NOTE: This PR only considers scalable vectors because SVE VLS does not support bfloat (see useSVEForFixedLengthVectorVT()).


Patch is 35.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/129713.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+45-34)
  • (modified) llvm/lib/Target/AArch64/SVEInstrFormats.td (+8)
  • (added) llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll (+816)
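
The strategy visible throughout the new tests is to widen bfloat elements to single precision (the lsl z0.s, z0.s, #16 in the CHECK lines) and reuse the existing predicated f32 conversions. The shift works because bfloat is the top half of an f32 bit pattern, so at the IR level the widening is equivalent to an fpext. A minimal hand-written sketch (not part of the patch):

define float @bf16_to_f32_via_shift(bfloat %a) {
  ; reinterpret the bfloat as its raw 16-bit pattern
  %bits = bitcast bfloat %a to i16
  ; move the pattern into the top half of a 32-bit lane
  %ext = zext i16 %bits to i32
  %shl = shl i32 %ext, 16
  ; bit-identical to: fpext bfloat %a to float
  %res = bitcast i32 %shl to float
  ret float %res
}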
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2dca8c0da4756..70b229294b920 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4582,6 +4582,10 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
   bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
 
   if (VT.isScalableVector()) {
+    // Let common code split the operation.
+    if (SrcVT == MVT::nxv8f32)
+      return Op;
+
     if (VT.getScalarType() != MVT::bf16)
       return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
 
@@ -4724,6 +4728,22 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
   assert(!(IsStrict && VT.isScalableVector()) &&
          "Unimplemented SVE support for STRICT_FP_to_INT!");
 
+  // f16 conversions are promoted to f32 when full fp16 is not supported.
+  if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
+      InVT.getVectorElementType() == MVT::bf16) {
+    EVT NewVT = VT.changeElementType(MVT::f32);
+    SDLoc dl(Op);
+    if (IsStrict) {
+      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
+                                {Op.getOperand(0), Op.getOperand(1)});
+      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
+                         {Ext.getValue(1), Ext.getValue(0)});
+    }
+    return DAG.getNode(
+        Op.getOpcode(), dl, Op.getValueType(),
+        DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
+  }
+
   if (VT.isScalableVector()) {
     if (VT.getVectorElementType() == MVT::i1) {
       SDLoc DL(Op);
@@ -4733,6 +4753,10 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
       return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
     }
 
+    // Let common code split the operation.
+    if (InVT == MVT::nxv8f32)
+      return Op;
+
     unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
                           ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
                           : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
@@ -4743,24 +4767,6 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
       useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
     return LowerFixedLengthFPToIntToSVE(Op, DAG);
 
-  unsigned NumElts = InVT.getVectorNumElements();
-
-  // f16 conversions are promoted to f32 when full fp16 is not supported.
-  if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
-      InVT.getVectorElementType() == MVT::bf16) {
-    MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
-    SDLoc dl(Op);
-    if (IsStrict) {
-      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
-                                {Op.getOperand(0), Op.getOperand(1)});
-      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
-                         {Ext.getValue(1), Ext.getValue(0)});
-    }
-    return DAG.getNode(
-        Op.getOpcode(), dl, Op.getValueType(),
-        DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
-  }
-
   uint64_t VTSize = VT.getFixedSizeInBits();
   uint64_t InVTSize = InVT.getFixedSizeInBits();
   if (VTSize < InVTSize) {
@@ -4795,7 +4801,7 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
 
   // Use a scalar operation for conversions between single-element vectors of
   // the same size.
-  if (NumElts == 1) {
+  if (InVT.getVectorNumElements() == 1) {
     SDLoc dl(Op);
     SDValue Extract = DAG.getNode(
         ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
@@ -5041,23 +5047,14 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
   assert(!(IsStrict && VT.isScalableVector()) &&
          "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
 
-  if (VT.isScalableVector()) {
-    if (InVT.getVectorElementType() == MVT::i1) {
-      SDValue FalseVal = DAG.getConstantFP(0.0, dl, VT);
-      SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, dl, VT)
-                                 : DAG.getConstantFP(1.0, dl, VT);
-      return DAG.getNode(ISD::VSELECT, dl, VT, In, TrueVal, FalseVal);
-    }
-
-    unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
-                               : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
-    return LowerToPredicatedOp(Op, DAG, Opcode);
+  // NOTE: i1->bf16 does not require promotion to f32.
+  if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
+    SDValue FalseVal = DAG.getConstantFP(0.0, dl, VT);
+    SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, dl, VT)
+                               : DAG.getConstantFP(1.0, dl, VT);
+    return DAG.getNode(ISD::VSELECT, dl, VT, In, TrueVal, FalseVal);
   }
 
-  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
-      useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
-    return LowerFixedLengthIntToFPToSVE(Op, DAG);
-
   // Promote bf16 conversions to f32.
   if (VT.getVectorElementType() == MVT::bf16) {
     EVT F32 = VT.changeElementType(MVT::f32);
@@ -5074,6 +5071,20 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
                        DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
   }
 
+  if (VT.isScalableVector()) {
+    // Let common code split the operation.
+    if (VT == MVT::nxv8f32)
+      return Op;
+
+    unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
+                               : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
+    return LowerToPredicatedOp(Op, DAG, Opcode);
+  }
+
+  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
+      useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
+    return LowerFixedLengthIntToFPToSVE(Op, DAG);
+
   uint64_t VTSize = VT.getFixedSizeInBits();
   uint64_t InVTSize = InVT.getFixedSizeInBits();
   if (VTSize < InVTSize) {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 8d2e7f4a8ed10..eafaf1717902e 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5465,6 +5465,14 @@ multiclass sve_int_dup_fpimm_pred<string asm> {
             (!cast<Instruction>(NAME # _S) $zd, $pg, fpimm32:$imm8)>;
   def : Pat<(nxv2f64 (vselect nxv2i1:$pg, (splat_vector fpimm64:$imm8), nxv2f64:$zd)),
             (!cast<Instruction>(NAME # _D) $zd, $pg, fpimm64:$imm8)>;
+
+  // Some half precision immediates alias with bfloat (e.g. f16(1.875) == bf16(1.0)).
+  def : Pat<(nxv8bf16 (vselect nxv8i1:$pg, (splat_vector fpimmbf16:$imm8), nxv8bf16:$zd)),
+            (!cast<Instruction>(NAME # _H) $zd, $pg, (fpimm16XForm bf16:$imm8))>;
+  def : Pat<(nxv4bf16 (vselect nxv4i1:$pg, (splat_vector fpimmbf16:$imm8), nxv4bf16:$zd)),
+            (!cast<Instruction>(NAME # _H) $zd, $pg, (fpimm16XForm bf16:$imm8))>;
+  def : Pat<(nxv2bf16 (vselect nxv2i1:$pg, (splat_vector fpimmbf16:$imm8), nxv2bf16:$zd)),
+            (!cast<Instruction>(NAME # _H) $zd, $pg, (fpimm16XForm bf16:$imm8))>;
 }
 
 class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll b/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll
new file mode 100644
index 0000000000000..d6484c2483f49
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll
@@ -0,0 +1,816 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve,+bf16            < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 2 x i1> @fptosi_nxv2bf16_to_nxv2i1(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 2 x bfloat> %a to <vscale x 2 x i1>
+  ret <vscale x 2 x i1> %res
+}
+
+define <vscale x 2 x i8> @fptosi_nxv2bf16_to_nxv2i8(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 2 x bfloat> %a to <vscale x 2 x i8>
+  ret <vscale x 2 x i8> %res
+}
+
+define <vscale x 2 x i16> @fptosi_nxv2bf16_to_nxv2i16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 2 x bfloat> %a to <vscale x 2 x i16>
+  ret <vscale x 2 x i16> %res
+}
+
+define <vscale x 2 x i32> @fptosi_nxv2bf16_to_nxv2i32(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 2 x bfloat> %a to <vscale x 2 x i32>
+  ret <vscale x 2 x i32> %res
+}
+
+define <vscale x 2 x i64> @fptosi_nxv2bf16_to_nxv2i64(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 2 x bfloat> %a to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x i1> @fptosi_nxv4bf16_to_nxv4i1(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 4 x bfloat> %a to <vscale x 4 x i1>
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 4 x i8> @fptosi_nxv4bf16_to_nxv4i8(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 4 x bfloat> %a to <vscale x 4 x i8>
+  ret <vscale x 4 x i8> %res
+}
+
+define <vscale x 4 x i16> @fptosi_nxv4bf16_to_nxv4i16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 4 x bfloat> %a to <vscale x 4 x i16>
+  ret <vscale x 4 x i16> %res
+}
+
+define <vscale x 4 x i32> @fptosi_nxv4bf16_to_nxv4i32(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 4 x bfloat> %a to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i64> @fptosi_nxv4bf16_to_nxv4i64(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z2.s, z0.s, #16
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z1.s
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z2.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 4 x bfloat> %a to <vscale x 4 x i64>
+  ret <vscale x 4 x i64> %res
+}
+
+define <vscale x 8 x i1> @fptosi_nxv8bf16_to_nxv8i1(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 8 x bfloat> %a to <vscale x 8 x i1>
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 8 x i8> @fptosi_nxv8bf16_to_nxv8i8(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 8 x bfloat> %a to <vscale x 8 x i8>
+  ret <vscale x 8 x i8> %res
+}
+
+define <vscale x 8 x i16> @fptosi_nxv8bf16_to_nxv8i16(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 8 x bfloat> %a to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i32> @fptosi_nxv8bf16_to_nxv8i32(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z2.s, z0.s, #16
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z1.s
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 8 x bfloat> %a to <vscale x 8 x i32>
+  ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 8 x i64> @fptosi_nxv8bf16_to_nxv8i64(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z2.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z3.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z2.s, z2.s, #16
+; CHECK-NEXT:    lsl z3.s, z3.s, #16
+; CHECK-NEXT:    lsl z4.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z2.s
+; CHECK-NEXT:    movprfx z2, z3
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z3.s
+; CHECK-NEXT:    movprfx z3, z4
+; CHECK-NEXT:    fcvtzs z3.d, p0/m, z4.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 8 x bfloat> %a to <vscale x 8 x i64>
+  ret <vscale x 8 x i64> %res
+}
+
+define <vscale x 2 x i1> @fptoui_nxv2bf16_to_nxv2i1(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 2 x bfloat> %a to <vscale x 2 x i1>
+  ret <vscale x 2 x i1> %res
+}
+
+; NOTE: Using fcvtzs is safe as fptoui overflow is considered poison and a
+; 64bit signed value encompasses the entire range of a 16bit unsigned value.
+define <vscale x 2 x i8> @fptoui_nxv2bf16_to_nxv2i8(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 2 x bfloat> %a to <vscale x 2 x i8>
+  ret <vscale x 2 x i8> %res
+}
+
+define <vscale x 2 x i16> @fptoui_nxv2bf16_to_nxv2i16(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 2 x bfloat> %a to <vscale x 2 x i16>
+  ret <vscale x 2 x i16> %res
+}
+
+define <vscale x 2 x i32> @fptoui_nxv2bf16_to_nxv2i32(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 2 x bfloat> %a to <vscale x 2 x i32>
+  ret <vscale x 2 x i32> %res
+}
+
+define <vscale x 2 x i64> @fptoui_nxv2bf16_to_nxv2i64(<vscale x 2 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 2 x bfloat> %a to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x i1> @fptoui_nxv4bf16_to_nxv4i1(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 4 x bfloat> %a to <vscale x 4 x i1>
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 4 x i8> @fptoui_nxv4bf16_to_nxv4i8(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 4 x bfloat> %a to <vscale x 4 x i8>
+  ret <vscale x 4 x i8> %res
+}
+
+define <vscale x 4 x i16> @fptoui_nxv4bf16_to_nxv4i16(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 4 x bfloat> %a to <vscale x 4 x i16>
+  ret <vscale x 4 x i16> %res
+}
+
+define <vscale x 4 x i32> @fptoui_nxv4bf16_to_nxv4i32(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 4 x bfloat> %a to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i64> @fptoui_nxv4bf16_to_nxv4i64(<vscale x 4 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z2.s, z0.s, #16
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z1.s
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z2.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 4 x bfloat> %a to <vscale x 4 x i64>
+  ret <vscale x 4 x i64> %res
+}
+
+define <vscale x 8 x i1> @fptoui_nxv8bf16_to_nxv8i1(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv8bf16_to_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 8 x bfloat> %a to <vscale x 8 x i1>
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 8 x i8> @fptoui_nxv8bf16_to_nxv8i8(<vscale x 8 x bfloat> %a) {
+; CHECK-LABEL: fptoui_nxv8bf16_to_nxv8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; ...
[truncated]

@@ -4733,6 +4753,10 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
}

// Let common code split the operation.
Contributor:

I assume the reason you've put this after the i1 case above is because you believe that when converting nxv8bf16 -> nxv8i1 it's better to do:

FP_EXTEND: nxv8bf16 -> nxv8f32
VectorFP_TO_INT: nxv8f32 -> nxv8i32
SETNE: nxv8i32, zero -> nxv8i1

than

FP_EXTEND: nxv8bf16 -> nxv8f32
VectorFP_TO_INT: nxv8f32 -> nxv8i1

Presumably because you think SETNE will do a better job of splitting with an i1 result element type than VectorFP_TO_INT?

paulwalker-arm (Author):

Not really. I only put the bail-out code here because it's the next blob of code that definitely doesn't support MVT::nxv8f32. When I move it before the i1 handling, the output changes as follows:

 ; CHECK-NEXT:    lsl z0.s, z0.s, #16
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    cmpne p1.s, p0/z, z1.s, #0
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    uzp1 p0.h, p0.h, p1.h

Looking at the Neoverse SWOG, the two compares in the new output look like they'll be serialised, so I might have hit the better output by fluke rather than judgement?

@@ -5465,6 +5465,14 @@ multiclass sve_int_dup_fpimm_pred<string asm> {
(!cast<Instruction>(NAME # _S) $zd, $pg, fpimm32:$imm8)>;
def : Pat<(nxv2f64 (vselect nxv2i1:$pg, (splat_vector fpimm64:$imm8), nxv2f64:$zd)),
(!cast<Instruction>(NAME # _D) $zd, $pg, fpimm64:$imm8)>;

// Some half precision immediates alias with bfloat (e.g. f16(1.875) == bf16(1.0)).
Contributor:

This comment implies that some don't, so what happens if fpimmbf16 matches a value that the fp16 variant doesn't have? Or should the comment actually be something like "All fpimmbf16 immediates alias with an FP16 immediate"?

paulwalker-arm (Author):

Not sure I understand. The comment is saying "some" half precision immediates alias with bfloat, and the way this is achieved is by using the fpimmbf16 complex pattern, which only lets the safe ones through.

"All fpimmbf16 immediates alias with an FP16 immediate" is obvious from its use because that's how isel works. I can remove the comment if you feel it offers no value? I only added it in case somebody wondered why we have bfloat patterns for an instruction that doesn't really support bfloat.
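
As a concrete check of the aliasing, the 16-bit pattern 0x3F80 decodes both ways as:

0x3F80 as bf16: 0 | 01111111 | 0000000   -> 2^(127-127) * 1.0   = 1.0
0x3F80 as f16:  0 | 01111 | 1110000000   -> 2^(15-15) * 1.875   = 1.875

so reusing the f16 immediate encoding for that pattern materialises bf16(1.0).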


; NOTE: f16(1.875) == bf16(1.0)
define <vscale x 2 x bfloat> @uitofp_nxv2i1_to_nxv2bf16(<vscale x 2 x i1> %a) {
Contributor:

Is it worth having at least one test for unusual types, such as nxv8i2 -> nxv8bf16? I imagine it should just get promoted and work.

paulwalker-arm (Author):

With the promotion not being specific to bfloat, I figured it was already well tested. I'm happy to add them if you disagree.
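
A test along the lines suggested might look like this (hypothetical, not part of the patch; the exact CHECK lines would depend on how the i2 element type is promoted):

define <vscale x 8 x bfloat> @uitofp_nxv8i2_to_nxv8bf16(<vscale x 8 x i2> %a) {
  %res = uitofp <vscale x 8 x i2> %a to <vscale x 8 x bfloat>
  ret <vscale x 8 x bfloat> %res
}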

david-arm (Contributor) left a comment:

LGTM!

paulwalker-arm merged commit 6c773a8 into llvm:main on Mar 19, 2025 (13 checks passed).
paulwalker-arm deleted the sve-bfloat-int-conversions branch on March 19, 2025 at 11:51.