Skip to content

[RISCV] SPLAT_VECTOR of bf16 should not require Zvfhmin. #95357

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1105,7 +1105,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.hasStdExtZfbfmin()) {
if (Subtarget.hasVInstructionsF16())
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
else if (Subtarget.hasVInstructionsF16Minimal())
else
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
}
setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT,
Expand Down Expand Up @@ -1343,7 +1343,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.hasStdExtZfbfmin()) {
if (Subtarget.hasVInstructionsF16())
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
else if (Subtarget.hasVInstructionsF16Minimal())
else
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
}
setOperationAction(
Expand Down Expand Up @@ -6739,7 +6739,6 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
!Subtarget.hasVInstructionsF16())) ||
(Op.getValueType().getScalarType() == MVT::bf16 &&
(Subtarget.hasVInstructionsBF16() && Subtarget.hasStdExtZfbfmin() &&
Subtarget.hasVInstructionsF16Minimal() &&
!Subtarget.hasVInstructionsF16()))) {
if (Op.getValueType() == MVT::nxv32f16 ||
Op.getValueType() == MVT::nxv32bf16)
Expand Down
125 changes: 125 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-bf16.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+d,+v,+experimental-zfbfmin,+experimental-zvfbfmin -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=riscv64 -mattr=+d,+v,+experimental-zfbfmin,+experimental-zvfbfmin -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s

define <2 x bfloat> @select_v2bf16(i1 zeroext %c, <2 x bfloat> %a, <2 x bfloat> %b) {
; CHECK-LABEL: select_v2bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.x v10, a0
; CHECK-NEXT: vmsne.vi v0, v10, 0
; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: ret
%v = select i1 %c, <2 x bfloat> %a, <2 x bfloat> %b
ret <2 x bfloat> %v
}

define <2 x bfloat> @selectcc_v2bf16(bfloat %a, bfloat %b, <2 x bfloat> %c, <2 x bfloat> %d) {
; CHECK-LABEL: selectcc_v2bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: fcvt.s.bf16 fa5, fa1
; CHECK-NEXT: fcvt.s.bf16 fa4, fa0
; CHECK-NEXT: feq.s a0, fa4, fa5
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.x v10, a0
; CHECK-NEXT: vmsne.vi v0, v10, 0
; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: ret
%cmp = fcmp oeq bfloat %a, %b
%v = select i1 %cmp, <2 x bfloat> %c, <2 x bfloat> %d
ret <2 x bfloat> %v
}

define <4 x bfloat> @select_v4bf16(i1 zeroext %c, <4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: select_v4bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vmv.v.x v10, a0
; CHECK-NEXT: vmsne.vi v0, v10, 0
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: ret
%v = select i1 %c, <4 x bfloat> %a, <4 x bfloat> %b
ret <4 x bfloat> %v
}

define <4 x bfloat> @selectcc_v4bf16(bfloat %a, bfloat %b, <4 x bfloat> %c, <4 x bfloat> %d) {
; CHECK-LABEL: selectcc_v4bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: fcvt.s.bf16 fa5, fa1
; CHECK-NEXT: fcvt.s.bf16 fa4, fa0
; CHECK-NEXT: feq.s a0, fa4, fa5
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vmv.v.x v10, a0
; CHECK-NEXT: vmsne.vi v0, v10, 0
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: ret
%cmp = fcmp oeq bfloat %a, %b
%v = select i1 %cmp, <4 x bfloat> %c, <4 x bfloat> %d
ret <4 x bfloat> %v
}

define <8 x bfloat> @select_v8bf16(i1 zeroext %c, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: select_v8bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vmv.v.x v10, a0
; CHECK-NEXT: vmsne.vi v0, v10, 0
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: ret
%v = select i1 %c, <8 x bfloat> %a, <8 x bfloat> %b
ret <8 x bfloat> %v
}

define <8 x bfloat> @selectcc_v8bf16(bfloat %a, bfloat %b, <8 x bfloat> %c, <8 x bfloat> %d) {
; CHECK-LABEL: selectcc_v8bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: fcvt.s.bf16 fa5, fa1
; CHECK-NEXT: fcvt.s.bf16 fa4, fa0
; CHECK-NEXT: feq.s a0, fa4, fa5
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vmv.v.x v10, a0
; CHECK-NEXT: vmsne.vi v0, v10, 0
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: ret
%cmp = fcmp oeq bfloat %a, %b
%v = select i1 %cmp, <8 x bfloat> %c, <8 x bfloat> %d
ret <8 x bfloat> %v
}

define <16 x bfloat> @select_v16bf16(i1 zeroext %c, <16 x bfloat> %a, <16 x bfloat> %b) {
; CHECK-LABEL: select_v16bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vmv.v.x v12, a0
; CHECK-NEXT: vmsne.vi v0, v12, 0
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
; CHECK-NEXT: ret
%v = select i1 %c, <16 x bfloat> %a, <16 x bfloat> %b
ret <16 x bfloat> %v
}

define <16 x bfloat> @selectcc_v16bf16(bfloat %a, bfloat %b, <16 x bfloat> %c, <16 x bfloat> %d) {
; CHECK-LABEL: selectcc_v16bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: fcvt.s.bf16 fa5, fa1
; CHECK-NEXT: fcvt.s.bf16 fa4, fa0
; CHECK-NEXT: feq.s a0, fa4, fa5
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vmv.v.x v12, a0
; CHECK-NEXT: vmsne.vi v0, v12, 0
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
; CHECK-NEXT: ret
%cmp = fcmp oeq bfloat %a, %b
%v = select i1 %cmp, <16 x bfloat> %c, <16 x bfloat> %d
ret <16 x bfloat> %v
}
117 changes: 117 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge-bf16.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+d,+v,+m,+experimental-zfbfmin,+experimental-zvfbfmin -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=riscv64 -mattr=+d,+v,+m,+experimental-zfbfmin,+experimental-zvfbfmin -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s

declare <2 x bfloat> @llvm.vp.merge.v2bf16(<2 x i1>, <2 x bfloat>, <2 x bfloat>, i32)

define <2 x bfloat> @vpmerge_vv_v2bf16(<2 x bfloat> %va, <2 x bfloat> %vb, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpmerge_vv_v2bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma
; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%v = call <2 x bfloat> @llvm.vp.merge.v2bf16(<2 x i1> %m, <2 x bfloat> %va, <2 x bfloat> %vb, i32 %evl)
ret <2 x bfloat> %v
}

define <2 x bfloat> @vpmerge_vf_v2bf16(bfloat %a, <2 x bfloat> %vb, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpmerge_vf_v2bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa5
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
Comment on lines +23 to +27
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Random idea to improving the splat of bf16 with zvfbfmin only:

fmv.x.w t0, fa0
vsetvli zero, a0, e16, mf4, tu, mu
vmv.v.x v9, t0, v0.t

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could use fmv.x.h instead of fmv.x.w I think. If bf16 scalar type is legal then we must have Zfbfmin.

; CHECK-NEXT: ret
%elt.head = insertelement <2 x bfloat> poison, bfloat %a, i32 0
%va = shufflevector <2 x bfloat> %elt.head, <2 x bfloat> poison, <2 x i32> zeroinitializer
%v = call <2 x bfloat> @llvm.vp.merge.v2bf16(<2 x i1> %m, <2 x bfloat> %va, <2 x bfloat> %vb, i32 %evl)
ret <2 x bfloat> %v
}

declare <4 x bfloat> @llvm.vp.merge.v4bf16(<4 x i1>, <4 x bfloat>, <4 x bfloat>, i32)

define <4 x bfloat> @vpmerge_vv_v4bf16(<4 x bfloat> %va, <4 x bfloat> %vb, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpmerge_vv_v4bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma
; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%v = call <4 x bfloat> @llvm.vp.merge.v4bf16(<4 x i1> %m, <4 x bfloat> %va, <4 x bfloat> %vb, i32 %evl)
ret <4 x bfloat> %v
}

define <4 x bfloat> @vpmerge_vf_v4bf16(bfloat %a, <4 x bfloat> %vb, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpmerge_vf_v4bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa5
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9, v0.t
; CHECK-NEXT: ret
%elt.head = insertelement <4 x bfloat> poison, bfloat %a, i32 0
%va = shufflevector <4 x bfloat> %elt.head, <4 x bfloat> poison, <4 x i32> zeroinitializer
%v = call <4 x bfloat> @llvm.vp.merge.v4bf16(<4 x i1> %m, <4 x bfloat> %va, <4 x bfloat> %vb, i32 %evl)
ret <4 x bfloat> %v
}

declare <8 x bfloat> @llvm.vp.merge.v8bf16(<8 x i1>, <8 x bfloat>, <8 x bfloat>, i32)

define <8 x bfloat> @vpmerge_vv_v8bf16(<8 x bfloat> %va, <8 x bfloat> %vb, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpmerge_vv_v8bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%v = call <8 x bfloat> @llvm.vp.merge.v8bf16(<8 x i1> %m, <8 x bfloat> %va, <8 x bfloat> %vb, i32 %evl)
ret <8 x bfloat> %v
}

define <8 x bfloat> @vpmerge_vf_v8bf16(bfloat %a, <8 x bfloat> %vb, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpmerge_vf_v8bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vfmv.v.f v10, fa5
; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10, v0.t
; CHECK-NEXT: ret
%elt.head = insertelement <8 x bfloat> poison, bfloat %a, i32 0
%va = shufflevector <8 x bfloat> %elt.head, <8 x bfloat> poison, <8 x i32> zeroinitializer
%v = call <8 x bfloat> @llvm.vp.merge.v8bf16(<8 x i1> %m, <8 x bfloat> %va, <8 x bfloat> %vb, i32 %evl)
ret <8 x bfloat> %v
}

declare <16 x bfloat> @llvm.vp.merge.v16bf16(<16 x i1>, <16 x bfloat>, <16 x bfloat>, i32)

define <16 x bfloat> @vpmerge_vv_v16bf16(<16 x bfloat> %va, <16 x bfloat> %vb, <16 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpmerge_vv_v16bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma
; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0
; CHECK-NEXT: vmv2r.v v8, v10
; CHECK-NEXT: ret
%v = call <16 x bfloat> @llvm.vp.merge.v16bf16(<16 x i1> %m, <16 x bfloat> %va, <16 x bfloat> %vb, i32 %evl)
ret <16 x bfloat> %v
}

define <16 x bfloat> @vpmerge_vf_v16bf16(bfloat %a, <16 x bfloat> %vb, <16 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpmerge_vf_v16bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: fcvt.s.bf16 fa5, fa0
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; CHECK-NEXT: vfmv.v.f v12, fa5
; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu
; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12, v0.t
; CHECK-NEXT: ret
%elt.head = insertelement <16 x bfloat> poison, bfloat %a, i32 0
%va = shufflevector <16 x bfloat> %elt.head, <16 x bfloat> poison, <16 x i32> zeroinitializer
%v = call <16 x bfloat> @llvm.vp.merge.v16bf16(<16 x i1> %m, <16 x bfloat> %va, <16 x bfloat> %vb, i32 %evl)
ret <16 x bfloat> %v
}
53 changes: 53 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp-bf16.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+d,+v,+m,+experimental-zfbfmin,+experimental-zvfbfmin -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=riscv64 -mattr=+d,+v,+m,+experimental-zfbfmin,+experimental-zvfbfmin -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s

declare <2 x bfloat> @llvm.vp.select.v2bf16(<2 x i1>, <2 x bfloat>, <2 x bfloat>, i32)

define <2 x bfloat> @select_v2bf16(<2 x i1> %a, <2 x bfloat> %b, <2 x bfloat> %c, i32 zeroext %evl) {
; CHECK-LABEL: select_v2bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: ret
%v = call <2 x bfloat> @llvm.vp.select.v2bf16(<2 x i1> %a, <2 x bfloat> %b, <2 x bfloat> %c, i32 %evl)
ret <2 x bfloat> %v
}

declare <4 x bfloat> @llvm.vp.select.v4bf16(<4 x i1>, <4 x bfloat>, <4 x bfloat>, i32)

define <4 x bfloat> @select_v4bf16(<4 x i1> %a, <4 x bfloat> %b, <4 x bfloat> %c, i32 zeroext %evl) {
; CHECK-LABEL: select_v4bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: ret
%v = call <4 x bfloat> @llvm.vp.select.v4bf16(<4 x i1> %a, <4 x bfloat> %b, <4 x bfloat> %c, i32 %evl)
ret <4 x bfloat> %v
}

declare <8 x bfloat> @llvm.vp.select.v8bf16(<8 x i1>, <8 x bfloat>, <8 x bfloat>, i32)

define <8 x bfloat> @select_v8bf16(<8 x i1> %a, <8 x bfloat> %b, <8 x bfloat> %c, i32 zeroext %evl) {
; CHECK-LABEL: select_v8bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: ret
%v = call <8 x bfloat> @llvm.vp.select.v8bf16(<8 x i1> %a, <8 x bfloat> %b, <8 x bfloat> %c, i32 %evl)
ret <8 x bfloat> %v
}

declare <16 x bfloat> @llvm.vp.select.v16bf16(<16 x i1>, <16 x bfloat>, <16 x bfloat>, i32)

define <16 x bfloat> @select_v16bf16(<16 x i1> %a, <16 x bfloat> %b, <16 x bfloat> %c, i32 zeroext %evl) {
; CHECK-LABEL: select_v16bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
; CHECK-NEXT: ret
%v = call <16 x bfloat> @llvm.vp.select.v16bf16(<16 x i1> %a, <16 x bfloat> %b, <16 x bfloat> %c, i32 %evl)
ret <16 x bfloat> %v
}
Loading
Loading