[RISCV] Bitcast fixed length bf16/f16 build_vector to i16 with Zvfbfmin/Zvfhmin+Zfbfmin/Zfhmin. #106637
Conversation
…in/Zvfhmin+Zfbfmin/Zfhmin. Previously we only handled build_vectors that could be turned into splat_vectors, and we promoted them to f32 splats by extending in the scalar domain and narrowing in the vector domain. This fixes a crash where we failed to account for whether the f32 vector type fit in LMUL<=8. Because the new lowering occurs after type legalization, we have to be careful to use XLenVT for the scalar integer type and to use custom cast nodes.
@llvm/pr-subscribers-backend-risc-v Author: Craig Topper (topperc)

Changes: Previously we only handled build_vectors that could be turned into splat_vectors, and we promoted them to f32 splats by extending in the scalar domain and narrowing in the vector domain. This patch fixes a crash where we failed to account for whether the f32 vector type fit in LMUL<=8. Because the new lowering occurs after type legalization, we have to be careful to use XLenVT for the scalar integer type and to use custom cast nodes.

Patch is 239.44 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/106637.diff

17 Files Affected:
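For orientation before the diff, here is a minimal sketch of the new lowering path, condensed from `lowerBUILD_VECTOR` in the patch below. The function name `lowerF16BF16BuildVectorSketch` is invented for illustration and the surrounding legality checks are omitted; the authoritative code is in `RISCVISelLowering.cpp`.

```cpp
// Sketch (not the literal patch code): cast each f16/bf16 element to an
// integer, build a vXi16 build_vector, then bitcast the result back.
static SDValue lowerF16BF16BuildVectorSketch(SDValue Op, SelectionDAG &DAG,
                                             const RISCVSubtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();                 // e.g. v4bf16 or v4f16
  MVT EltVT = VT.getVectorElementType();
  MVT IVT = VT.changeVectorElementType(MVT::i16);
  MVT XLenVT = Subtarget.getXLenVT();
  SDLoc DL(Op);

  bool HasScalarMinExt =
      (EltVT == MVT::bf16 && Subtarget.hasStdExtZfbfmin()) ||
      (EltVT == MVT::f16 && Subtarget.hasStdExtZfhmin());

  SmallVector<SDValue, 16> NewOps(Op.getNumOperands());
  for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
    SDValue Elem = Op.getOperand(I);
    if (HasScalarMinExt) {
      // Called from LegalizeDAG: i16 is illegal after type legalization, so
      // cast through XLenVT instead.
      if (auto *C = dyn_cast<ConstantFPSDNode>(Elem))
        NewOps[I] = DAG.getConstant(
            C->getValueAPF().bitcastToAPInt().sext(XLenVT.getSizeInBits()), DL,
            XLenVT);
      else
        NewOps[I] = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, XLenVT, Elem);
    } else {
      // Called from the scalar type legalizer: a plain i16 bitcast is fine.
      NewOps[I] = DAG.getBitcast(MVT::i16, Elem);
    }
  }
  return DAG.getBitcast(VT, DAG.getNode(ISD::BUILD_VECTOR, DL, IVT, NewOps));
}
```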
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 09928dcc1f489a..f11edc34dfbf5a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1255,6 +1255,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (VT.getVectorElementType() == MVT::f16 &&
!Subtarget.hasVInstructionsF16()) {
+ setOperationAction(ISD::BITCAST, VT, Custom);
setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
setOperationAction(
{ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT,
@@ -1264,8 +1265,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
if (Subtarget.hasStdExtZfhmin()) {
- // FIXME: We should prefer BUILD_VECTOR over SPLAT_VECTOR.
- setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
} else {
// We need to custom legalize f16 build vectors if Zfhmin isn't
// available.
@@ -1283,10 +1283,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
if (VT.getVectorElementType() == MVT::bf16) {
+ setOperationAction(ISD::BITCAST, VT, Custom);
setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
if (Subtarget.hasStdExtZfbfmin()) {
- // FIXME: We should prefer BUILD_VECTOR over SPLAT_VECTOR.
- setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
} else {
// We need to custom legalize bf16 build vectors if Zfbfmin isn't
// available.
@@ -3924,26 +3924,46 @@ static SDValue lowerBuildVectorViaPacking(SDValue Op, SelectionDAG &DAG,
DAG.getBuildVector(WideVecVT, DL, NewOperands));
}
-// Convert to an vXf16 build_vector to vXi16 with bitcasts.
-static SDValue lowerBUILD_VECTORvXf16(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
- MVT IVT = VT.changeVectorElementType(MVT::i16);
- SmallVector<SDValue, 16> NewOps(Op.getNumOperands());
- for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
- NewOps[I] = DAG.getBitcast(MVT::i16, Op.getOperand(I));
- SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), IVT, NewOps);
- return DAG.getBitcast(VT, Res);
-}
-
static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
assert(VT.isFixedLengthVector() && "Unexpected vector!");
- // If we don't have scalar f16/bf16, we need to bitcast to an i16 vector.
- if ((VT.getVectorElementType() == MVT::f16 && !Subtarget.hasStdExtZfhmin()) ||
- (VT.getVectorElementType() == MVT::bf16 && !Subtarget.hasStdExtZfbfmin()))
- return lowerBUILD_VECTORvXf16(Op, DAG);
+ MVT EltVT = VT.getVectorElementType();
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ SDLoc DL(Op);
+
+ // Proper support for f16 requires Zvfh. bf16 always requires special
+ // handling. We need to cast the scalar to integer and create an integer
+ // build_vector.
+ if ((EltVT == MVT::f16 && !Subtarget.hasStdExtZvfh()) || EltVT == MVT::bf16) {
+ MVT IVT = VT.changeVectorElementType(MVT::i16);
+ SmallVector<SDValue, 16> NewOps(Op.getNumOperands());
+ for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
+ SDValue Elem = Op.getOperand(I);
+ if ((EltVT == MVT::bf16 && Subtarget.hasStdExtZfbfmin()) ||
+ (EltVT == MVT::f16 && Subtarget.hasStdExtZfhmin())) {
+ // Called by LegalizeDAG, we need to use XLenVT operations since we
+ // can't create illegal types.
+ if (auto *C = dyn_cast<ConstantFPSDNode>(Elem)) {
+ // Manually constant fold.
+ // FIXME: Add a constant fold combine for FMV_X_ANYEXTH.
+ // FIXME: We need a load+FMV_X_ANYEXTH combine too.
+ APInt V =
+ C->getValueAPF().bitcastToAPInt().sext(XLenVT.getSizeInBits());
+ NewOps[I] = DAG.getConstant(V, DL, XLenVT);
+ } else {
+ NewOps[I] = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, XLenVT, Elem);
+ }
+ } else {
+ // Called by scalar type legalizer, we can use i16.
+ NewOps[I] = DAG.getBitcast(MVT::i16, Op.getOperand(I));
+ }
+ }
+ SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, IVT, NewOps);
+ return DAG.getBitcast(VT, Res);
+ }
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
@@ -3951,11 +3971,8 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
- SDLoc DL(Op);
auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
- MVT XLenVT = Subtarget.getXLenVT();
-
if (VT.getVectorElementType() == MVT::i1) {
// A BUILD_VECTOR can be lowered as a SETCC. For each fixed-length mask
// vector type, we have a legal equivalently-sized i8 type, so we can use
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll
new file mode 100644
index 00000000000000..170e71af09b49d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec-bf16.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFBFMIN,RV32-NO-ZFBFMIN
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFBFMIN,RV64-NO-ZFBFMIN
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfbfmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFBFMIN,RV32-ZFBFMIN
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfbfmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFBFMIN,RV64-ZFBFMIN
+
+define <4 x bfloat> @splat_idx_v4bf16(<4 x bfloat> %v, i64 %idx) {
+; RV32-NO-ZFBFMIN-LABEL: splat_idx_v4bf16:
+; RV32-NO-ZFBFMIN: # %bb.0:
+; RV32-NO-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NO-ZFBFMIN-NEXT: vrgather.vx v9, v8, a0
+; RV32-NO-ZFBFMIN-NEXT: vmv1r.v v8, v9
+; RV32-NO-ZFBFMIN-NEXT: ret
+;
+; RV64-NO-ZFBFMIN-LABEL: splat_idx_v4bf16:
+; RV64-NO-ZFBFMIN: # %bb.0:
+; RV64-NO-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NO-ZFBFMIN-NEXT: vrgather.vx v9, v8, a0
+; RV64-NO-ZFBFMIN-NEXT: vmv1r.v v8, v9
+; RV64-NO-ZFBFMIN-NEXT: ret
+;
+; RV32-ZFBFMIN-LABEL: splat_idx_v4bf16:
+; RV32-ZFBFMIN: # %bb.0:
+; RV32-ZFBFMIN-NEXT: addi sp, sp, -48
+; RV32-ZFBFMIN-NEXT: .cfi_def_cfa_offset 48
+; RV32-ZFBFMIN-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
+; RV32-ZFBFMIN-NEXT: .cfi_offset ra, -4
+; RV32-ZFBFMIN-NEXT: csrr a1, vlenb
+; RV32-ZFBFMIN-NEXT: slli a1, a1, 1
+; RV32-ZFBFMIN-NEXT: sub sp, sp, a1
+; RV32-ZFBFMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb
+; RV32-ZFBFMIN-NEXT: addi a1, sp, 32
+; RV32-ZFBFMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-ZFBFMIN-NEXT: andi a0, a0, 3
+; RV32-ZFBFMIN-NEXT: li a1, 2
+; RV32-ZFBFMIN-NEXT: call __mulsi3
+; RV32-ZFBFMIN-NEXT: addi a1, sp, 16
+; RV32-ZFBFMIN-NEXT: add a0, a1, a0
+; RV32-ZFBFMIN-NEXT: addi a2, sp, 32
+; RV32-ZFBFMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-ZFBFMIN-NEXT: vse16.v v8, (a1)
+; RV32-ZFBFMIN-NEXT: flh fa5, 0(a0)
+; RV32-ZFBFMIN-NEXT: fmv.x.h a0, fa5
+; RV32-ZFBFMIN-NEXT: vmv.v.x v8, a0
+; RV32-ZFBFMIN-NEXT: csrr a0, vlenb
+; RV32-ZFBFMIN-NEXT: slli a0, a0, 1
+; RV32-ZFBFMIN-NEXT: add sp, sp, a0
+; RV32-ZFBFMIN-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
+; RV32-ZFBFMIN-NEXT: addi sp, sp, 48
+; RV32-ZFBFMIN-NEXT: ret
+;
+; RV64-ZFBFMIN-LABEL: splat_idx_v4bf16:
+; RV64-ZFBFMIN: # %bb.0:
+; RV64-ZFBFMIN-NEXT: addi sp, sp, -48
+; RV64-ZFBFMIN-NEXT: .cfi_def_cfa_offset 48
+; RV64-ZFBFMIN-NEXT: sd ra, 40(sp) # 8-byte Folded Spill
+; RV64-ZFBFMIN-NEXT: .cfi_offset ra, -8
+; RV64-ZFBFMIN-NEXT: csrr a1, vlenb
+; RV64-ZFBFMIN-NEXT: slli a1, a1, 1
+; RV64-ZFBFMIN-NEXT: sub sp, sp, a1
+; RV64-ZFBFMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb
+; RV64-ZFBFMIN-NEXT: addi a1, sp, 32
+; RV64-ZFBFMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-ZFBFMIN-NEXT: andi a0, a0, 3
+; RV64-ZFBFMIN-NEXT: li a1, 2
+; RV64-ZFBFMIN-NEXT: call __muldi3
+; RV64-ZFBFMIN-NEXT: addi a1, sp, 16
+; RV64-ZFBFMIN-NEXT: add a0, a1, a0
+; RV64-ZFBFMIN-NEXT: addi a2, sp, 32
+; RV64-ZFBFMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-ZFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-ZFBFMIN-NEXT: vse16.v v8, (a1)
+; RV64-ZFBFMIN-NEXT: flh fa5, 0(a0)
+; RV64-ZFBFMIN-NEXT: fmv.x.h a0, fa5
+; RV64-ZFBFMIN-NEXT: vmv.v.x v8, a0
+; RV64-ZFBFMIN-NEXT: csrr a0, vlenb
+; RV64-ZFBFMIN-NEXT: slli a0, a0, 1
+; RV64-ZFBFMIN-NEXT: add sp, sp, a0
+; RV64-ZFBFMIN-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
+; RV64-ZFBFMIN-NEXT: addi sp, sp, 48
+; RV64-ZFBFMIN-NEXT: ret
+ %x = extractelement <4 x bfloat> %v, i64 %idx
+ %ins = insertelement <4 x bfloat> poison, bfloat %x, i32 0
+ %splat = shufflevector <4 x bfloat> %ins, <4 x bfloat> poison, <4 x i32> zeroinitializer
+ ret <4 x bfloat> %splat
+}
+
+define <2 x bfloat> @buildvec_v2bf16(bfloat %a, bfloat %b) {
+; RV32-NO-ZFBFMIN-LABEL: buildvec_v2bf16:
+; RV32-NO-ZFBFMIN: # %bb.0:
+; RV32-NO-ZFBFMIN-NEXT: fmv.x.w a0, fa1
+; RV32-NO-ZFBFMIN-NEXT: fmv.x.w a1, fa0
+; RV32-NO-ZFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NO-ZFBFMIN-NEXT: vmv.v.x v8, a1
+; RV32-NO-ZFBFMIN-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NO-ZFBFMIN-NEXT: ret
+;
+; RV64-NO-ZFBFMIN-LABEL: buildvec_v2bf16:
+; RV64-NO-ZFBFMIN: # %bb.0:
+; RV64-NO-ZFBFMIN-NEXT: fmv.x.w a0, fa1
+; RV64-NO-ZFBFMIN-NEXT: fmv.x.w a1, fa0
+; RV64-NO-ZFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV64-NO-ZFBFMIN-NEXT: vmv.v.x v8, a1
+; RV64-NO-ZFBFMIN-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NO-ZFBFMIN-NEXT: ret
+;
+; RV32-ZFBFMIN-LABEL: buildvec_v2bf16:
+; RV32-ZFBFMIN: # %bb.0:
+; RV32-ZFBFMIN-NEXT: fmv.x.h a0, fa1
+; RV32-ZFBFMIN-NEXT: fmv.x.h a1, fa0
+; RV32-ZFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-ZFBFMIN-NEXT: vmv.v.x v8, a1
+; RV32-ZFBFMIN-NEXT: vslide1down.vx v8, v8, a0
+; RV32-ZFBFMIN-NEXT: ret
+;
+; RV64-ZFBFMIN-LABEL: buildvec_v2bf16:
+; RV64-ZFBFMIN: # %bb.0:
+; RV64-ZFBFMIN-NEXT: fmv.x.h a0, fa1
+; RV64-ZFBFMIN-NEXT: fmv.x.h a1, fa0
+; RV64-ZFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV64-ZFBFMIN-NEXT: vmv.v.x v8, a1
+; RV64-ZFBFMIN-NEXT: vslide1down.vx v8, v8, a0
+; RV64-ZFBFMIN-NEXT: ret
+ %v1 = insertelement <2 x bfloat> poison, bfloat %a, i64 0
+ %v2 = insertelement <2 x bfloat> %v1, bfloat %b, i64 1
+ ret <2 x bfloat> %v2
+}
+
+define <2 x bfloat> @vid_v2bf16() {
+; CHECK-LABEL: vid_v2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, 260096
+; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a0
+; CHECK-NEXT: ret
+ ret <2 x bfloat> <bfloat 0.0, bfloat 1.0>
+}
+
+define <2 x bfloat> @vid_addend1_v2bf16() {
+; CHECK-LABEL: vid_addend1_v2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, 262148
+; CHECK-NEXT: addi a0, a0, -128
+; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a0
+; CHECK-NEXT: ret
+ ret <2 x bfloat> <bfloat 1.0, bfloat 2.0>
+}
+
+define <2 x bfloat> @vid_denominator2_v2bf16() {
+; CHECK-LABEL: vid_denominator2_v2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, 260100
+; CHECK-NEXT: addi a0, a0, -256
+; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a0
+; CHECK-NEXT: ret
+ ret <2 x bfloat> <bfloat 0.5, bfloat 1.0>
+}
+
+define <2 x bfloat> @vid_step2_v2bf16() {
+; CHECK-LABEL: vid_step2_v2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vsll.vi v8, v8, 14
+; CHECK-NEXT: ret
+ ret <2 x bfloat> <bfloat 0.0, bfloat 2.0>
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV32ZVFBFMIN: {{.*}}
+; RV64: {{.*}}
+; RV64ZVFBFMIN: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 26ed4595ca7583..e3aabb5de29c28 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -4,8 +4,10 @@
; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+zba,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFH
; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFH,RV64V
; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+rva22u64 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFH,RVA22U64
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFHMIN
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFHMIN
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFHMIN,RV32-NO-ZFHMIN
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFHMIN,RV64-NO-ZFHMIN
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfhmin,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVFHMIN,RV32-ZFHMIN
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfhmin,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVFHMIN,RV64-ZFHMIN
; Tests that a floating-point build_vector doesn't try and generate a VID
; instruction
@@ -169,12 +171,95 @@ define <4 x half> @splat_c3_v4f16(<4 x half> %v) {
}
define <4 x half> @splat_idx_v4f16(<4 x half> %v, i64 %idx) {
-; CHECK-LABEL: splat_idx_v4f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
+; RV32ZVFH-LABEL: splat_idx_v4f16:
+; RV32ZVFH: # %bb.0:
+; RV32ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV32ZVFH-NEXT: vrgather.vx v9, v8, a0
+; RV32ZVFH-NEXT: vmv1r.v v8, v9
+; RV32ZVFH-NEXT: ret
+;
+; RV64ZVFH-LABEL: splat_idx_v4f16:
+; RV64ZVFH: # %bb.0:
+; RV64ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV64ZVFH-NEXT: vrgather.vx v9, v8, a0
+; RV64ZVFH-NEXT: vmv1r.v v8, v9
+; RV64ZVFH-NEXT: ret
+;
+; RV32-NO-ZFHMIN-LABEL: splat_idx_v4f16:
+; RV32-NO-ZFHMIN: # %bb.0:
+; RV32-NO-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NO-ZFHMIN-NEXT: vrgather.vx v9, v8, a0
+; RV32-NO-ZFHMIN-NEXT: vmv1r.v v8, v9
+; RV32-NO-ZFHMIN-NEXT: ret
+;
+; RV64-NO-ZFHMIN-LABEL: splat_idx_v4f16:
+; RV64-NO-ZFHMIN: # %bb.0:
+; RV64-NO-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NO-ZFHMIN-NEXT: vrgather.vx v9, v8, a0
+; RV64-NO-ZFHMIN-NEXT: vmv1r.v v8, v9
+; RV64-NO-ZFHMIN-NEXT: ret
+;
+; RV32-ZFHMIN-LABEL: splat_idx_v4f16:
+; RV32-ZFHMIN: # %bb.0:
+; RV32-ZFHMIN-NEXT: addi sp, sp, -48
+; RV32-ZFHMIN-NEXT: .cfi_def_cfa_offset 48
+; RV32-ZFHMIN-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
+; RV32-ZFHMIN-NEXT: .cfi_offset ra, -4
+; RV32-ZFHMIN-NEXT: csrr a1, vlenb
+; RV32-ZFHMIN-NEXT: slli a1, a1, 1
+; RV32-ZFHMIN-NEXT: sub sp, sp, a1
+; RV32-ZFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb
+; RV32-ZFHMIN-NEXT: addi a1, sp, 32
+; RV32-ZFHMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-ZFHMIN-NEXT: andi a0, a0, 3
+; RV32-ZFHMIN-NEXT: li a1, 2
+; RV32-ZFHMIN-NEXT: call __mulsi3
+; RV32-ZFHMIN-NEXT: addi a1, sp, 16
+; RV32-ZFHMIN-NEXT: add a0, a1, a0
+; RV32-ZFHMIN-NEXT: addi a2, sp, 32
+; RV32-ZFHMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-ZFHMIN-NEXT: vse16.v v8, (a1)
+; RV32-ZFHMIN-NEXT: flh fa5, 0(a0)
+; RV32-ZFHMIN-NEXT: fmv.x.h a0, fa5
+; RV32-ZFHMIN-NEXT: vmv.v.x v8, a0
+; RV32-ZFHMIN-NEXT: csrr a0, vlenb
+; RV32-ZFHMIN-NEXT: slli a0, a0, 1
+; RV32-ZFHMIN-NEXT: add sp, sp, a0
+; RV32-ZFHMIN-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
+; RV32-ZFHMIN-NEXT: addi sp, sp, 48
+; RV32-ZFHMIN-NEXT: ret
+;
+; RV64-ZFHMIN-LABEL: splat_idx_v4f16:
+; RV64-ZFHMIN: # %bb.0:
+; RV64-ZFHMIN-NEXT: addi sp, sp, -48
+; RV64-ZFHMIN-NEXT: .cfi_def_cfa_offset 48
+; RV64-ZFHMIN-NEXT: sd ra, 40(sp) # 8-byte Folded Spill
+; RV64-ZFHMIN-NEXT: .cfi_offset ra, -8
+; RV64-ZFHMIN-NEXT: csrr a1, vlenb
+; RV64-ZFHMIN-NEXT: slli a1, a1, 1
+; RV64-ZFHMIN-NEXT: sub sp, sp, a1
+; RV64-ZFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb
+; RV64-ZFHMIN-NEXT: addi a1, sp, 32
+; RV64-ZFHMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-ZFHMIN-NEXT: andi a0, a0, 3
+; RV64-ZFHMIN-NEXT: li a1, 2
+; RV64-ZFHMIN-NEXT: call __muldi3
+; RV64-ZFHMIN-NEXT: addi a1, sp, 16
+; RV64-ZFHMIN-NEXT: add a0, a1, a0
+; RV64-ZFHMIN-NEXT: addi a2, sp, 32
+; RV64-ZFHMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-ZFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-ZFHMIN-NEXT: vse16.v v8, (a1)
+; RV64-ZFHMIN-NEXT: flh fa5, 0(a0)
+; RV64-ZFHMIN-NEXT: fmv.x.h a0, fa5
+; RV64-ZFHMIN-NEXT: vmv.v.x v8, a0
+; RV64-ZFHMIN-NEXT: csrr a0, vlenb
+; RV64-ZFHMIN-NEXT: slli a0, a0, 1
+; RV64-ZFHMIN-NEXT: add sp, sp, a0
+; RV64-ZFHMIN-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
+; RV64-ZFHMIN-NEXT: addi sp, sp, 48
+; RV64-ZFHMIN-NEXT: ret
%x = extractelement <4 x half> %v, i64 %idx
%ins = insertelement <4 x half> poison, half %x, i32 0
%splat = shufflevector <4 x half> %ins, <4 x half> poison, <4 x i32> zeroinitializer
@@ -295,23 +380,41 @@ define <2 x half> @buildvec_v2f16(half %a, half %b) {
; RV64ZVFH-NEXT: vfslide1down.vf v8, v8, fa1
; RV64ZVFH-NEXT: ret
;
-; RV32ZVFHMIN-LABEL: buildvec_v2f16:
-; RV32ZVFHMIN: # %bb.0:
-; RV32ZVFHMIN-NEXT: fmv.x.w a0, fa1
-; RV32ZVFHMIN-NEXT: fmv.x.w a1, fa0
-; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; RV32ZVFHMIN-NEXT: vmv.v.x v8, a1
-; RV32ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0
-; RV32ZVFHMIN-NEXT: ret
+; RV32-NO-ZFHMIN-LABEL: buildvec_v2f16:
+; RV32-NO-ZFHMIN: # %bb.0:
+; RV32-NO-ZFHMIN-NEXT: fmv.x.w a0, fa1
+; RV32-NO-ZFHMIN-NEXT: fmv.x.w a1, fa0
+; RV32-NO-ZFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NO-ZFHMIN-NEXT: vmv.v.x v8, a1
+; RV32-NO-ZFHMIN-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NO-ZFHMIN-NEXT: ret
;
-; RV64ZVFHMIN-LABEL: buildvec_v2f16:
-; RV64ZVFHMIN: # %bb.0:
-; RV64ZVFHMIN-NEXT: fmv.x.w a0, fa1
-; RV64ZVFHMIN-NEXT: fmv.x.w a1, fa0
-; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; RV64ZVFHMIN-NEXT: vmv.v.x v8, a1
-; RV64ZVFHMIN-NEXT: vslide1down.vx v8, v8, a0
-; R...
[truncated]
; RV64-NO-ZFHMIN-NEXT: vmv1r.v v8, v9
; RV64-NO-ZFHMIN-NEXT: ret
;
; RV32-ZFHMIN-LABEL: splat_idx_v4f16: |
I think this is caused by not handling extract_vector_elt, which causes it to expand through memory.
@@ -3924,38 +3924,55 @@ static SDValue lowerBuildVectorViaPacking(SDValue Op, SelectionDAG &DAG,
DAG.getBuildVector(WideVecVT, DL, NewOperands));
}

// Convert to an vXf16 build_vector to vXi16 with bitcasts.
static SDValue lowerBUILD_VECTORvXf16(SDValue Op, SelectionDAG &DAG) {
I don't understand the removal of this combined with the review description. We clearly did handle some non-splat cases here?
We only handled them if Zfbfmin/Zfhmin were not enabled. My description might have been unclear.
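To make that concrete, both guards below are excerpted from the diff (nothing new here): the removed check only reached the vXi16 bitcast lowering when the scalar min extensions were missing, whereas the new check routes every bf16 build_vector, and every f16 build_vector without full Zvfh, through the integer build_vector and only uses Zfbfmin/Zfhmin to choose how each element is cast.

```cpp
// Removed guard: bitcast to vXi16 only when scalar f16/bf16 support is absent.
if ((VT.getVectorElementType() == MVT::f16 && !Subtarget.hasStdExtZfhmin()) ||
    (VT.getVectorElementType() == MVT::bf16 && !Subtarget.hasStdExtZfbfmin()))
  return lowerBUILD_VECTORvXf16(Op, DAG);

// New guard: always take the integer build_vector path for bf16, and for f16
// whenever Zvfh is not available; the Zfbfmin/Zfhmin check moves inside.
if ((EltVT == MVT::f16 && !Subtarget.hasStdExtZvfh()) || EltVT == MVT::bf16) {
  // ... per-element cast as in the diff above ...
}
```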
// can't create illegal types.
if (auto *C = dyn_cast<ConstantFPSDNode>(Elem)) {
// Manually constant fold.
// FIXME: Add a constant fold combine for FMV_X_ANYEXTH. |
Maybe just implement the constant folding combine instead?
I tried, but it produces worse code because the fold won't happen before the integer build_vector is lowered. We need LowerBUILD_VECTOR to see it as a vector of constants.
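For reference, a hypothetical version of the combine being discussed might look like the sketch below. It is not part of this patch, the function name is invented, and, per the reply above, it would fire too late to help because lowerBUILD_VECTOR needs to already see a vector of constants.

```cpp
// Hypothetical DAG combine (not in this patch): fold
// (FMV_X_ANYEXTH (ConstantFP C:f16/bf16)) -> sign-extended bit pattern of C.
static SDValue performFMV_X_ANYEXTHCombineSketch(SDNode *N, SelectionDAG &DAG) {
  SDValue Src = N->getOperand(0);
  MVT VT = N->getSimpleValueType(0); // XLenVT
  if (auto *C = dyn_cast<ConstantFPSDNode>(Src)) {
    APInt Bits = C->getValueAPF().bitcastToAPInt().sext(VT.getSizeInBits());
    return DAG.getConstant(Bits, SDLoc(N), VT);
  }
  return SDValue();
}
```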
; RV64-NO-ZFBFMIN-NEXT: vmv1r.v v8, v9
; RV64-NO-ZFBFMIN-NEXT: ret
;
; RV32-ZFBFMIN-LABEL: splat_idx_v4bf16: |
The fact that the version with the extension generates worse code than the one without seems not ideal.
Agreed. Without the scalar FP extension the extract_vector_elt gets type legalized by the scalar type legalizer. With the extension it makes it through the type legalizer and gets expanded by LegalizeDAG.
@preames do you want the extract_vector_elt case fixed before this goes in?
(EltVT == MVT::f16 && Subtarget.hasStdExtZfhmin())) {
// Called by LegalizeDAG, we need to use XLenVT operations since we
// can't create illegal types.
if (auto *C = dyn_cast<ConstantFPSDNode>(Elem)) { |
Just a thought for a possible follow-up. SelectionDAG::getNode eagerly does constant folding these days, but there's no mechanism to fold target nodes. Maybe it's time to add a tryConstantFoldTargetNode hook?
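Sketching that idea out, purely as an illustration: the hook name, signature, and call site below are all assumptions, and no such interface exists in tree today.

```cpp
// Hypothetical TargetLowering hook. SelectionDAG::getNode could invoke it
// whenever a target-specific (e.g. RISCVISD::*) node is created with
// all-constant operands, letting the target fold it eagerly.
class TargetLoweringSketch /* stand-in for llvm::TargetLowering */ {
public:
  virtual ~TargetLoweringSketch() = default;
  virtual SDValue tryConstantFoldTargetNode(unsigned Opcode, EVT VT,
                                            ArrayRef<SDValue> Ops,
                                            SelectionDAG &DAG,
                                            const SDLoc &DL) const {
    return SDValue(); // Default: no target-specific folding.
  }
};
// A RISC-V override could then fold FMV_X_ANYEXTH of a ConstantFP the same
// way lowerBUILD_VECTOR does manually in this patch.
```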
This adds VL patterns for vfwmaccbf16.vv so that we can handle fixed-length vectors. It does this by teaching combineOp_VLToVWOp_VL to emit RISCVISD::VFWMADD_VL for bf16. The change in getOrCreateExtendedOp is needed because getNarrowType is based on the bit width, so it returns f16; we need to explicitly check for bf16. Note that the .vf patterns don't work yet, since the build_vector pattern gets lowered to a vmv.v.x rather than a vfmv.v.f, which SplatFP doesn't pick up; see llvm#106637.
This adds VL patterns for vfwmaccbf16.vv so that we can handle fixed-length vectors. It does this by teaching combineOp_VLToVWOp_VL to emit RISCVISD::VFWMADD_VL for bf16. The change in getOrCreateExtendedOp is needed because getNarrowType is based on the bit width, so it returns f16; we need to explicitly check for bf16. Note that the .vf patterns don't work yet, since the build_vector splat gets lowered to a (vmv_v_x_vl (fmv_x_anyexth x)) instead of a vfmv.v.f, which SplatFP doesn't pick up; see #106637.
Previously, if Zfbfmin/Zfhmin were enabled, we only handled build_vectors that could be turned into splat_vectors. We promoted them to f32 splats by extending in the scalar domain and narrowing in the vector domain.
This patch fixes a crash where we failed to account for whether the f32 vector type fit in LMUL<=8.
Because the new lowering occurs after type legalization, we have to be careful to use XLenVT for the scalar integer type and use custom cast nodes.