[RISCV] Lower fixed-length mload/mstore for zvfhmin/zvfbfmin #115145
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)

Changes

This is the same idea as #114945. The tests contain a setcc which needs to be promoted, so at v64[b]f16 and above it ends up getting expanded because it can't promote to LMUL 16.

Patch is 363.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115145.diff

4 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index aaa10aaeb22d37..a625e9d5efeb55 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1339,9 +1339,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VECTOR_SHUFFLE, ISD::VECTOR_COMPRESS},
VT, Custom);
- // FIXME: mload, mstore, vp_gather/scatter can be
- // hoisted to here.
- setOperationAction({ISD::LOAD, ISD::STORE, ISD::MGATHER, ISD::MSCATTER},
+ // FIXME: vp_gather/scatter can be hoisted to here.
+ setOperationAction({ISD::LOAD, ISD::STORE, ISD::MLOAD, ISD::MSTORE,
+ ISD::MGATHER, ISD::MSCATTER},
VT, Custom);
setOperationAction({ISD::VP_LOAD, ISD::VP_STORE,
ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
@@ -1409,8 +1409,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::BUILD_VECTOR, ISD::SCALAR_TO_VECTOR}, VT,
Custom);
- setOperationAction({ISD::MLOAD, ISD::MSTORE}, VT, Custom);
-
setOperationAction({ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom);
setOperationAction({ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index eaaa035710facc..4c01c1679cd818 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -239,8 +239,12 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- return TLI->isLegalElementTypeForRVV(ElemType);
-
+ // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
+ return TLI->isLegalElementTypeForRVV(ElemType) ||
+ (DataTypeVT.getVectorElementType() == MVT::bf16 &&
+ ST->hasVInstructionsBF16Minimal()) ||
+ (DataTypeVT.getVectorElementType() == MVT::f16 &&
+ ST->hasVInstructionsF16Minimal());
}
bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
index f1d300b300a646..ede0939a928f51 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
@@ -1,17 +1,51 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh,+zvfbfmin,+zfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32,RV32-ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh,+zvfbfmin,+zfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64,RV64-ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfhmin,+zvfbfmin,+zfhmin,+zfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32,RV32-ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfhmin,+zvfbfmin,+zfhmin,+zfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64,RV64-ZVFHMIN
-define void @masked_load_v1f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v1f16:
+define void @masked_load_v1bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v1bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: fmv.h.x fa5, zero
-; CHECK-NEXT: vmfeq.vf v0, v8, fa5
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v9, fa5
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: vse16.v v8, (a2)
; CHECK-NEXT: ret
+ %m = load <1 x bfloat>, ptr %m_ptr
+ %mask = fcmp oeq <1 x bfloat> %m, zeroinitializer
+ %load = call <1 x bfloat> @llvm.masked.load.v1bf16(ptr %a, i32 8, <1 x i1> %mask, <1 x bfloat> undef)
+ store <1 x bfloat> %load, ptr %res_ptr
+ ret void
+}
+declare <1 x bfloat> @llvm.masked.load.v1bf16(ptr, i32, <1 x i1>, <1 x bfloat>)
+
+define void @masked_load_v1f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v1f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a1)
+; ZVFH-NEXT: fmv.h.x fa5, zero
+; ZVFH-NEXT: vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT: vle16.v v8, (a0), v0.t
+; ZVFH-NEXT: vse16.v v8, (a2)
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: masked_load_v1f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: fmv.w.x fa5, zero
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vf v0, v9, fa5
+; ZVFHMIN-NEXT: vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT: vse16.v v8, (a2)
+; ZVFHMIN-NEXT: ret
%m = load <1 x half>, ptr %m_ptr
%mask = fcmp oeq <1 x half> %m, zeroinitializer
%load = call <1 x half> @llvm.masked.load.v1f16(ptr %a, i32 8, <1 x i1> %mask, <1 x half> undef)
@@ -66,16 +100,48 @@ define void @masked_load_v1f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
}
declare <1 x double> @llvm.masked.load.v1f64(ptr, i32, <1 x i1>, <1 x double>)
-define void @masked_load_v2f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v2f16:
+define void @masked_load_v2bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v2bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: fmv.h.x fa5, zero
-; CHECK-NEXT: vmfeq.vf v0, v8, fa5
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v9, fa5
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: vse16.v v8, (a2)
; CHECK-NEXT: ret
+ %m = load <2 x bfloat>, ptr %m_ptr
+ %mask = fcmp oeq <2 x bfloat> %m, zeroinitializer
+ %load = call <2 x bfloat> @llvm.masked.load.v2bf16(ptr %a, i32 8, <2 x i1> %mask, <2 x bfloat> undef)
+ store <2 x bfloat> %load, ptr %res_ptr
+ ret void
+}
+declare <2 x bfloat> @llvm.masked.load.v2bf16(ptr, i32, <2 x i1>, <2 x bfloat>)
+
+define void @masked_load_v2f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v2f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a1)
+; ZVFH-NEXT: fmv.h.x fa5, zero
+; ZVFH-NEXT: vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT: vle16.v v8, (a0), v0.t
+; ZVFH-NEXT: vse16.v v8, (a2)
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: masked_load_v2f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: fmv.w.x fa5, zero
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vf v0, v9, fa5
+; ZVFHMIN-NEXT: vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT: vse16.v v8, (a2)
+; ZVFHMIN-NEXT: ret
%m = load <2 x half>, ptr %m_ptr
%mask = fcmp oeq <2 x half> %m, zeroinitializer
%load = call <2 x half> @llvm.masked.load.v2f16(ptr %a, i32 8, <2 x i1> %mask, <2 x half> undef)
@@ -130,16 +196,48 @@ define void @masked_load_v2f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
}
declare <2 x double> @llvm.masked.load.v2f64(ptr, i32, <2 x i1>, <2 x double>)
-define void @masked_load_v4f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v4f16:
+define void @masked_load_v4bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v4bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: fmv.h.x fa5, zero
-; CHECK-NEXT: vmfeq.vf v0, v8, fa5
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v9, fa5
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: vse16.v v8, (a2)
; CHECK-NEXT: ret
+ %m = load <4 x bfloat>, ptr %m_ptr
+ %mask = fcmp oeq <4 x bfloat> %m, zeroinitializer
+ %load = call <4 x bfloat> @llvm.masked.load.v4bf16(ptr %a, i32 8, <4 x i1> %mask, <4 x bfloat> undef)
+ store <4 x bfloat> %load, ptr %res_ptr
+ ret void
+}
+declare <4 x bfloat> @llvm.masked.load.v4bf16(ptr, i32, <4 x i1>, <4 x bfloat>)
+
+define void @masked_load_v4f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v4f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a1)
+; ZVFH-NEXT: fmv.h.x fa5, zero
+; ZVFH-NEXT: vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT: vle16.v v8, (a0), v0.t
+; ZVFH-NEXT: vse16.v v8, (a2)
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: masked_load_v4f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: fmv.w.x fa5, zero
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vf v0, v9, fa5
+; ZVFHMIN-NEXT: vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT: vse16.v v8, (a2)
+; ZVFHMIN-NEXT: ret
%m = load <4 x half>, ptr %m_ptr
%mask = fcmp oeq <4 x half> %m, zeroinitializer
%load = call <4 x half> @llvm.masked.load.v4f16(ptr %a, i32 8, <4 x i1> %mask, <4 x half> undef)
@@ -194,16 +292,48 @@ define void @masked_load_v4f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
}
declare <4 x double> @llvm.masked.load.v4f64(ptr, i32, <4 x i1>, <4 x double>)
-define void @masked_load_v8f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v8f16:
+define void @masked_load_v8bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v8bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: fmv.h.x fa5, zero
-; CHECK-NEXT: vmfeq.vf v0, v8, fa5
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v10, fa5
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: vse16.v v8, (a2)
; CHECK-NEXT: ret
+ %m = load <8 x bfloat>, ptr %m_ptr
+ %mask = fcmp oeq <8 x bfloat> %m, zeroinitializer
+ %load = call <8 x bfloat> @llvm.masked.load.v8bf16(ptr %a, i32 8, <8 x i1> %mask, <8 x bfloat> undef)
+ store <8 x bfloat> %load, ptr %res_ptr
+ ret void
+}
+declare <8 x bfloat> @llvm.masked.load.v8bf16(ptr, i32, <8 x i1>, <8 x bfloat>)
+
+define void @masked_load_v8f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v8f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a1)
+; ZVFH-NEXT: fmv.h.x fa5, zero
+; ZVFH-NEXT: vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT: vle16.v v8, (a0), v0.t
+; ZVFH-NEXT: vse16.v v8, (a2)
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: masked_load_v8f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: fmv.w.x fa5, zero
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vf v0, v10, fa5
+; ZVFHMIN-NEXT: vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT: vse16.v v8, (a2)
+; ZVFHMIN-NEXT: ret
%m = load <8 x half>, ptr %m_ptr
%mask = fcmp oeq <8 x half> %m, zeroinitializer
%load = call <8 x half> @llvm.masked.load.v8f16(ptr %a, i32 8, <8 x i1> %mask, <8 x half> undef)
@@ -258,16 +388,48 @@ define void @masked_load_v8f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
}
declare <8 x double> @llvm.masked.load.v8f64(ptr, i32, <8 x i1>, <8 x double>)
-define void @masked_load_v16f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v16f16:
+define void @masked_load_v16bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v16bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: fmv.h.x fa5, zero
-; CHECK-NEXT: vmfeq.vf v0, v8, fa5
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v12, fa5
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: vse16.v v8, (a2)
; CHECK-NEXT: ret
+ %m = load <16 x bfloat>, ptr %m_ptr
+ %mask = fcmp oeq <16 x bfloat> %m, zeroinitializer
+ %load = call <16 x bfloat> @llvm.masked.load.v16bf16(ptr %a, i32 8, <16 x i1> %mask, <16 x bfloat> undef)
+ store <16 x bfloat> %load, ptr %res_ptr
+ ret void
+}
+declare <16 x bfloat> @llvm.masked.load.v16bf16(ptr, i32, <16 x i1>, <16 x bfloat>)
+
+define void @masked_load_v16f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v16f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a1)
+; ZVFH-NEXT: fmv.h.x fa5, zero
+; ZVFH-NEXT: vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT: vle16.v v8, (a0), v0.t
+; ZVFH-NEXT: vse16.v v8, (a2)
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: masked_load_v16f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: fmv.w.x fa5, zero
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vf v0, v12, fa5
+; ZVFHMIN-NEXT: vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT: vse16.v v8, (a2)
+; ZVFHMIN-NEXT: ret
%m = load <16 x half>, ptr %m_ptr
%mask = fcmp oeq <16 x half> %m, zeroinitializer
%load = call <16 x half> @llvm.masked.load.v16f16(ptr %a, i32 8, <16 x i1> %mask, <16 x half> undef)
@@ -322,17 +484,51 @@ define void @masked_load_v16f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
}
declare <16 x double> @llvm.masked.load.v16f64(ptr, i32, <16 x i1>, <16 x double>)
-define void @masked_load_v32f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v32f16:
+define void @masked_load_v32bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v32bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: fmv.h.x fa5, zero
-; CHECK-NEXT: vmfeq.vf v0, v8, fa5
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v16, fa5
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: vse16.v v8, (a2)
; CHECK-NEXT: ret
+ %m = load <32 x bfloat>, ptr %m_ptr
+ %mask = fcmp oeq <32 x bfloat> %m, zeroinitializer
+ %load = call <32 x bfloat> @llvm.masked.load.v32bf16(ptr %a, i32 8, <32 x i1> %mask, <32 x bfloat> undef)
+ store <32 x bfloat> %load, ptr %res_ptr
+ ret void
+}
+declare <32 x bfloat> @llvm.masked.load.v32bf16(ptr, i32, <32 x i1>, <32 x bfloat>)
+
+define void @masked_load_v32f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: li a3, 32
+; ZVFH-NEXT: vsetvli zero, a3, e16, m4, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a1)
+; ZVFH-NEXT: fmv.h.x fa5, zero
+; ZVFH-NEXT: vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT: vle16.v v8, (a0), v0.t
+; ZVFH-NEXT: vse16.v v8, (a2)
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: masked_load_v32f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: li a3, 32
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: fmv.w.x fa5, zero
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vf v0, v16, fa5
+; ZVFHMIN-NEXT: vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT: vse16.v v8, (a2)
+; ZVFHMIN-NEXT: ret
%m = load <32 x half>, ptr %m_ptr
%mask = fcmp oeq <32 x half> %m, zeroinitializer
%load = call <32 x half> @llvm.masked.load.v32f16(ptr %a, i32 8, <32 x i1> %mask, <32 x half> undef)
@@ -404,17 +600,1477 @@ define void @masked_load_v32f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
}
declare <32 x double> @llvm.masked.load.v32f64(ptr, i32, <32 x i1>, <32 x double>)
+define void @masked_load_v64bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; RV32-LABEL: masked_load_v64bf16:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -384
+; RV32-NEXT: sw ra, 380(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 376(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 384
+; RV32-NEXT: andi sp, sp, -128
+; RV32-NEXT: li a3, 64
+; RV32-NEXT: vsetvli zero, a3, e16, m8, ta, ma
+; RV32-NEXT: vle16.v v8, (a1)
+; RV32-NEXT: addi a1, sp, 128
+; RV32-NEXT: vse16.v v8, (a1)
+; RV32-NEXT: lh a1, 192(sp)
+; RV32-NEXT: fmv.h.x fa5, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa5
+; RV32-NEXT: fmv.w.x fa5, zero
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 96(sp)
+; RV32-NEXT: lh a1, 190(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 95(sp)
+; RV32-NEXT: lh a1, 188(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 94(sp)
+; RV32-NEXT: lh a1, 186(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 93(sp)
+; RV32-NEXT: lh a1, 184(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 92(sp)
+; RV32-NEXT: lh a1, 182(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 91(sp)
+; RV32-NEXT: lh a1, 180(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 90(sp)
+; RV32-NEXT: lh a1, 178(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 89(sp)
+; RV32-NEXT: lh a1, 176(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 88(sp)
+; RV32-NEXT: lh a1, 174(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 87(sp)
+; RV32-NEXT: lh a1, 172(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 86(sp)
+; RV32-NEXT: lh a1, 170(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 85(sp)
+; RV32-NEXT: lh a1, 168(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 84(sp)
+; RV32-NEXT: lh a1, 166(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 83(sp)
+; RV32-NEXT: lh a1, 164(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 82(sp)
+; RV32-NEXT: lh a1, 162(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 81(sp)
+; RV32-NEXT: lh a1, 160(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 ...
[truncated]
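For context on the note in the description that the setcc "can't promote to LMUL 16", here is a minimal sketch (not part of the patch; the function name is hypothetical). Assuming the default 128-bit minimum VLEN implied by these RUN lines, a <64 x bfloat> value already fills an LMUL-8 register group at SEW=16, so promoting the compare operands to f32 would need an LMUL-16 group, which the ISA does not provide, and the compare gets scalarized as in the long v64bf16 sequence above.

```llvm
; Sketch only: the kind of compare the tests use to build their mask.
; <64 x bfloat> is 64 x 16 = 1024 bits, i.e. LMUL 8 at VLEN >= 128. Promoting
; the operands to f32 for vmfeq would need 64 x 32 = 2048 bits (LMUL 16),
; which does not exist, so the setcc is expanded element by element.
define <64 x i1> @mask_from_v64bf16(ptr %p) {
  %v = load <64 x bfloat>, ptr %p
  %mask = fcmp oeq <64 x bfloat> %v, zeroinitializer
  ret <64 x i1> %mask
}
```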
LGTM
; RV64-ZVFHMIN-NEXT: ld ra, 504(sp) # 8-byte Folded Reload
; RV64-ZVFHMIN-NEXT: ld s0, 496(sp) # 8-byte Folded Reload
; RV64-ZVFHMIN-NEXT: addi sp, sp, 512
; RV64-ZVFHMIN-NEXT: ret
 %m = load <128 x half>, ptr %m_ptr
This is more of a comment on the test structure than anything else, and definitely non-blocking.
Most of the check lines in these files appear to come from the mask generation, not the actual masked load or store. Maybe you should just load a mask from memory or pass one in as a param instead? Testing the mask generation is covered (presumably) in other test files, and doesn't need to be duplicated here as well.
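For illustration, a rough sketch of what such a simplified test could look like (hypothetical, not taken from the eventual NFC; the function name is made up), passing the mask in as a parameter so the check lines only exercise the masked load lowering:

```llvm
; Sketch only: take the mask as an argument instead of computing it with an
; fcmp, so the output is dominated by the masked vle16.v/vse16.v themselves.
define void @masked_load_v8bf16_argmask(ptr %a, <8 x i1> %mask, ptr %res_ptr) {
  %load = call <8 x bfloat> @llvm.masked.load.v8bf16(ptr %a, i32 8, <8 x i1> %mask, <8 x bfloat> undef)
  store <8 x bfloat> %load, ptr %res_ptr
  ret void
}
declare <8 x bfloat> @llvm.masked.load.v8bf16(ptr, i32, <8 x i1>, <8 x bfloat>)
```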
Yeah I was thinking that too. I think this test was added back in the early days of fixed vector support before there was better rounded test coverage. I'll remove the fcmps in an NFC and rebase.
This is the same idea as llvm#114945.
Force-pushed from d499494 to 3e9ffc6
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/9716
Here is the relevant piece of the build log for reference:
(#115272) This PR enables scalable loop vectorization for f16 with zvfhmin and bf16 with zvfbfmin.

Enabling this was dependent on filling out the gaps for scalable zvfhmin/zvfbfmin codegen, but everything that the loop vectorizer might emit should now be handled. It does this by marking f16 and bf16 as legal in `isLegalElementTypeForRVV`. There are a few users of `isLegalElementTypeForRVV` that have already been enabled in other PRs:

- `isLegalStridedLoadStore` #115264
- `isLegalInterleavedAccessType` #115257
- `isLegalMaskedLoadStore` #115145
- `isLegalMaskedGatherScatter` #114945

The remaining user is `isLegalToVectorizeReduction`. We can't promote f16/bf16 reductions to f32 so we need to disable them for scalable vectors. The cost model actually marks these as invalid, but for out-of-tree reductions `ComputeReductionResult` doesn't get costed and it will end up emitting a reduction intrinsic regardless, so we still need to mark them as illegal. We might be able to remove this restriction later for fmax and fmin reductions.
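On that last point about reductions, a hedged illustration (not from that patch; the function name is made up) of the kind of scalable f16 reduction that stays disabled under zvfhmin:

```llvm
; Sketch only: per the commit message above, f16/bf16 reductions can't be
; promoted to f32, so the vectorizer must not form this for scalable VFs
; when only zvfhmin is available.
define half @fadd_reduce_nxv8f16(half %start, <vscale x 8 x half> %v) {
  %r = call reassoc half @llvm.vector.reduce.fadd.nxv8f16(half %start, <vscale x 8 x half> %v)
  ret half %r
}
declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>)
```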
This is the same idea as #114945.