[RISCV] Enable scalable loop vectorization for zvfhmin/zvfbfmin #115272
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)

Changes

This PR enables scalable loop vectorization for f16 with zvfhmin and bf16 with zvfbfmin.

Enabling this was dependent on filling out the gaps for scalable zvfhmin/zvfbfmin codegen, but everything that the loop vectorizer might emit should now be handled.

It does this by marking f16 and bf16 as legal in isLegalElementTypeForRVV. There are a few users of isLegalElementTypeForRVV that have already been enabled in other PRs:

- isLegalStridedLoadStore: [RISCV] Allow f16/bf16 with zvfhmin/zvfbfmin as legal strided access #115264
- isLegalInterleavedAccessType: [RISCV] Allow f16/bf16 with zvfhmin/zvfbfmin as legal interleaved access #115257
- isLegalMaskedLoadStore: [RISCV] Lower fixed-length mload/mstore for zvfhmin/zvfbfmin #115145
- isLegalMaskedGatherScatter: [RISCV] Lower fixed-length mgather/mscatter for zvfhmin/zvfbfmin #114945

The remaining user is isLegalToVectorizeReduction. We can't promote f16/bf16 reductions to f32, so we need to disable them for scalable vectors. The cost model actually marks these as invalid, but for out-of-tree reductions ComputeReductionResult doesn't get costed and it will end up emitting a reduction intrinsic regardless, so we still need to mark them as illegal. We might be able to remove this restriction later for fmax and fmin reductions.

After this PR, the following loop compiled with -march=rv64gv_zvfbfwma -O2:

void f(float * restrict dst, __bf16 * restrict a, __bf16 * restrict b, int n) {
for (int i = 0; i < n; i++)
dst[i] += ((float)a[i] * (float)b[i]);
}

Goes from this in llvm-19.1.0:

.LBB0_4:
addi t1, t0, 16
vsetivli zero, 8, e16, m1, ta, ma
vle16.v v8, (t0)
vle16.v v9, (t1)
addi t1, a6, 32
addi t2, a7, 16
vfwcvtbf16.f.f.v v10, v8
vfwcvtbf16.f.f.v v12, v9
vle16.v v8, (a7)
vle16.v v9, (t2)
vle32.v v14, (a6)
vle32.v v16, (t1)
vfwcvtbf16.f.f.v v18, v8
vfwcvtbf16.f.f.v v20, v9
vsetvli zero, zero, e32, m2, ta, ma
vfmacc.vv v14, v10, v18
vfmacc.vv v16, v12, v20
vse32.v v14, (a6)
vse32.v v16, (t1)
addi t0, t0, 32
addi a7, a7, 32
addi a6, a6, 64
bne t0, a5, .LBB0_4

To:

vsetvli t4, zero, e16, m1, ta, ma
.LBB0_4:
vl1re16.v v8, (t3)
vl1re16.v v9, (t2)
vl2re32.v v10, (t1)
vfwmaccbf16.vv v10, v8, v9
vs2r.v v10, (t1)
add t3, t3, a4
add t2, t2, a4
sub t0, t0, a6
add t1, t1, a7
bnez t0, .LBB0_4

Patch is 32.46 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115272.diff

5 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5600524b69a620..7a27afa17c9319 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2516,7 +2516,9 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const {
case MVT::i64:
return Subtarget.hasVInstructionsI64();
case MVT::f16:
- return Subtarget.hasVInstructionsF16();
+ return Subtarget.hasVInstructionsF16Minimal();
+ case MVT::bf16:
+ return Subtarget.hasVInstructionsBF16Minimal();
case MVT::f32:
return Subtarget.hasVInstructionsF32();
case MVT::f64:
@@ -21509,12 +21511,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType(
if (!isTypeLegal(VT))
return false;
- // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
- if (!(isLegalElementTypeForRVV(VT.getScalarType()) ||
- (VT.getScalarType() == MVT::bf16 &&
- Subtarget.hasVInstructionsBF16Minimal()) ||
- (VT.getScalarType() == MVT::f16 &&
- Subtarget.hasVInstructionsF16Minimal())) ||
+ if (!isLegalElementTypeForRVV(VT.getScalarType()) ||
!allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace,
Alignment))
return false;
@@ -21554,10 +21551,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
return false;
EVT ScalarType = DataType.getScalarType();
- // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
- if (!(isLegalElementTypeForRVV(ScalarType) ||
- (ScalarType == MVT::bf16 && Subtarget.hasVInstructionsBF16Minimal()) ||
- (ScalarType == MVT::f16 && Subtarget.hasVInstructionsF16Minimal())))
+ if (!isLegalElementTypeForRVV(ScalarType))
return false;
if (!Subtarget.enableUnalignedVectorMem() &&
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 4c01c1679cd818..bbbe101745f0e3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -239,12 +239,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
- return TLI->isLegalElementTypeForRVV(ElemType) ||
- (DataTypeVT.getVectorElementType() == MVT::bf16 &&
- ST->hasVInstructionsBF16Minimal()) ||
- (DataTypeVT.getVectorElementType() == MVT::f16 &&
- ST->hasVInstructionsF16Minimal());
+ return TLI->isLegalElementTypeForRVV(ElemType);
}
bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
@@ -274,12 +269,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
- return TLI->isLegalElementTypeForRVV(ElemType) ||
- (DataTypeVT.getVectorElementType() == MVT::bf16 &&
- ST->hasVInstructionsBF16Minimal()) ||
- (DataTypeVT.getVectorElementType() == MVT::f16 &&
- ST->hasVInstructionsF16Minimal());
+ return TLI->isLegalElementTypeForRVV(ElemType);
}
bool isLegalMaskedGather(Type *DataType, Align Alignment) {
@@ -342,8 +332,14 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
return false;
switch (RdxDesc.getRecurrenceKind()) {
- case RecurKind::Add:
case RecurKind::FAdd:
+ case RecurKind::FMulAdd:
+ // We can't promote f16/bf16 fadd reductions and scalable vectors can't be
+ // expanded.
+ if (Ty->isBFloatTy() || (Ty->isHalfTy() && !ST->hasVInstructionsF16()))
+ return false;
+ [[fallthrough]];
+ case RecurKind::Add:
case RecurKind::And:
case RecurKind::Or:
case RecurKind::Xor:
@@ -353,7 +349,6 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
case RecurKind::UMax:
case RecurKind::FMin:
case RecurKind::FMax:
- case RecurKind::FMulAdd:
case RecurKind::IAnyOf:
case RecurKind::FAnyOf:
return true;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
new file mode 100644
index 00000000000000..27923f82411d00
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
@@ -0,0 +1,233 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFBFMIN
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -S | FileCheck %s -check-prefix=ZVFBFMIN
+
+define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
+; NO-ZVFBFMIN-LABEL: define void @fadd(
+; NO-ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-ZVFBFMIN-NEXT: [[ENTRY:.*]]:
+; NO-ZVFBFMIN-NEXT: br label %[[LOOP:.*]]
+; NO-ZVFBFMIN: [[LOOP]]:
+; NO-ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; NO-ZVFBFMIN-NEXT: [[Z:%.*]] = fadd bfloat [[X]], [[Y]]
+; NO-ZVFBFMIN-NEXT: store bfloat [[Z]], ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; NO-ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-ZVFBFMIN: [[EXIT]]:
+; NO-ZVFBFMIN-NEXT: ret void
+;
+; ZVFBFMIN-LABEL: define void @fadd(
+; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; ZVFBFMIN-NEXT: [[ENTRY:.*]]:
+; ZVFBFMIN-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP8]]
+; ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ZVFBFMIN: [[VECTOR_PH]]:
+; ZVFBFMIN-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]]
+; ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; ZVFBFMIN-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP12]], 8
+; ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]]
+; ZVFBFMIN: [[VECTOR_BODY]]:
+; ZVFBFMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFBFMIN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; ZVFBFMIN-NEXT: [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]]
+; ZVFBFMIN-NEXT: [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]]
+; ZVFBFMIN-NEXT: [[TMP3:%.*]] = getelementptr bfloat, ptr [[TMP1]], i32 0
+; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP3]], align 2
+; ZVFBFMIN-NEXT: [[TMP4:%.*]] = getelementptr bfloat, ptr [[TMP2]], i32 0
+; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP4]], align 2
+; ZVFBFMIN-NEXT: [[TMP11:%.*]] = fadd <vscale x 8 x bfloat> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; ZVFBFMIN-NEXT: store <vscale x 8 x bfloat> [[TMP11]], ptr [[TMP3]], align 2
+; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; ZVFBFMIN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVFBFMIN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; ZVFBFMIN: [[MIDDLE_BLOCK]]:
+; ZVFBFMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; ZVFBFMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; ZVFBFMIN: [[SCALAR_PH]]:
+; ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ZVFBFMIN-NEXT: br label %[[LOOP:.*]]
+; ZVFBFMIN: [[LOOP]]:
+; ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; ZVFBFMIN-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; ZVFBFMIN-NEXT: [[Z:%.*]] = fadd bfloat [[X]], [[Y]]
+; ZVFBFMIN-NEXT: store bfloat [[Z]], ptr [[A_GEP]], align 2
+; ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; ZVFBFMIN: [[EXIT]]:
+; ZVFBFMIN-NEXT: ret void
+;
+entry:
+ br label %loop
+loop:
+ %i = phi i64 [0, %entry], [%i.next, %loop]
+ %a.gep = getelementptr bfloat, ptr %a, i64 %i
+ %b.gep = getelementptr bfloat, ptr %b, i64 %i
+ %x = load bfloat, ptr %a.gep
+ %y = load bfloat, ptr %b.gep
+ %z = fadd bfloat %x, %y
+ store bfloat %z, ptr %a.gep
+ %i.next = add i64 %i, 1
+ %done = icmp eq i64 %i.next, %n
+ br i1 %done, label %exit, label %loop
+exit:
+ ret void
+}
+
+define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
+; NO-ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv(
+; NO-ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; NO-ZVFBFMIN-NEXT: [[ENTRY:.*]]:
+; NO-ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; NO-ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-ZVFBFMIN: [[VECTOR_PH]]:
+; NO-ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; NO-ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-ZVFBFMIN: [[VECTOR_BODY]]:
+; NO-ZVFBFMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-ZVFBFMIN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-ZVFBFMIN-NEXT: [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]]
+; NO-ZVFBFMIN-NEXT: [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]]
+; NO-ZVFBFMIN-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP0]]
+; NO-ZVFBFMIN-NEXT: [[TMP4:%.*]] = getelementptr bfloat, ptr [[TMP1]], i32 0
+; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <8 x bfloat>, ptr [[TMP4]], align 2
+; NO-ZVFBFMIN-NEXT: [[TMP5:%.*]] = getelementptr bfloat, ptr [[TMP2]], i32 0
+; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x bfloat>, ptr [[TMP5]], align 2
+; NO-ZVFBFMIN-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[TMP3]], i32 0
+; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP6]], align 4
+; NO-ZVFBFMIN-NEXT: [[TMP7:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD]] to <8 x float>
+; NO-ZVFBFMIN-NEXT: [[TMP8:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD1]] to <8 x float>
+; NO-ZVFBFMIN-NEXT: [[TMP9:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP7]], <8 x float> [[TMP8]], <8 x float> [[WIDE_LOAD2]])
+; NO-ZVFBFMIN-NEXT: store <8 x float> [[TMP9]], ptr [[TMP6]], align 4
+; NO-ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; NO-ZVFBFMIN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-ZVFBFMIN-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-ZVFBFMIN: [[MIDDLE_BLOCK]]:
+; NO-ZVFBFMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-ZVFBFMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-ZVFBFMIN: [[SCALAR_PH]]:
+; NO-ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; NO-ZVFBFMIN-NEXT: br label %[[LOOP:.*]]
+; NO-ZVFBFMIN: [[LOOP]]:
+; NO-ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT: [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; NO-ZVFBFMIN-NEXT: [[Z:%.*]] = load float, ptr [[C_GEP]], align 4
+; NO-ZVFBFMIN-NEXT: [[X_EXT:%.*]] = fpext bfloat [[X]] to float
+; NO-ZVFBFMIN-NEXT: [[Y_EXT:%.*]] = fpext bfloat [[Y]] to float
+; NO-ZVFBFMIN-NEXT: [[FMULADD:%.*]] = call float @llvm.fmuladd.f32(float [[X_EXT]], float [[Y_EXT]], float [[Z]])
+; NO-ZVFBFMIN-NEXT: store float [[FMULADD]], ptr [[C_GEP]], align 4
+; NO-ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; NO-ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-ZVFBFMIN: [[EXIT]]:
+; NO-ZVFBFMIN-NEXT: ret void
+;
+; ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv(
+; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; ZVFBFMIN-NEXT: [[ENTRY:.*]]:
+; ZVFBFMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ZVFBFMIN: [[VECTOR_PH]]:
+; ZVFBFMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; ZVFBFMIN-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]]
+; ZVFBFMIN: [[VECTOR_BODY]]:
+; ZVFBFMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFBFMIN-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; ZVFBFMIN-NEXT: [[TMP7:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP6]]
+; ZVFBFMIN-NEXT: [[TMP8:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP6]]
+; ZVFBFMIN-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP6]]
+; ZVFBFMIN-NEXT: [[TMP10:%.*]] = getelementptr bfloat, ptr [[TMP7]], i32 0
+; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x bfloat>, ptr [[TMP10]], align 2
+; ZVFBFMIN-NEXT: [[TMP11:%.*]] = getelementptr bfloat, ptr [[TMP8]], i32 0
+; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x bfloat>, ptr [[TMP11]], align 2
+; ZVFBFMIN-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
+; ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; ZVFBFMIN-NEXT: [[TMP13:%.*]] = fpext <vscale x 4 x bfloat> [[WIDE_LOAD]] to <vscale x 4 x float>
+; ZVFBFMIN-NEXT: [[TMP14:%.*]] = fpext <vscale x 4 x bfloat> [[WIDE_LOAD1]] to <vscale x 4 x float>
+; ZVFBFMIN-NEXT: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[TMP13]], <vscale x 4 x float> [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD2]])
+; ZVFBFMIN-NEXT: store <vscale x 4 x float> [[TMP15]], ptr [[TMP12]], align 4
+; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; ZVFBFMIN-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVFBFMIN-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; ZVFBFMIN: [[MIDDLE_BLOCK]]:
+; ZVFBFMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; ZVFBFMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; ZVFBFMIN: [[SCALAR_PH]]:
+; ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ZVFBFMIN-NEXT: br label %[[LOOP:.*]]
+; ZVFBFMIN: [[LOOP]]:
+; ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; ZVFBFMIN-NEXT: [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]]
+; ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; ZVFBFMIN-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; ZVFBFMIN-NEXT: [[Z:%.*]] = load float, ptr [[C_GEP]], align 4
+; ZVFBFMIN-NEXT: [[X_EXT:%.*]] = fpext bfloat [[X]] to float
+; ZVFBFMIN-NEXT: [[Y_EXT:%.*]] = fpext bfloat [[Y]] to float
+; ZVFBFMIN-NEXT: [[FMULADD:%.*]] = call float @llvm.fmuladd.f32(float [[X_EXT]], float [[Y_EXT]], float [[Z]])
+; ZVFBFMIN-NEXT: store float [[FMULADD]], ptr [[C_GEP]], align 4
+; ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; ZVFBFMIN: [[EXIT]]:
+; ZVFBFMIN-NEXT: ret void
+;
+entry:
+ br label %loop
+loop:
+ %i = phi i64 [0, %entry], [%i.next, %loop]
+ %a.gep = getelementptr bfloat, ptr %a, i64 %i
+ %b.gep = getelementptr bfloat, ptr %b, i64 %i
+ %c.gep = getelementptr float, ptr %c, i64 %i
+ %x = load bfloat, ptr %a.gep
+ %y = load bfloat, ptr %b.gep
+ %z = load float, ptr %c.gep
+ %x.ext = fpext bfloat %x to float
+ %y.ext = fpext bfloat %y to float
+ %fmuladd = call float @llvm.fmuladd.f32(float %x.ext, float %y.ext, float %z)
+ store float %fmuladd, ptr %c.gep
+ %i.next = add i64 %i, 1
+ %done = icmp eq i64 %i.next, %n
+ br i1 %done, label %exit, label %loop
+exit:
+ ret void
+}
+;.
+; NO-ZVFBFMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO-ZVFBFMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-ZVFBFMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
+; ZVFBFMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; ZVFBFMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; ZVFBFMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; ZVFBFMIN: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; ZVFBFMIN: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
new file mode 100644
index 00000000000000..2b267f6a2a9778
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFHMIN
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -S | FileCheck %s -check-prefix=ZVFHMIN
+
+define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
+; NO-ZVFHMIN-LABEL: define void @fadd(
+; NO-ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-ZVFHMIN-NEXT: [[ENTRY:.*]]:
+; NO-ZVFHMIN-NEXT: br label %[[LOOP:.*]]
+; NO-ZVFHMIN: [[LOOP]]:
+; NO-ZVFHMIN-NEXT: [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFHMIN-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]]
+; NO-ZVFHMIN-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]]
+; NO-ZVFHMIN-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2
+; NO-ZVFHMIN-NEXT: [[Y:%.*]] = load half, ptr [[B_GEP]], align 2
+; NO-ZVFHMIN-NEXT: [[Z:%.*]] = fadd half [[X]], [[Y]]
+; NO-ZVFHMIN-NEXT...
[truncated]
What is the result if we change the data type in TSVC2 to bf16? Can we handle all cases correctly?
; NO-ZVFBFMIN-NEXT: [[TMP7:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD]] to <8 x float>
; NO-ZVFBFMIN-NEXT: [[TMP8:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD1]] to <8 x float>
; NO-ZVFBFMIN-NEXT: [[TMP9:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP7]], <8 x float> [[TMP8]], <8 x float> [[WIDE_LOAD2]]) |
This is an existing problem on ToT where we don't cost a number of these fixed-vector instructions properly whenever the element type isn't legal, which leads to this being vectorized even without +zvfbfmin, e.g.:
LV: Found an estimated cost of 2 for VF 8 For instruction: %x.ext = fpext bfloat %x to float
LV: Found an estimated cost of 2 for VF 8 For instruction: %y.ext = fpext bfloat %y to float
LV: Found an estimated cost of 2 for VF 8 For instruction: %fmuladd = tail call float @llvm.fmuladd.f32(float %x.ext, float %y.ext, float %z)
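For reference, cost-model output like the above comes from the loop vectorizer's debug stream and can be reproduced with something like the command below; the input file name is illustrative and an assertions-enabled build of opt is assumed:

opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -disable-output \
    -debug-only=loop-vectorize vfwmaccbf16.ll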
It turns out we don't actually promote fmin/fmax reductions yet, and they will crash when lowering. I think we should be able to promote these and remove the restriction in a follow-up PR.
Good idea, this actually caught a crash: it turns out we also can't lower fmax/fmin reductions, so I've disabled these for now as well. I think we can promote these in a later PR. Otherwise, TSVC2 seems to compile fine and emits a healthy amount of vfwcvtbf16.f.f.v and vfwmaccbf16.vv instructions. I can try running it on QEMU tomorrow.
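For context, here is a minimal sketch, not taken from this patch's tests, of the kind of bf16 fadd reduction that the isLegalToVectorizeReduction change keeps from being vectorized with scalable vectors (it can't be promoted to f32). The function name is made up and the RUN setup is assumed to mirror bf16.ll above:

; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -S
define bfloat @fadd_reduction(ptr noalias %a, i64 %n) {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %acc = phi bfloat [ 0.000000e+00, %entry ], [ %acc.next, %loop ]
  %a.gep = getelementptr bfloat, ptr %a, i64 %i
  %x = load bfloat, ptr %a.gep
  ; reassociation must be allowed for the vectorizer to treat this as a reduction
  %acc.next = fadd fast bfloat %acc, %x
  %i.next = add i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %loop
exit:
  ret bfloat %acc.next
}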
LGTM
The run finished; the checksums are the same between the vectorized and non-vectorized binaries, so I presume the lowering is correct.
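For anyone wanting to repeat that comparison, here is a rough sketch; the source file names, sysroot/toolchain setup, and QEMU flags are assumptions rather than the exact commands used (a QEMU recent enough for -cpu max to enable the vector and bf16 extensions is assumed):

clang --target=riscv64-linux-gnu -march=rv64gv_zvfbfwma -O2 tsvc.c -o tsvc.vec
clang --target=riscv64-linux-gnu -march=rv64gv_zvfbfwma -O2 -fno-vectorize -fno-slp-vectorize tsvc.c -o tsvc.novec
qemu-riscv64 -cpu max ./tsvc.vec > vec.out
qemu-riscv64 -cpu max ./tsvc.novec > novec.out
diff vec.out novec.out    # identical checksums suggest the lowering is correct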
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/123/builds/9126

Here is the relevant piece of the build log for reference: