-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[RISCV][CostModel] Add cost for fabs/fsqrt of type bf16/f16 #118608
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-analysis Author: LiqinWeng (LiqinWeng) ChangesFull diff: https://github.com/llvm/llvm-project/pull/118608.diff 3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index aed476db1956fa..38e9dfe1e44e96 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1035,21 +1035,41 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
- case Intrinsic::fabs:
+ case Intrinsic::fabs: {
+ auto LT = getTypeLegalizationCost(RetTy);
+ if (ST->hasVInstructions() && LT.second.isVector()) {
+ // lui a0, 8
+ // addi a0, a0, -1
+ // vsetvli a1, zero, e16, m1, ta, ma
+ // vand.vx v8, v8, a0
+ // f16 with zvfhmin and bf16 with zvfhbmin
+ if (LT.second.getVectorElementType() == MVT::bf16 ||
+ (LT.second.getVectorElementType() == MVT::f16 &&
+ !ST->hasVInstructionsF16()))
+ return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
+ CostKind) +
+ 2;
+ else
+ return LT.first *
+ getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
+ }
+ break;
+ }
case Intrinsic::sqrt: {
auto LT = getTypeLegalizationCost(RetTy);
- // TODO: add f16/bf16, bf16 with zvfbfmin && f16 with zvfhmin
if (ST->hasVInstructions() && LT.second.isVector()) {
- unsigned Op;
- switch (ICA.getID()) {
- case Intrinsic::fabs:
- Op = RISCV::VFSGNJX_VV;
- break;
- case Intrinsic::sqrt:
- Op = RISCV::VFSQRT_V;
- break;
- }
- return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
+ SmallVector<unsigned, 3> Opcodes;
+ // f16 with zvfhmin and bf16 with zvfbfmin
+ if (LT.second.getVectorElementType() == MVT::bf16 &&
+ ST->hasVInstructionsBF16Minimal())
+ Opcodes = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFSQRT_V,
+ RISCV::VFNCVTBF16_F_F_W};
+ else if (LT.second.getVectorElementType() == MVT::f16 &&
+ !ST->hasVInstructionsF16())
+ Opcodes = {RISCV::VFWCVT_F_F_V, RISCV::VFSQRT_V, RISCV::VFNCVT_F_F_W};
+ else
+ Opcodes = {RISCV::VFSQRT_V};
+ return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}
break;
}
diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll b/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll
index 9eb06a07f135fa..8ff70453194381 100644
--- a/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll
@@ -5,14 +5,14 @@
define void @fabs() {
; CHECK-LABEL: 'fabs'
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.fabs.bf16(bfloat undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x bfloat> @llvm.fabs.nxv2bf16(<vscale x 2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x bfloat> @llvm.fabs.nxv4bf16(<vscale x 4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x bfloat> @llvm.fabs.nxv8bf16(<vscale x 8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x bfloat> @llvm.fabs.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call <vscale x 2 x bfloat> @llvm.fabs.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <vscale x 4 x bfloat> @llvm.fabs.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %8 = call <vscale x 8 x bfloat> @llvm.fabs.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %9 = call <vscale x 16 x bfloat> @llvm.fabs.nxv16bf16(<vscale x 16 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.fabs.f32(float undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
@@ -66,17 +66,29 @@ define void @fabs() {
}
define void @fabs_f16() {
-; CHECK-LABEL: 'fabs_f16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fabs.f16(half undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.fabs.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.fabs.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x half> @llvm.fabs.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x half> @llvm.fabs.nxv16f16(<vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVFH-LABEL: 'fabs_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fabs.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.fabs.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.fabs.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x half> @llvm.fabs.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x half> @llvm.fabs.nxv16f16(<vscale x 16 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fabs_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fabs.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call <vscale x 2 x half> @llvm.fabs.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <vscale x 4 x half> @llvm.fabs.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %8 = call <vscale x 8 x half> @llvm.fabs.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %9 = call <vscale x 16 x half> @llvm.fabs.nxv16f16(<vscale x 16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
call half @llvm.fabs.f16(half undef)
call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
index 446627f6bf3c0e..d3ceb162c13364 100644
--- a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
@@ -5,14 +5,14 @@
define void @sqrt() {
; CHECK-LABEL: 'sqrt'
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.sqrt.bf16(bfloat undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x bfloat> @llvm.sqrt.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %5 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %8 = call <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %9 = call <vscale x 16 x bfloat> @llvm.sqrt.nxv16bf16(<vscale x 16 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.sqrt.f32(float undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
@@ -66,17 +66,29 @@ define void @sqrt() {
}
define void @sqrt_f16() {
-; CHECK-LABEL: 'sqrt_f16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVFH-LABEL: 'sqrt_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'sqrt_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
call half @llvm.sqrt.f16(half undef)
call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
|
@llvm/pr-subscribers-backend-risc-v Author: LiqinWeng (LiqinWeng) ChangesFull diff: https://github.com/llvm/llvm-project/pull/118608.diff 3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index aed476db1956fa..38e9dfe1e44e96 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1035,21 +1035,41 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
- case Intrinsic::fabs:
+ case Intrinsic::fabs: {
+ auto LT = getTypeLegalizationCost(RetTy);
+ if (ST->hasVInstructions() && LT.second.isVector()) {
+ // lui a0, 8
+ // addi a0, a0, -1
+ // vsetvli a1, zero, e16, m1, ta, ma
+ // vand.vx v8, v8, a0
+ // f16 with zvfhmin and bf16 with zvfhbmin
+ if (LT.second.getVectorElementType() == MVT::bf16 ||
+ (LT.second.getVectorElementType() == MVT::f16 &&
+ !ST->hasVInstructionsF16()))
+ return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
+ CostKind) +
+ 2;
+ else
+ return LT.first *
+ getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
+ }
+ break;
+ }
case Intrinsic::sqrt: {
auto LT = getTypeLegalizationCost(RetTy);
- // TODO: add f16/bf16, bf16 with zvfbfmin && f16 with zvfhmin
if (ST->hasVInstructions() && LT.second.isVector()) {
- unsigned Op;
- switch (ICA.getID()) {
- case Intrinsic::fabs:
- Op = RISCV::VFSGNJX_VV;
- break;
- case Intrinsic::sqrt:
- Op = RISCV::VFSQRT_V;
- break;
- }
- return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
+ SmallVector<unsigned, 3> Opcodes;
+ // f16 with zvfhmin and bf16 with zvfbfmin
+ if (LT.second.getVectorElementType() == MVT::bf16 &&
+ ST->hasVInstructionsBF16Minimal())
+ Opcodes = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFSQRT_V,
+ RISCV::VFNCVTBF16_F_F_W};
+ else if (LT.second.getVectorElementType() == MVT::f16 &&
+ !ST->hasVInstructionsF16())
+ Opcodes = {RISCV::VFWCVT_F_F_V, RISCV::VFSQRT_V, RISCV::VFNCVT_F_F_W};
+ else
+ Opcodes = {RISCV::VFSQRT_V};
+ return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}
break;
}
diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll b/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll
index 9eb06a07f135fa..8ff70453194381 100644
--- a/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fp-min-max-abs.ll
@@ -5,14 +5,14 @@
define void @fabs() {
; CHECK-LABEL: 'fabs'
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.fabs.bf16(bfloat undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x bfloat> @llvm.fabs.nxv2bf16(<vscale x 2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x bfloat> @llvm.fabs.nxv4bf16(<vscale x 4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x bfloat> @llvm.fabs.nxv8bf16(<vscale x 8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x bfloat> @llvm.fabs.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call <vscale x 2 x bfloat> @llvm.fabs.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <vscale x 4 x bfloat> @llvm.fabs.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %8 = call <vscale x 8 x bfloat> @llvm.fabs.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %9 = call <vscale x 16 x bfloat> @llvm.fabs.nxv16bf16(<vscale x 16 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.fabs.f32(float undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
@@ -66,17 +66,29 @@ define void @fabs() {
}
define void @fabs_f16() {
-; CHECK-LABEL: 'fabs_f16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fabs.f16(half undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.fabs.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.fabs.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x half> @llvm.fabs.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x half> @llvm.fabs.nxv16f16(<vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVFH-LABEL: 'fabs_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fabs.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.fabs.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.fabs.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x half> @llvm.fabs.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x half> @llvm.fabs.nxv16f16(<vscale x 16 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fabs_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fabs.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call <vscale x 2 x half> @llvm.fabs.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <vscale x 4 x half> @llvm.fabs.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %8 = call <vscale x 8 x half> @llvm.fabs.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %9 = call <vscale x 16 x half> @llvm.fabs.nxv16f16(<vscale x 16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
call half @llvm.fabs.f16(half undef)
call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
index 446627f6bf3c0e..d3ceb162c13364 100644
--- a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
@@ -5,14 +5,14 @@
define void @sqrt() {
; CHECK-LABEL: 'sqrt'
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.sqrt.bf16(bfloat undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x bfloat> @llvm.sqrt.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %5 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %8 = call <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %9 = call <vscale x 16 x bfloat> @llvm.sqrt.nxv16bf16(<vscale x 16 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.sqrt.f32(float undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
@@ -66,17 +66,29 @@ define void @sqrt() {
}
define void @sqrt_f16() {
-; CHECK-LABEL: 'sqrt_f16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVFH-LABEL: 'sqrt_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'sqrt_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
call half @llvm.sqrt.f16(half undef)
call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
|
return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind); | ||
SmallVector<unsigned, 3> Opcodes; | ||
// f16 with zvfhmin and bf16 with zvfbfmin | ||
if (LT.second.getVectorElementType() == MVT::bf16) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to account for <vscale x 32 x f16> and <vscale x 32 x bf16> needing to be split before we can widen it? The widening of those types would produce an LMUL=16 value so we have to split first.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
cost of vscale x 16 x f16 is 12, whther cost of vscale x 32 x f16 is 24 and vscale x 64 x f16 is 48????
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
formula:
- Because the SpiltOpcodes == Opcodes, (LT.first - 1 ) * getRISCVInstructionCost(SpiltOpcodes , LT.second, CostKind) + getRISCVInstructionCost(Opcodes, LT.second, CostKind) ----> LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The real issue is that the vfsqrt cost needs to use an f32 type and that type needs to be split if it isn't legal.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It has been custom and has not used f32 type. So I get the LT.first == 1 (LT = getTypeLegalizationCost(RetTy)
), and get LK.first is TypeLegal (TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy)
;)
From the following it seems that the cost calculation is correct??? (cost of vscale x 16 x f16 is 12, whther cost of vscale x 32 x f16 is 24 )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think 12 is the correct cost for <vscale x 16 x f16>. That cost did not use f32 for the vsqrt.v cost.
The cost for <vscale x 4 x bfloat> should be at least 4 and possibly 5 or 6, not 3. There's a vfwcvt.f.f.v from EEW=16 EMUL=m1 to EEW=32 EMUL=m2. That's either a cost of 1 if the hardware can produce an EMUL=2 result in 1 cycle or its 2 if it takes 2 cycles to produce a wide result.
Then there's a EEW=32 EMUL=2 vfsqrt.v. That should be a cost of 2 since the EMUL is 2.
Then there's a vfncvt.f.f.w from EEW=32 EMUL=2 to EEW=16 EMUL=1. That's either 1 cycle if the hardware can narrow EMUL=2 in 1 cycle. Or it's a cost of 2.
So the total cost is either (1 + 2 + 1) or (2 + 2 + 1) or (1 + 2 + 2) or (2 + 2 + 2).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The cost use f32 for the vsqrt.v cost. Split have been cost * 2 in the getTypeLegalizationCost
.
Opcodes = {RISCV::VFWCVT_F_F_V, RISCV::VFSQRT_V, RISCV::VFNCVT_F_F_W}; | ||
else | ||
Opcodes = {RISCV::VFSQRT_V}; | ||
return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think both VFWCVT_F_F_V and VFNCVT_F_F_W should use the widened LMUL type, since presumably we should cost things based on the largest EMUL in the instruction, and that seems to match the spacemit-x60? https://camel-cdr.github.io/rvv-bench-results/bpi_f3/index.html
The easiest thing is probably just to update LT.second's element type to f32 to double the LMUL.
I'm not sure it's worthwhile trying to be fancy and handle it in getRISCVInstructionCost
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I update LT.second's element type to f32 to double the LMUL, pls review
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
getRISCVInstructionCost uses the vtype associated with each instruction:
- For vfwcvt.f.f.v, the source is (f16, LMUL1), the destination is (f32, LMUL2), and the vtype is (f16, LMUL).
- For vfncvt.f.f.w, the source is (f32, LMUL2), the destination is (f16, LMUL1), and the vtype is (f16, LMUL).
Because the number of uOps required for these operations varies by hardware and are not defined in the ISA, getRISCVInstructionCost takes only the vtype into account.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I want to determine the following questions:
- should VFWCVT_F_F_V and VFNCVT_F_F_W use the widened LMUL type?
- should VFWCVT_F_F_V and VFNCVT_F_F_W use the type of f16
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I update all opcode Vtype as the vfsqrt type
✅ With the latest revision this PR passed the undef deprecator. |
9a5f0c1
to
86bd2f7
Compare
if (LT.second.getVectorElementType() == MVT::bf16) { | ||
Opcodes = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFSQRT_V, | ||
RISCV::VFNCVTBF16_F_F_W}; | ||
NVT = TLI->getTypeToPromoteTo(ISD::FSQRT, NVT); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this works for <vscale x 32 x bfloat>. We use setOperationAction(ISD::FSQRT, MVT::v32bf16, Custom)
instead of Promote
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is indeed a problem. Is the cost of <vscale x 32 x bfloat> double the cost of <vscale x 16x bfloat>?
} else if (LT.second.getVectorElementType() == MVT::f16 && | ||
!ST->hasVInstructionsF16()) { | ||
Opcodes = {RISCV::VFWCVT_F_F_V, RISCV::VFSQRT_V, RISCV::VFNCVT_F_F_W}; | ||
NVT = TLI->getTypeToPromoteTo(ISD::FSQRT, NVT); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't work for <vscale x 32 x half>
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Only LUML = 8, bf16/f16 are not promoted. Can I handle this scenario alone, or do you have any better suggestions?
NVT = TLI->getTypeToPromoteTo(ISD::FSQRT, NVT); | ||
if (LT.second == MVT::nxv32f16) { | ||
Opcodes = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V, | ||
RISCV::VFSQRT_V, RISCV::VFSQRT_V, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NVT needs to be changed to v16f32.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you help me see the code check? I feel that there is no problem with the code, but the code check fails.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@LiqinWeng Are you trying to compute the cost for ISAs without f16 support as (vfwcvt, nxvf16) + (vfsqrt, nxvf32) + (vfncvt, nxv*f16)? From the current code, it doesn’t seem like that’s what’s being implemented.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Under what circumstances will it occur: (vfwcvt, nxvf16) + (vfsqrt, nxvf32) + (vfncvt, nxv*f16)???I don't quite understand ‘ISAs without f16’? do you means the mattr only support zvfh???
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's a vfwcvt.f.f.v from EEW=16 EMUL=m1 to EEW=32 EMUL=m2; Shouldn't the type of vfwcvt be nxvf32, the dest eew is 32
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@LiqinWeng vfwcvt is a widening arithmetic instruction. Based on the spec description, the destination EEW is 2 * SEW
, so the vtype is nxvf16 and the destination is nxvf32.
10.2. Widening Vector Arithmetic Instructions
A few vector arithmetic instructions are defined to be widening operations where the destination vector register group has EEW=2*SEW and EMUL=2*LMUL.
On the other hand, for narrowing arithmetic instruction such as vfncvt
, the destination EEW is SEW
, so both the vtype and the destination are nxvf16.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this the case with this instruction(vfwcvtbf16.f.f.v )?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hmm.. I think vfwcvtbf16.f.f.v is the same with vfwcvt.f.f.v.
4.4. vfwcvtbf16.f.f.v
Synopsis
Vector convert BF16 to FP32
Description
This instruction is similar to vfwcvt.f.f.v which converts a floating-point value in an SEW-width
format into a 2*SEW-width format. However, here the SEW-width format is limited to BF16.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed
Opcodes = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V, | ||
RISCV::VFSQRT_V, RISCV::VFSQRT_V, | ||
RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W}; | ||
NVT = TLI->getTypeToPromoteTo(ISD::FSQRT, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can't use getTypeToPromoteTo. That only works when LegalizeVectorOps/LegalizeDAG does the promotion via setOperationAction(ISD::FSQRT, VT, Promote)
. This case uses setOperationAction(ISD::FSQRT, MVT::nxv32f16, Custom)
and there is custom code in RISCVISelLowering.cpp to do the splitting. You'll need to hardcode the type has MVT::nxv16f32.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
if (LT.second == MVT::nxv32bf16) { | ||
ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V, | ||
RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W}; | ||
FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V}; | ||
ConvType = MVT::nxv16f16; | ||
FsqrtType = MVT::nxv16f32; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To handle nxv32bf16, you need 2 iterations of (vfwcvt nxv16bf16), (vfsqrt nxv16f32), and (vfncvt nxv16bf16).
The cost could therefore be LT.first * 2 * (vfwcvt + vfsqrt + vfncvt).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the ConvOp have been spilt, so neednt * 2
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, I see now. I overlooked the duplicated opcode.
if (LT.second == MVT::nxv32f16) { | ||
ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V, | ||
RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W}; | ||
FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V}; | ||
ConvType = MVT::nxv16f16; | ||
FsqrtType = MVT::nxv16f32; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ditto
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the ConvOp have been spilt, so neednt * 2 , same as FsqrtOp
} else { | ||
ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W}; | ||
FsqrtOp = {RISCV::VFSQRT_V}; | ||
FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure the reason getTypeToPromoteTo is needed here and below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It converts the f16 type to the f32 type as long as there is a setOperationAction for ISD::FSQRT call in RISCVISelLowering.cpp with Promote
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Understood. Thanks for the explanation!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
No description provided.