Skip to content

Commit ecf327f

Browse files
jacquesguanjacquesguan
authored andcommitted
[RISCV] Add cost model for vector insert/extract element.
This patch adds cost model for vector insert/extract element instructions. In RVV, we could use vector scalar move instruction to insert or extract the first element, and use vslide to move it. But for mask vector or i64 vector in i32 target, we need special instructions to make it. Reviewed By: reames Differential Revision: https://reviews.llvm.org/D133007
1 parent e2632fb commit ecf327f

16 files changed

+925
-729
lines changed

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,99 @@ InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
765765
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
766766
}
767767

768+
InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
769+
unsigned Index) {
770+
assert(Val->isVectorTy() && "This must be a vector type");
771+
772+
if (Opcode != Instruction::ExtractElement &&
773+
Opcode != Instruction::InsertElement)
774+
return BaseT::getVectorInstrCost(Opcode, Val, Index);
775+
776+
// Legalize the type.
777+
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
778+
779+
// This type is legalized to a scalar type.
780+
if (!LT.second.isVector())
781+
return 0;
782+
783+
// For unsupported scalable vector.
784+
if (LT.second.isScalableVector() && !LT.first.isValid())
785+
return LT.first;
786+
787+
if (!isTypeLegal(Val))
788+
return BaseT::getVectorInstrCost(Opcode, Val, Index);
789+
790+
// In RVV, we could use vslidedown + vmv.x.s to extract element from vector
791+
// and vslideup + vmv.s.x to insert element to vector.
792+
unsigned BaseCost = 1;
793+
// When insertelement we should add the index with 1 as the input of vslideup.
794+
unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
795+
796+
if (Index != -1U) {
797+
// The type may be split. For fixed-width vectors we can normalize the
798+
// index to the new type.
799+
if (LT.second.isFixedLengthVector()) {
800+
unsigned Width = LT.second.getVectorNumElements();
801+
Index = Index % Width;
802+
}
803+
804+
// We could extract/insert the first element without vslidedown/vslideup.
805+
if (Index == 0)
806+
SlideCost = 0;
807+
else if (Opcode == Instruction::InsertElement)
808+
SlideCost = 1; // With a constant index, we do not need to use addi.
809+
}
810+
811+
// Mask vector extract/insert element is different from normal case.
812+
if (Val->getScalarSizeInBits() == 1) {
813+
// For extractelement, we need the following instructions:
814+
// vmv.v.i v8, 0
815+
// vmerge.vim v8, v8, 1, v0
816+
// vsetivli zero, 1, e8, m2, ta, mu (not count)
817+
// vslidedown.vx v8, v8, a0
818+
// vmv.x.s a0, v8
819+
820+
// For insertelement, we need the following instructions:
821+
// vsetvli a2, zero, e8, m1, ta, mu (not count)
822+
// vmv.s.x v8, a0
823+
// vmv.v.i v9, 0
824+
// vmerge.vim v9, v9, 1, v0
825+
// addi a0, a1, 1
826+
// vsetvli zero, a0, e8, m1, tu, mu (not count)
827+
// vslideup.vx v9, v8, a1
828+
// vsetvli a0, zero, e8, m1, ta, mu (not count)
829+
// vand.vi v8, v9, 1
830+
// vmsne.vi v0, v8, 0
831+
832+
// TODO: should we count these special vsetvlis?
833+
BaseCost = Opcode == Instruction::InsertElement ? 5 : 3;
834+
}
835+
// Extract i64 in the target that has XLEN=32 need more instruction.
836+
if (Val->getScalarType()->isIntegerTy() &&
837+
ST->getXLen() < Val->getScalarSizeInBits()) {
838+
// For extractelement, we need the following instructions:
839+
// vsetivli zero, 1, e64, m1, ta, mu (not count)
840+
// vslidedown.vx v8, v8, a0
841+
// vmv.x.s a0, v8
842+
// li a1, 32
843+
// vsrl.vx v8, v8, a1
844+
// vmv.x.s a1, v8
845+
846+
// For insertelement, we need the following instructions:
847+
// vsetivli zero, 2, e32, m4, ta, mu (not count)
848+
// vmv.v.i v12, 0
849+
// vslide1up.vx v16, v12, a1
850+
// vslide1up.vx v12, v16, a0
851+
// addi a0, a2, 1
852+
// vsetvli zero, a0, e64, m4, tu, mu (not count)
853+
// vslideup.vx v8, v12, a2
854+
855+
// TODO: should we count these special vsetvlis?
856+
BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
857+
}
858+
return BaseCost + SlideCost;
859+
}
860+
768861
void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
769862
TTI::UnrollingPreferences &UP,
770863
OptimizationRemarkEmitter *ORE) {

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
145145
TTI::TargetCostKind CostKind,
146146
const Instruction *I = nullptr);
147147

148+
using BaseT::getVectorInstrCost;
149+
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
150+
unsigned Index);
151+
148152
bool isElementTypeLegalForScalableVector(Type *Ty) const {
149153
return TLI->isLegalElementTypeForRVV(Ty);
150154
}

llvm/test/Analysis/CostModel/RISCV/arith-fp.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -329,9 +329,9 @@ define i32 @frem() {
329329
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
330330
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
331331
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> undef, undef
332-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> undef, undef
333-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> undef, undef
334-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> undef, undef
332+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F16 = frem <2 x half> undef, undef
333+
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F16 = frem <4 x half> undef, undef
334+
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F16 = frem <8 x half> undef, undef
335335
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F16 = frem <16 x half> undef, undef
336336
; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32F16 = frem <32 x half> undef, undef
337337
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem <vscale x 1 x half> undef, undef
@@ -341,8 +341,8 @@ define i32 @frem() {
341341
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = frem <vscale x 16 x half> undef, undef
342342
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = frem <vscale x 32 x half> undef, undef
343343
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = frem <1 x float> undef, undef
344-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = frem <2 x float> undef, undef
345-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = frem <4 x float> undef, undef
344+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = frem <2 x float> undef, undef
345+
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = frem <4 x float> undef, undef
346346
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F32 = frem <8 x float> undef, undef
347347
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F32 = frem <16 x float> undef, undef
348348
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F32 = frem <vscale x 1 x float> undef, undef
@@ -351,7 +351,7 @@ define i32 @frem() {
351351
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F32 = frem <vscale x 8 x float> undef, undef
352352
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F32 = frem <vscale x 16 x float> undef, undef
353353
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = frem <1 x double> undef, undef
354-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
354+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = frem <2 x double> undef, undef
355355
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
356356
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef
357357
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F64 = frem <vscale x 1 x double> undef, undef

llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -44,33 +44,33 @@ define i32 @masked_gather() {
4444
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I8 = call <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*> undef, i32 1, <1 x i1> undef, <1 x i8> undef)
4545
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64.u = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 2, <8 x i1> undef, <8 x double> undef)
4646
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64.u = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 2, <4 x i1> undef, <4 x double> undef)
47-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64.u = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 2, <2 x i1> undef, <2 x double> undef)
47+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64.u = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 2, <2 x i1> undef, <2 x double> undef)
4848
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64.u = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 2, <1 x i1> undef, <1 x double> undef)
4949
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F32.u = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 2, <16 x i1> undef, <16 x float> undef)
5050
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F32.u = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 2, <8 x i1> undef, <8 x float> undef)
51-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F32.u = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 2, <4 x i1> undef, <4 x float> undef)
52-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32.u = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 2, <2 x i1> undef, <2 x float> undef)
51+
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F32.u = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 2, <4 x i1> undef, <4 x float> undef)
52+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32.u = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 2, <2 x i1> undef, <2 x float> undef)
5353
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32.u = call <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*> undef, i32 2, <1 x i1> undef, <1 x float> undef)
5454
; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32F16.u = call <32 x half> @llvm.masked.gather.v32f16.v32p0f16(<32 x half*> undef, i32 1, <32 x i1> undef, <32 x half> undef)
5555
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F16.u = call <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*> undef, i32 1, <16 x i1> undef, <16 x half> undef)
56-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> undef, i32 1, <8 x i1> undef, <8 x half> undef)
57-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16.u = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> undef, i32 1, <4 x i1> undef, <4 x half> undef)
58-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16.u = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> undef, i32 1, <2 x i1> undef, <2 x half> undef)
56+
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> undef, i32 1, <8 x i1> undef, <8 x half> undef)
57+
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F16.u = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> undef, i32 1, <4 x i1> undef, <4 x half> undef)
58+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F16.u = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> undef, i32 1, <2 x i1> undef, <2 x half> undef)
5959
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16.u = call <1 x half> @llvm.masked.gather.v1f16.v1p0f16(<1 x half*> undef, i32 1, <1 x i1> undef, <1 x half> undef)
6060
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64.u = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 4, <8 x i1> undef, <8 x i64> undef)
6161
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64.u = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 4, <4 x i1> undef, <4 x i64> undef)
62-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64.u = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 4, <2 x i1> undef, <2 x i64> undef)
62+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64.u = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 4, <2 x i1> undef, <2 x i64> undef)
6363
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64.u = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 4, <1 x i1> undef, <1 x i64> undef)
6464
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32.u = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
6565
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32.u = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
66-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32.u = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
67-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32.u = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
66+
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32.u = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
67+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32.u = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
6868
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32.u = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> undef, i32 1, <1 x i1> undef, <1 x i32> undef)
6969
; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16.u = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
7070
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16.u = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
71-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16.u = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
72-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16.u = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
73-
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16.u = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> undef, i32 1, <2 x i1> undef, <2 x i16> undef)
71+
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I16.u = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
72+
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I16.u = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
73+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I16.u = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> undef, i32 1, <2 x i1> undef, <2 x i16> undef)
7474
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I16.u = call <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*> undef, i32 1, <1 x i1> undef, <1 x i16> undef)
7575
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
7676
;

0 commit comments

Comments
 (0)