llvm
diff --git a/‎llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
Lines changed: 93 additions & 0 deletions b/‎llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
Lines changed: 93 additions & 0 deletions
diff --git a/‎llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
Lines changed: 4 additions & 0 deletions b/‎llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
Lines changed: 4 additions & 0 deletions
diff --git a/‎llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
Lines changed: 6 additions & 6 deletions b/‎llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
Lines changed: 6 additions & 6 deletions
diff --git a/‎llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll
Lines changed: 12 additions & 12 deletions b/‎llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll
Lines changed: 12 additions & 12 deletions
@@ -765,6 +765,99 @@ InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
 }
 
+InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                 unsigned Index) {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  if (Opcode != Instruction::ExtractElement &&
+      Opcode != Instruction::InsertElement)
+    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+
+  // Legalize the type.
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
+
+  // This type is legalized to a scalar type.
+  if (!LT.second.isVector())
+    return 0;
+
+  // For unsupported scalable vector.
+  if (LT.second.isScalableVector() && !LT.first.isValid())
+    return LT.first;
+
+  if (!isTypeLegal(Val))
+    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+
+  // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
+  // and vslideup + vmv.s.x to insert element to vector.
+  unsigned BaseCost = 1;
+  // When insertelement we should add the index with 1 as the input of vslideup.
+  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
+
+  if (Index != -1U) {
+    // The type may be split. For fixed-width vectors we can normalize the
+    // index to the new type.
+    if (LT.second.isFixedLengthVector()) {
+      unsigned Width = LT.second.getVectorNumElements();
+      Index = Index % Width;
+    }
+
+    // We could extract/insert the first element without vslidedown/vslideup.
+    if (Index == 0)
+      SlideCost = 0;
+    else if (Opcode == Instruction::InsertElement)
+      SlideCost = 1; // With a constant index, we do not need to use addi.
+  }
+
+  // Mask vector extract/insert element is different from normal case.
+  if (Val->getScalarSizeInBits() == 1) {
+    // For extractelement, we need the following instructions:
+    // vmv.v.i v8, 0
+    // vmerge.vim v8, v8, 1, v0
+    // vsetivli zero, 1, e8, m2, ta, mu (not count)
+    // vslidedown.vx v8, v8, a0
+    // vmv.x.s a0, v8
+
+    // For insertelement, we need the following instructions:
+    // vsetvli a2, zero, e8, m1, ta, mu (not count)
+    // vmv.s.x v8, a0
+    // vmv.v.i v9, 0
+    // vmerge.vim v9, v9, 1, v0
+    // addi a0, a1, 1
+    // vsetvli zero, a0, e8, m1, tu, mu (not count)
+    // vslideup.vx v9, v8, a1
+    // vsetvli a0, zero, e8, m1, ta, mu (not count)
+    // vand.vi v8, v9, 1
+    // vmsne.vi v0, v8, 0
+
+    // TODO: should we count these special vsetvlis?
+    BaseCost = Opcode == Instruction::InsertElement ? 5 : 3;
+  }
+  // Extract i64 in the target that has XLEN=32 need more instruction.
+  if (Val->getScalarType()->isIntegerTy() &&
+      ST->getXLen() < Val->getScalarSizeInBits()) {
+    // For extractelement, we need the following instructions:
+    // vsetivli zero, 1, e64, m1, ta, mu (not count)
+    // vslidedown.vx v8, v8, a0
+    // vmv.x.s a0, v8
+    // li a1, 32
+    // vsrl.vx v8, v8, a1
+    // vmv.x.s a1, v8
+
+    // For insertelement, we need the following instructions:
+    // vsetivli zero, 2, e32, m4, ta, mu (not count)
+    // vmv.v.i v12, 0
+    // vslide1up.vx v16, v12, a1
+    // vslide1up.vx v12, v16, a0
+    // addi a0, a2, 1
+    // vsetvli zero, a0, e64, m4, tu, mu (not count)
+    // vslideup.vx v8, v12, a2
+
+    // TODO: should we count these special vsetvlis?
+    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
+  }
+  return BaseCost + SlideCost;
+}
+
 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
 
@@ -145,6 +145,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
                                      TTI::TargetCostKind CostKind,
                                      const Instruction *I = nullptr);
 
+  using BaseT::getVectorInstrCost;
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     unsigned Index);
+
   bool isElementTypeLegalForScalableVector(Type *Ty) const {
     return TLI->isLegalElementTypeForRVV(Ty);
   }
 
@@ -329,9 +329,9 @@ define i32 @frem() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F16 = frem <2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F16 = frem <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F16 = frem <8 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16F16 = frem <16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32F16 = frem <32 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV1F16 = frem <vscale x 1 x half> undef, undef
@@ -341,8 +341,8 @@ define i32 @frem() {
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV16F16 = frem <vscale x 16 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV32F16 = frem <vscale x 32 x half> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = frem <1 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = frem <2 x float> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = frem <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = frem <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = frem <4 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F32 = frem <8 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV1F32 = frem <vscale x 1 x float> undef, undef
@@ -351,7 +351,7 @@ define i32 @frem() {
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV8F32 = frem <vscale x 8 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV16F32 = frem <vscale x 16 x float> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = frem <1 x double> undef, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = frem <2 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef
 ; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %NXV1F64 = frem <vscale x 1 x double> undef, undef
 
@@ -44,33 +44,33 @@ define i32 @masked_gather() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1I8 = call <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*> undef, i32 1, <1 x i1> undef, <1 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F64.u = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 2, <8 x i1> undef, <8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F64.u = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 2, <4 x i1> undef, <4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64.u = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 2, <2 x i1> undef, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F64.u = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 2, <2 x i1> undef, <2 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F64.u = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 2, <1 x i1> undef, <1 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16F32.u = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 2, <16 x i1> undef, <16 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F32.u = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 2, <8 x i1> undef, <8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F32.u = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 2, <4 x i1> undef, <4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32.u = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 2, <2 x i1> undef, <2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F32.u = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 2, <4 x i1> undef, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F32.u = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 2, <2 x i1> undef, <2 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F32.u = call <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*> undef, i32 2, <1 x i1> undef, <1 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32F16.u = call <32 x half> @llvm.masked.gather.v32f16.v32p0f16(<32 x half*> undef, i32 1, <32 x i1> undef, <32 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16F16.u = call <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*> undef, i32 1, <16 x i1> undef, <16 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> undef, i32 1, <8 x i1> undef, <8 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F16.u = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> undef, i32 1, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F16.u = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> undef, i32 1, <2 x i1> undef, <2 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> undef, i32 1, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4F16.u = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> undef, i32 1, <4 x i1> undef, <4 x half> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2F16.u = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> undef, i32 1, <2 x i1> undef, <2 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1F16.u = call <1 x half> @llvm.masked.gather.v1f16.v1p0f16(<1 x half*> undef, i32 1, <1 x i1> undef, <1 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64.u = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 4, <8 x i1> undef, <8 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64.u = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 4, <4 x i1> undef, <4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64.u = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 4, <2 x i1> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64.u = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 4, <2 x i1> undef, <2 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I64.u = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 4, <1 x i1> undef, <1 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32.u = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32.u = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32.u = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32.u = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I32.u = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32.u = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I32.u = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> undef, i32 1, <1 x i1> undef, <1 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16.u = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16.u = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16.u = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I16.u = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I16.u = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> undef, i32 1, <2 x i1> undef, <2 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I16.u = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4I16.u = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I16.u = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> undef, i32 1, <2 x i1> undef, <2 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1I16.u = call <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*> undef, i32 1, <1 x i1> undef, <1 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
 ;