@@ -2496,6 +2496,24 @@ void VPCSAHeaderPHIRecipe::execute(VPTransformState &State) {
2496
2496
State.set (this , DataPhi, Part);
2497
2497
}
2498
2498
2499
+ InstructionCost VPCSAHeaderPHIRecipe::computeCost (ElementCount VF,
2500
+ VPCostContext &Ctx) const {
2501
+ if (VF.isScalar ())
2502
+ return 0 ;
2503
+
2504
+ InstructionCost C = 0 ;
2505
+ auto *VTy = VectorType::get (getUnderlyingValue ()->getType (), VF);
2506
+ const TargetTransformInfo &TTI = Ctx.TTI ;
2507
+
2508
+ // FIXME: These costs should be moved into VPInstruction::computeCost. We put
2509
+ // them here for now since there is no VPInstruction::computeCost support.
2510
+ // CSAInitMask
2511
+ C += TTI.getShuffleCost (TargetTransformInfo::SK_Broadcast, VTy);
2512
+ // CSAInitData
2513
+ C += TTI.getShuffleCost (TargetTransformInfo::SK_Broadcast, VTy);
2514
+ return C;
2515
+ }
2516
+
2499
2517
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2500
2518
void VPCSADataUpdateRecipe::print (raw_ostream &O, const Twine &Indent,
2501
2519
VPSlotTracker &SlotTracker) const {
@@ -2524,6 +2542,34 @@ void VPCSADataUpdateRecipe::execute(VPTransformState &State) {
2524
2542
}
2525
2543
}
2526
2544
2545
+ InstructionCost VPCSADataUpdateRecipe::computeCost (ElementCount VF,
2546
+ VPCostContext &Ctx) const {
2547
+ if (VF.isScalar ())
2548
+ return 0 ;
2549
+
2550
+ InstructionCost C = 0 ;
2551
+ auto *VTy = VectorType::get (getUnderlyingValue ()->getType (), VF);
2552
+ auto *MaskTy = VectorType::get (IntegerType::getInt1Ty (VTy->getContext ()), VF);
2553
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2554
+ const TargetTransformInfo &TTI = Ctx.TTI ;
2555
+
2556
+ // Data Update
2557
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy, CostKind);
2558
+
2559
+ // FIXME: These costs should be moved into VPInstruction::computeCost. We put
2560
+ // them here for now since they are related to updating the data and there is
2561
+ // no VPInstruction::computeCost support at the moment. CSAInitMask AnyActive
2562
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy, CostKind);
2563
+ // vp.reduce.or
2564
+ C += TTI.getArithmeticReductionCost (Instruction::Or, VTy, std::nullopt,
2565
+ CostKind);
2566
+ // VPVLSel
2567
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy, CostKind);
2568
+ // MaskUpdate
2569
+ C += TTI.getArithmeticInstrCost (Instruction::Select, MaskTy, CostKind);
2570
+ return C;
2571
+ }
2572
+
2527
2573
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2528
2574
void VPCSAExtractScalarRecipe::print (raw_ostream &O, const Twine &Indent,
2529
2575
VPSlotTracker &SlotTracker) const {
@@ -2584,6 +2630,60 @@ void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
2584
2630
State.set (this , ChooseFromVecOrInit, 0 , /* IsScalar=*/ true );
2585
2631
}
2586
2632
2633
+ InstructionCost
2634
+ VPCSAExtractScalarRecipe::computeCost (ElementCount VF,
2635
+ VPCostContext &Ctx) const {
2636
+ if (VF.isScalar ())
2637
+ return 0 ;
2638
+
2639
+ InstructionCost C = 0 ;
2640
+ auto *VTy = VectorType::get (getUnderlyingValue ()->getType (), VF);
2641
+ auto *Int32VTy =
2642
+ VectorType::get (IntegerType::getInt32Ty (VTy->getContext ()), VF);
2643
+ auto *MaskTy = VectorType::get (IntegerType::getInt1Ty (VTy->getContext ()), VF);
2644
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2645
+ const TargetTransformInfo &TTI = Ctx.TTI ;
2646
+
2647
+ // StepVector
2648
+ ArrayRef<Value *> Args;
2649
+ IntrinsicCostAttributes CostAttrs (Intrinsic::stepvector, Int32VTy, Args);
2650
+ C += TTI.getIntrinsicInstrCost (CostAttrs, CostKind);
2651
+ // NegOneSplat
2652
+ C += TTI.getShuffleCost (TargetTransformInfo::SK_Broadcast, Int32VTy);
2653
+ // LastIdx
2654
+ if (usesEVL ()) {
2655
+ C += TTI.getMinMaxReductionCost (Intrinsic::smax, Int32VTy, FastMathFlags (),
2656
+ CostKind);
2657
+ } else {
2658
+ // ActiveLaneIdxs
2659
+ C += TTI.getArithmeticInstrCost (Instruction::Select,
2660
+ MaskTy->getScalarType (), CostKind);
2661
+ // MaybeLastIdx
2662
+ C += TTI.getMinMaxReductionCost (Intrinsic::smax, Int32VTy, FastMathFlags (),
2663
+ CostKind);
2664
+ // IsLaneZeroActive
2665
+ C += TTI.getArithmeticInstrCost (Instruction::ExtractElement, MaskTy,
2666
+ CostKind);
2667
+ // MaybeLastIdxEQZero
2668
+ C += TTI.getArithmeticInstrCost (Instruction::ICmp, MaskTy->getScalarType (),
2669
+ CostKind);
2670
+ // And
2671
+ C += TTI.getArithmeticInstrCost (Instruction::And, MaskTy->getScalarType (),
2672
+ CostKind);
2673
+ // LastIdx
2674
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy->getScalarType (),
2675
+ CostKind);
2676
+ }
2677
+ // ExtractFromVec
2678
+ C += TTI.getArithmeticInstrCost (Instruction::ExtractElement, VTy, CostKind);
2679
+ // LastIdxGeZero
2680
+ C += TTI.getArithmeticInstrCost (Instruction::ICmp, Int32VTy, CostKind);
2681
+ // ChooseFromVecOrInit
2682
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy->getScalarType (),
2683
+ CostKind);
2684
+ return C;
2685
+ }
2686
+
2587
2687
void VPBranchOnMaskRecipe::execute (VPTransformState &State) {
2588
2688
assert (State.Lane && " Branch on Mask works only on single instance." );
2589
2689
0 commit comments