@@ -2489,6 +2489,24 @@ void VPCSAHeaderPHIRecipe::execute(VPTransformState &State) {
2489
2489
State.set (this , DataPhi, Part);
2490
2490
}
2491
2491
2492
+ InstructionCost VPCSAHeaderPHIRecipe::computeCost (ElementCount VF,
2493
+ VPCostContext &Ctx) const {
2494
+ if (VF.isScalar ())
2495
+ return 0 ;
2496
+
2497
+ InstructionCost C = 0 ;
2498
+ auto *VTy = VectorType::get (getUnderlyingValue ()->getType (), VF);
2499
+ const TargetTransformInfo &TTI = Ctx.TTI ;
2500
+
2501
+ // FIXME: These costs should be moved into VPInstruction::computeCost. We put
2502
+ // them here for now since there is no VPInstruction::computeCost support.
2503
+ // CSAInitMask
2504
+ C += TTI.getShuffleCost (TargetTransformInfo::SK_Broadcast, VTy);
2505
+ // CSAInitData
2506
+ C += TTI.getShuffleCost (TargetTransformInfo::SK_Broadcast, VTy);
2507
+ return C;
2508
+ }
2509
+
2492
2510
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2493
2511
void VPCSADataUpdateRecipe::print (raw_ostream &O, const Twine &Indent,
2494
2512
VPSlotTracker &SlotTracker) const {
@@ -2517,6 +2535,34 @@ void VPCSADataUpdateRecipe::execute(VPTransformState &State) {
2517
2535
}
2518
2536
}
2519
2537
2538
+ InstructionCost VPCSADataUpdateRecipe::computeCost (ElementCount VF,
2539
+ VPCostContext &Ctx) const {
2540
+ if (VF.isScalar ())
2541
+ return 0 ;
2542
+
2543
+ InstructionCost C = 0 ;
2544
+ auto *VTy = VectorType::get (getUnderlyingValue ()->getType (), VF);
2545
+ auto *MaskTy = VectorType::get (IntegerType::getInt1Ty (VTy->getContext ()), VF);
2546
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2547
+ const TargetTransformInfo &TTI = Ctx.TTI ;
2548
+
2549
+ // Data Update
2550
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy, CostKind);
2551
+
2552
+ // FIXME: These costs should be moved into VPInstruction::computeCost. We put
2553
+ // them here for now since they are related to updating the data and there is
2554
+ // no VPInstruction::computeCost support at the moment. CSAInitMask AnyActive
2555
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy, CostKind);
2556
+ // vp.reduce.or
2557
+ C += TTI.getArithmeticReductionCost (Instruction::Or, VTy, std::nullopt,
2558
+ CostKind);
2559
+ // VPVLSel
2560
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy, CostKind);
2561
+ // MaskUpdate
2562
+ C += TTI.getArithmeticInstrCost (Instruction::Select, MaskTy, CostKind);
2563
+ return C;
2564
+ }
2565
+
2520
2566
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2521
2567
void VPCSAExtractScalarRecipe::print (raw_ostream &O, const Twine &Indent,
2522
2568
VPSlotTracker &SlotTracker) const {
@@ -2577,6 +2623,60 @@ void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
2577
2623
State.set (this , ChooseFromVecOrInit, 0 , /* IsScalar=*/ true );
2578
2624
}
2579
2625
2626
+ InstructionCost
2627
+ VPCSAExtractScalarRecipe::computeCost (ElementCount VF,
2628
+ VPCostContext &Ctx) const {
2629
+ if (VF.isScalar ())
2630
+ return 0 ;
2631
+
2632
+ InstructionCost C = 0 ;
2633
+ auto *VTy = VectorType::get (getUnderlyingValue ()->getType (), VF);
2634
+ auto *Int32VTy =
2635
+ VectorType::get (IntegerType::getInt32Ty (VTy->getContext ()), VF);
2636
+ auto *MaskTy = VectorType::get (IntegerType::getInt1Ty (VTy->getContext ()), VF);
2637
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2638
+ const TargetTransformInfo &TTI = Ctx.TTI ;
2639
+
2640
+ // StepVector
2641
+ ArrayRef<Value *> Args;
2642
+ IntrinsicCostAttributes CostAttrs (Intrinsic::stepvector, Int32VTy, Args);
2643
+ C += TTI.getIntrinsicInstrCost (CostAttrs, CostKind);
2644
+ // NegOneSplat
2645
+ C += TTI.getShuffleCost (TargetTransformInfo::SK_Broadcast, Int32VTy);
2646
+ // LastIdx
2647
+ if (usesEVL ()) {
2648
+ C += TTI.getMinMaxReductionCost (Intrinsic::smax, Int32VTy, FastMathFlags (),
2649
+ CostKind);
2650
+ } else {
2651
+ // ActiveLaneIdxs
2652
+ C += TTI.getArithmeticInstrCost (Instruction::Select,
2653
+ MaskTy->getScalarType (), CostKind);
2654
+ // MaybeLastIdx
2655
+ C += TTI.getMinMaxReductionCost (Intrinsic::smax, Int32VTy, FastMathFlags (),
2656
+ CostKind);
2657
+ // IsLaneZeroActive
2658
+ C += TTI.getArithmeticInstrCost (Instruction::ExtractElement, MaskTy,
2659
+ CostKind);
2660
+ // MaybeLastIdxEQZero
2661
+ C += TTI.getArithmeticInstrCost (Instruction::ICmp, MaskTy->getScalarType (),
2662
+ CostKind);
2663
+ // And
2664
+ C += TTI.getArithmeticInstrCost (Instruction::And, MaskTy->getScalarType (),
2665
+ CostKind);
2666
+ // LastIdx
2667
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy->getScalarType (),
2668
+ CostKind);
2669
+ }
2670
+ // ExtractFromVec
2671
+ C += TTI.getArithmeticInstrCost (Instruction::ExtractElement, VTy, CostKind);
2672
+ // LastIdxGeZero
2673
+ C += TTI.getArithmeticInstrCost (Instruction::ICmp, Int32VTy, CostKind);
2674
+ // ChooseFromVecOrInit
2675
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy->getScalarType (),
2676
+ CostKind);
2677
+ return C;
2678
+ }
2679
+
2580
2680
void VPBranchOnMaskRecipe::execute (VPTransformState &State) {
2581
2681
assert (State.Lane && " Branch on Mask works only on single instance." );
2582
2682
0 commit comments