@@ -2491,6 +2491,24 @@ void VPCSAHeaderPHIRecipe::execute(VPTransformState &State) {
2491
2491
State.set (this , DataPhi, Part);
2492
2492
}
2493
2493
2494
+ InstructionCost VPCSAHeaderPHIRecipe::computeCost (ElementCount VF,
2495
+ VPCostContext &Ctx) const {
2496
+ if (VF.isScalar ())
2497
+ return 0 ;
2498
+
2499
+ InstructionCost C = 0 ;
2500
+ auto *VTy = VectorType::get (getUnderlyingValue ()->getType (), VF);
2501
+ const TargetTransformInfo &TTI = Ctx.TTI ;
2502
+
2503
+ // FIXME: These costs should be moved into VPInstruction::computeCost. We put
2504
+ // them here for now since there is no VPInstruction::computeCost support.
2505
+ // CSAInitMask
2506
+ C += TTI.getShuffleCost (TargetTransformInfo::SK_Broadcast, VTy);
2507
+ // CSAInitData
2508
+ C += TTI.getShuffleCost (TargetTransformInfo::SK_Broadcast, VTy);
2509
+ return C;
2510
+ }
2511
+
2494
2512
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2495
2513
void VPCSADataUpdateRecipe::print (raw_ostream &O, const Twine &Indent,
2496
2514
VPSlotTracker &SlotTracker) const {
@@ -2519,6 +2537,34 @@ void VPCSADataUpdateRecipe::execute(VPTransformState &State) {
2519
2537
}
2520
2538
}
2521
2539
2540
+ InstructionCost VPCSADataUpdateRecipe::computeCost (ElementCount VF,
2541
+ VPCostContext &Ctx) const {
2542
+ if (VF.isScalar ())
2543
+ return 0 ;
2544
+
2545
+ InstructionCost C = 0 ;
2546
+ auto *VTy = VectorType::get (getUnderlyingValue ()->getType (), VF);
2547
+ auto *MaskTy = VectorType::get (IntegerType::getInt1Ty (VTy->getContext ()), VF);
2548
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2549
+ const TargetTransformInfo &TTI = Ctx.TTI ;
2550
+
2551
+ // Data Update
2552
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy, CostKind);
2553
+
2554
+ // FIXME: These costs should be moved into VPInstruction::computeCost. We put
2555
+ // them here for now since they are related to updating the data and there is
2556
+ // no VPInstruction::computeCost support at the moment. CSAInitMask AnyActive
2557
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy, CostKind);
2558
+ // vp.reduce.or
2559
+ C += TTI.getArithmeticReductionCost (Instruction::Or, VTy, std::nullopt,
2560
+ CostKind);
2561
+ // VPVLSel
2562
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy, CostKind);
2563
+ // MaskUpdate
2564
+ C += TTI.getArithmeticInstrCost (Instruction::Select, MaskTy, CostKind);
2565
+ return C;
2566
+ }
2567
+
2522
2568
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2523
2569
void VPCSAExtractScalarRecipe::print (raw_ostream &O, const Twine &Indent,
2524
2570
VPSlotTracker &SlotTracker) const {
@@ -2579,6 +2625,60 @@ void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
2579
2625
State.set (this , ChooseFromVecOrInit, 0 , /* IsScalar=*/ true );
2580
2626
}
2581
2627
2628
+ InstructionCost
2629
+ VPCSAExtractScalarRecipe::computeCost (ElementCount VF,
2630
+ VPCostContext &Ctx) const {
2631
+ if (VF.isScalar ())
2632
+ return 0 ;
2633
+
2634
+ InstructionCost C = 0 ;
2635
+ auto *VTy = VectorType::get (getUnderlyingValue ()->getType (), VF);
2636
+ auto *Int32VTy =
2637
+ VectorType::get (IntegerType::getInt32Ty (VTy->getContext ()), VF);
2638
+ auto *MaskTy = VectorType::get (IntegerType::getInt1Ty (VTy->getContext ()), VF);
2639
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2640
+ const TargetTransformInfo &TTI = Ctx.TTI ;
2641
+
2642
+ // StepVector
2643
+ ArrayRef<Value *> Args;
2644
+ IntrinsicCostAttributes CostAttrs (Intrinsic::stepvector, Int32VTy, Args);
2645
+ C += TTI.getIntrinsicInstrCost (CostAttrs, CostKind);
2646
+ // NegOneSplat
2647
+ C += TTI.getShuffleCost (TargetTransformInfo::SK_Broadcast, Int32VTy);
2648
+ // LastIdx
2649
+ if (usesEVL ()) {
2650
+ C += TTI.getMinMaxReductionCost (Intrinsic::smax, Int32VTy, FastMathFlags (),
2651
+ CostKind);
2652
+ } else {
2653
+ // ActiveLaneIdxs
2654
+ C += TTI.getArithmeticInstrCost (Instruction::Select,
2655
+ MaskTy->getScalarType (), CostKind);
2656
+ // MaybeLastIdx
2657
+ C += TTI.getMinMaxReductionCost (Intrinsic::smax, Int32VTy, FastMathFlags (),
2658
+ CostKind);
2659
+ // IsLaneZeroActive
2660
+ C += TTI.getArithmeticInstrCost (Instruction::ExtractElement, MaskTy,
2661
+ CostKind);
2662
+ // MaybeLastIdxEQZero
2663
+ C += TTI.getArithmeticInstrCost (Instruction::ICmp, MaskTy->getScalarType (),
2664
+ CostKind);
2665
+ // And
2666
+ C += TTI.getArithmeticInstrCost (Instruction::And, MaskTy->getScalarType (),
2667
+ CostKind);
2668
+ // LastIdx
2669
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy->getScalarType (),
2670
+ CostKind);
2671
+ }
2672
+ // ExtractFromVec
2673
+ C += TTI.getArithmeticInstrCost (Instruction::ExtractElement, VTy, CostKind);
2674
+ // LastIdxGeZero
2675
+ C += TTI.getArithmeticInstrCost (Instruction::ICmp, Int32VTy, CostKind);
2676
+ // ChooseFromVecOrInit
2677
+ C += TTI.getArithmeticInstrCost (Instruction::Select, VTy->getScalarType (),
2678
+ CostKind);
2679
+ return C;
2680
+ }
2681
+
2582
2682
void VPBranchOnMaskRecipe::execute (VPTransformState &State) {
2583
2683
assert (State.Lane && " Branch on Mask works only on single instance." );
2584
2684
0 commit comments