@@ -1388,3 +1388,86 @@ bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
1388
1388
1389
1389
return false ;
1390
1390
}
1391
+
1392
+ bool PPCTTIImpl::hasActiveVectorLength (unsigned Opcode, Type *DataType,
1393
+ Align Alignment) const {
1394
+ // Only load and stores instructions can have variable vector length on Power.
1395
+ if (Opcode != Instruction::Load && Opcode != Instruction::Store)
1396
+ return false ;
1397
+ // Loads/stores with length instructions use bits 0-7 of the GPR operand and
1398
+ // therefore cannot be used in 32-bit mode.
1399
+ if ((!ST->hasP9Vector () && !ST->hasP10Vector ()) || !ST->isPPC64 ())
1400
+ return false ;
1401
+ if (auto *VecTy = dyn_cast<FixedVectorType>(DataType)) {
1402
+ unsigned VecWidth = DataType->getPrimitiveSizeInBits ();
1403
+ return VecWidth == 128 ;
1404
+ }
1405
+ Type *ScalarTy = DataType->getScalarType ();
1406
+
1407
+ if (ScalarTy->isPointerTy ())
1408
+ return true ;
1409
+
1410
+ if (ScalarTy->isFloatTy () || ScalarTy->isDoubleTy ())
1411
+ return true ;
1412
+
1413
+ if (!ScalarTy->isIntegerTy ())
1414
+ return false ;
1415
+
1416
+ unsigned IntWidth = ScalarTy->getIntegerBitWidth ();
1417
+ return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64 ;
1418
+ }
1419
+
1420
+ InstructionCost PPCTTIImpl::getVPMemoryOpCost (unsigned Opcode, Type *Src,
1421
+ Align Alignment,
1422
+ unsigned AddressSpace,
1423
+ TTI::TargetCostKind CostKind,
1424
+ const Instruction *I) {
1425
+ InstructionCost Cost = BaseT::getVPMemoryOpCost (Opcode, Src, Alignment,
1426
+ AddressSpace, CostKind, I);
1427
+ if (TLI->getValueType (DL, Src, true ) == MVT::Other)
1428
+ return Cost;
1429
+ // TODO: Handle other cost kinds.
1430
+ if (CostKind != TTI::TCK_RecipThroughput)
1431
+ return Cost;
1432
+
1433
+ assert ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1434
+ " Invalid Opcode" );
1435
+
1436
+ auto *SrcVTy = dyn_cast<FixedVectorType>(Src);
1437
+ assert (SrcVTy && " Expected a vector type for VP memory operations" );
1438
+
1439
+ if (hasActiveVectorLength (Opcode, Src, Alignment)) {
1440
+ std::pair<InstructionCost, MVT> LT =
1441
+ TLI->getTypeLegalizationCost (DL, SrcVTy);
1442
+
1443
+ InstructionCost CostFactor =
1444
+ vectorCostAdjustmentFactor (Opcode, Src, nullptr );
1445
+ if (!CostFactor.isValid ())
1446
+ return InstructionCost::getMax ();
1447
+
1448
+ InstructionCost Cost = LT.first * CostFactor;
1449
+ assert (Cost.isValid () && " Expected valid cost" );
1450
+
1451
+ // On P9 but not on P10, if the op is misaligned then it will cause a
1452
+ // pipeline flush. Otherwise the VSX masked memops cost the same as unmasked
1453
+ // ones.
1454
+ const Align DesiredAlignment (16 );
1455
+ if (Alignment >= DesiredAlignment || ST->getCPUDirective () != PPC::DIR_PWR9)
1456
+ return Cost;
1457
+
1458
+ // Since alignment may be under estimated, we try to compute the probability
1459
+ // that the actual address is aligned to the desired boundary. For example
1460
+ // an 8-byte aligned load is assumed to be actually 16-byte aligned half the
1461
+ // time, while a 4-byte aligned load has a 25% chance of being 16-byte
1462
+ // aligned.
1463
+ float AlignmentProb = ((float )Alignment.value ()) / DesiredAlignment.value ();
1464
+ float MisalignmentProb = 1.0 - AlignmentProb;
1465
+ return (MisalignmentProb * P9PipelineFlushEstimate) +
1466
+ (AlignmentProb * *Cost.getValue ());
1467
+ }
1468
+
1469
+ // Usually we should not get to this point, but the following is an attempt to
1470
+ // model the cost of legalization. Currently we can only lower intrinsics with
1471
+ // evl but no mask, on Power 9/10. Otherwise, we must scalarize.
1472
+ return getMaskedMemoryOpCost (Opcode, Src, Alignment, AddressSpace, CostKind);
1473
+ }
0 commit comments