@@ -5491,12 +5491,16 @@ static bool isMaskedLoadCompress(
   const unsigned Sz = VL.size();
   auto *VecTy = getWidenedType(ScalarTy, Sz);
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  SmallVector<int> Mask;
+  if (!Order.empty())
+    inversePermutation(Order, Mask);
   // Check external uses.
   for (const auto [I, V] : enumerate(VL)) {
     if (AreAllUsersVectorized(V))
       continue;
     InstructionCost ExtractCost =
-        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
+        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
+                               Mask.empty() ? I : Mask[I]);
     InstructionCost ScalarCost =
         TTI.getInstructionCost(cast<Instruction>(V), CostKind);
     if (ExtractCost <= ScalarCost)
@@ -5536,8 +5540,11 @@ static bool isMaskedLoadCompress(
   bool IsStrided =
       buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
   assert(CompressMask.size() >= 2 && "At least two elements are required");
+  SmallVector<Value *> OrderedPointerOps(PointerOps);
+  if (!Order.empty())
+    reorderScalars(OrderedPointerOps, Mask);
   auto [ScalarGEPCost, VectorGEPCost] =
-      getGEPCosts(TTI, PointerOps, PointerOps.front(),
+      getGEPCosts(TTI, OrderedPointerOps, OrderedPointerOps.front(),
                   Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
   // The cost of scalar loads.
   InstructionCost ScalarLoadsCost =
@@ -5564,17 +5571,16 @@ static bool isMaskedLoadCompress(
         TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
                             LI->getPointerAddressSpace(), CostKind);
   }
-  SmallVector<int> Mask;
-  if (!Order.empty())
-    inversePermutation(Order, Mask);
   if (IsStrided) {
     // Check for potential segmented(interleaved) loads.
     if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1],
                                          CommonAlignment,
                                          LI->getPointerAddressSpace())) {
-      InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
-          Instruction::Load, LoadVecTy, CompressMask[1], std::nullopt,
-          CommonAlignment, LI->getPointerAddressSpace(), CostKind, IsMasked);
+      InstructionCost InterleavedCost =
+          VectorGEPCost + TTI.getInterleavedMemoryOpCost(
+                              Instruction::Load, LoadVecTy, CompressMask[1],
+                              std::nullopt, CommonAlignment,
+                              LI->getPointerAddressSpace(), CostKind, IsMasked);
       if (!Mask.empty())
         InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                                             VecTy, Mask, CostKind);
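The diff hinges on the inverse-permutation mask: `Order[Lane]` names which original scalar ends up in vector lane `Lane`, so the inverse mask answers "which lane holds original scalar `I`?" That is the index the patch now passes to `getVectorInstrCost` for the extract cost, and the order in which `PointerOps` is rearranged before GEP costing. The sketch below is a minimal, self-contained illustration of that assumed semantics; `inversePermutationSketch` and `reorderScalarsSketch` are hypothetical stand-ins, not the LLVM helpers themselves.

```cpp
// Sketch only: mimics the assumed semantics of inversePermutation()/
// reorderScalars() to show why the extract cost is indexed with Mask[I]
// and why PointerOps is reordered to match vector lane order.
#include <cassert>
#include <cstdio>
#include <vector>

// Inverse of a permutation: Mask[Order[Lane]] = Lane, i.e. original
// scalar I lives in vector lane Mask[I].
static std::vector<int>
inversePermutationSketch(const std::vector<unsigned> &Order) {
  std::vector<int> Mask(Order.size(), -1);
  for (unsigned Lane = 0; Lane < Order.size(); ++Lane)
    Mask[Order[Lane]] = static_cast<int>(Lane);
  return Mask;
}

// Move Scalars[I] to position Mask[I], producing lane order.
template <typename T>
static std::vector<T> reorderScalarsSketch(const std::vector<T> &Scalars,
                                           const std::vector<int> &Mask) {
  std::vector<T> Result(Scalars.size());
  for (unsigned I = 0; I < Scalars.size(); ++I)
    Result[Mask[I]] = Scalars[I];
  return Result;
}

int main() {
  // Vector lanes are built as {VL[2], VL[0], VL[3], VL[1]}.
  std::vector<unsigned> Order = {2, 0, 3, 1};
  std::vector<int> Mask = inversePermutationSketch(Order);
  assert(Mask[Order[0]] == 0 && "lane 0 holds original scalar Order[0]");
  for (unsigned I = 0; I < Mask.size(); ++I)
    std::printf("original scalar %u -> vector lane %d\n", I, Mask[I]);

  // Reordering the pointer operands the same way lines them up with the
  // lanes of the vectorized load: {p2, p0, p3, p1}.
  std::vector<const char *> PointerOps = {"p0", "p1", "p2", "p3"};
  for (const char *P : reorderScalarsSketch(PointerOps, Mask))
    std::printf("%s ", P);
  std::printf("\n");
  return 0;
}
```

Under that reading, extracting original scalar `VL[I]` from the reordered vector means extracting lane `Mask[I]`, which is exactly the index the patched `getVectorInstrCost` call charges for.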