@@ -5053,12 +5053,60 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
  // Get the cost of one memory operation.
  auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
                                             LegalVT.getVectorNumElements());
-  InstructionCost MemOpCost = getMemoryOpCost(
-      Opcode, SingleMemOpTy, MaybeAlign(Alignment), AddressSpace, CostKind);
+  InstructionCost MemOpCost;
+  if (UseMaskForCond || UseMaskForGaps)
+    MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
+                                      AddressSpace, CostKind);
+  else
+    MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
+                                AddressSpace, CostKind);

  unsigned VF = VecTy->getNumElements() / Factor;
  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);

+  // FIXME: this is the most conservative estimate for the mask cost.
+  InstructionCost MaskCost;
+  if (UseMaskForCond || UseMaskForGaps) {
+    APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
+    for (unsigned Index : Indices) {
+      assert(Index < Factor && "Invalid index for interleaved memory op");
+      for (unsigned Elm = 0; Elm < VF; Elm++)
+        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
+    }
+
+    Type *I1Type = Type::getInt1Ty(VecTy->getContext());
+    auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
+    auto *MaskSubVT = FixedVectorType::get(I1Type, VF);
+
+    // The mask shuffling cost is to extract all the elements of the mask
+    // and insert each of them Factor times into the wide vector:
+    //
+    // E.g. an interleaved group with factor 3:
+    //    %mask = icmp ult <8 x i32> %vec1, %vec2
+    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
+    //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
+    // The cost is estimated as extracting all mask elements from the <8xi1>
+    // mask vector and inserting them Factor times into the <24xi1> shuffled
+    // mask vector.
+    MaskCost += getScalarizationOverhead(
+        MaskSubVT, APInt::getAllOnes(MaskSubVT->getNumElements()),
+        /*Insert*/ false, /*Extract*/ true);
+    MaskCost += getScalarizationOverhead(
+        MaskVT,
+        UseMaskForGaps ? DemandedLoadStoreElts
+                       : APInt::getAllOnes(VecTy->getNumElements()),
+        /*Insert*/ true,
+        /*Extract*/ false);
+
+    // The Gaps mask is invariant and created outside the loop, so the cost
+    // of creating it is not accounted for here. However, if we have both a
+    // MaskForGaps and some other mask that guards the execution of the
+    // memory access, we need to account for the cost of And-ing the two
+    // masks inside the loop.
+    if (UseMaskForGaps)
+      MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
+  }
+
  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
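For intuition, the demanded-elements loop above can be exercised in isolation. The sketch below is illustrative only and not part of the patch: it picks hypothetical values (Factor = 3, VF = 4, Indices = {0, 2}, i.e. a group with a gap at member 1) and uses std::vector<bool> in place of llvm::APInt so it compiles standalone.

    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main() {
      const unsigned Factor = 3, VF = 4;            // hypothetical group shape
      const std::vector<unsigned> Indices = {0, 2}; // accessed members; 1 is a gap
      // Mirrors the DemandedLoadStoreElts computation in the diff:
      // member Index of group Elm lives at position Index + Elm * Factor.
      std::vector<bool> Demanded(Factor * VF, false);
      for (unsigned Index : Indices) {
        assert(Index < Factor && "Invalid index for interleaved memory op");
        for (unsigned Elm = 0; Elm < VF; Elm++)
          Demanded[Index + Elm * Factor] = true;
      }
      for (bool B : Demanded)
        std::printf("%d", B ? 1 : 0); // prints 101101101101
      std::printf("\n");
      return 0;
    }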
@@ -5074,7 +5122,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(

    if (const auto *Entry =
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
-      return NumOfMemOps * MemOpCost + Entry->Cost;
+      return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
    // If an entry does not exist, fallback to the default implementation.

    // Kind of shuffle depends on number of loaded values.
@@ -5111,7 +5159,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
-                          NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+                          MaskCost + NumOfUnfoldedLoads * MemOpCost +
+                          NumOfMoves;

    return Cost;
  }
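To make the load-path formula concrete, take hypothetical values (illustrative only, not drawn from the cost tables): NumOfResults = 2, NumOfShufflesPerResult = 3, ShuffleCost = 1, NumOfUnfoldedLoads = 6, MemOpCost = 1, and hence NumOfMoves = 2 * 3 / 2 = 3. An unmasked group (MaskCost = 0) then costs 2 * 3 * 1 + 0 + 6 * 1 + 3 = 15, and a masked group pays its MaskCost exactly once on top of that.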
@@ -5133,7 +5182,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(

  if (const auto *Entry =
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
-    return NumOfMemOps * MemOpCost + Entry->Cost;
+    return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
  // If an entry does not exist, fallback to the default implementation.

  // There are no strided stores meanwhile. And store can't be folded in
@@ -5147,6 +5196,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  InstructionCost Cost =
+      MaskCost +
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
  return Cost;
@@ -5157,10 +5207,6 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  auto *VecTy = cast<FixedVectorType>(BaseTy);
-  if (UseMaskForCond || UseMaskForGaps)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, CostKind,
-                                             UseMaskForCond, UseMaskForGaps);

  auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
    Type *EltTy = cast<VectorType>(VecTy)->getElementType();
@@ -5177,6 +5223,11 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
        Opcode, VecTy, Factor, Indices, Alignment,
        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);

+  if (UseMaskForCond || UseMaskForGaps)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace, CostKind,
+                                             UseMaskForCond, UseMaskForGaps);
+
  // Get estimation for interleaved load/store operations for SSE-AVX2.
  // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
  // computing the cost using a generic formula as a function of generic
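The two hunks above are the key reordering: the unconditional fallback to BaseT::getInterleavedMemoryOpCost for masked groups is deleted from the top of getInterleavedMemoryOpCost and re-inserted only after the AVX-512 dispatch. Masked interleaved accesses therefore now reach getInterleavedMemoryOpCostAVX512 and get the precise costing added above, while the pre-AVX-512 (SSE-AVX2) paths still fall back to the conservative base implementation.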