Skip to content

Commit df93c8a

Browse files
committed
[X86] X86TTIImpl::getInterleavedMemoryOpCostAVX512(): fallback to scalarization cost computation for mask
I don't really buy that masked interleaved memory loads/stores are supported on X86. There is zero cost model test coverage, no actual cost modelling for the generation of the mask repetition, and basically only two LV tests. Additionally, I'm not very interested in AVX512. I don't know if this really helps the "soft" block over at https://reviews.llvm.org/D111460#inline-1075467, but I think it can't make things worse at least. When we are being told that there is a masking, instead of completely giving up and falling back to fully scalarizing `BasicTTIImplBase::getInterleavedMemoryOpCost()`, let's correctly query the cost of masked memory ops, keep all the pretty shuffle cost modelling, but scalarize the cost computation for the mask replication. I think not scalarizing the shuffles themselves may adjust the computed costs a bit, and maybe hopefully just enough to hide the "regressions" at https://reviews.llvm.org/D111460#inline-1075467. I do mean hide, because the test coverage is non-existent. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D112873
1 parent f3d1ddf commit df93c8a

File tree

2 files changed

+68
-17
lines changed

2 files changed

+68
-17
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 60 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5053,12 +5053,60 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
50535053
// Get the cost of one memory operation.
50545054
auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
50555055
LegalVT.getVectorNumElements());
5056-
InstructionCost MemOpCost = getMemoryOpCost(
5057-
Opcode, SingleMemOpTy, MaybeAlign(Alignment), AddressSpace, CostKind);
5056+
InstructionCost MemOpCost;
5057+
if (UseMaskForCond || UseMaskForGaps)
5058+
MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment,
5059+
AddressSpace, CostKind);
5060+
else
5061+
MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment),
5062+
AddressSpace, CostKind);
50585063

50595064
unsigned VF = VecTy->getNumElements() / Factor;
50605065
MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
50615066

5067+
// FIXME: this is the most conservative estimate for the mask cost.
5068+
InstructionCost MaskCost;
5069+
if (UseMaskForCond || UseMaskForGaps) {
5070+
APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements());
5071+
for (unsigned Index : Indices) {
5072+
assert(Index < Factor && "Invalid index for interleaved memory op");
5073+
for (unsigned Elm = 0; Elm < VF; Elm++)
5074+
DemandedLoadStoreElts.setBit(Index + Elm * Factor);
5075+
}
5076+
5077+
Type *I1Type = Type::getInt1Ty(VecTy->getContext());
5078+
auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements());
5079+
auto *MaskSubVT = FixedVectorType::get(I1Type, VF);
5080+
5081+
// The Mask shuffling cost is extract all the elements of the Mask
5082+
// and insert each of them Factor times into the wide vector:
5083+
//
5084+
// E.g. an interleaved group with factor 3:
5085+
// %mask = icmp ult <8 x i32> %vec1, %vec2
5086+
// %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
5087+
// <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
5088+
// The cost is estimated as extract all mask elements from the <8xi1> mask
5089+
// vector and insert them factor times into the <24xi1> shuffled mask
5090+
// vector.
5091+
MaskCost += getScalarizationOverhead(
5092+
MaskSubVT, APInt::getAllOnes(MaskSubVT->getNumElements()),
5093+
/*Insert*/ false, /*Extract*/ true);
5094+
MaskCost += getScalarizationOverhead(
5095+
MaskVT,
5096+
UseMaskForGaps ? DemandedLoadStoreElts
5097+
: APInt::getAllOnes(VecTy->getNumElements()),
5098+
/*Insert*/ true,
5099+
/*Extract*/ false);
5100+
5101+
// The Gaps mask is invariant and created outside the loop, therefore the
5102+
// cost of creating it is not accounted for here. However if we have both
5103+
// a MaskForGaps and some other mask that guards the execution of the
5104+
// memory access, we need to account for the cost of And-ing the two masks
5105+
// inside the loop.
5106+
if (UseMaskForGaps)
5107+
MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind);
5108+
}
5109+
50625110
if (Opcode == Instruction::Load) {
50635111
// The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
50645112
// contain the cost of the optimized shuffle sequence that the
@@ -5074,7 +5122,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
50745122

50755123
if (const auto *Entry =
50765124
CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
5077-
return NumOfMemOps * MemOpCost + Entry->Cost;
5125+
return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
50785126
//If an entry does not exist, fallback to the default implementation.
50795127

50805128
// Kind of shuffle depends on number of loaded values.
@@ -5111,7 +5159,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
51115159
NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
51125160

51135161
InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
5114-
NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
5162+
MaskCost + NumOfUnfoldedLoads * MemOpCost +
5163+
NumOfMoves;
51155164

51165165
return Cost;
51175166
}
@@ -5133,7 +5182,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
51335182

51345183
if (const auto *Entry =
51355184
CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
5136-
return NumOfMemOps * MemOpCost + Entry->Cost;
5185+
return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
51375186
//If an entry does not exist, fallback to the default implementation.
51385187

51395188
// There is no strided stores meanwhile. And store can't be folded in
@@ -5147,6 +5196,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
51475196
// We need additional instructions to keep sources.
51485197
unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
51495198
InstructionCost Cost =
5199+
MaskCost +
51505200
NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
51515201
NumOfMoves;
51525202
return Cost;
@@ -5157,10 +5207,6 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
51575207
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
51585208
bool UseMaskForCond, bool UseMaskForGaps) {
51595209
auto *VecTy = cast<FixedVectorType>(BaseTy);
5160-
if (UseMaskForCond || UseMaskForGaps)
5161-
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5162-
Alignment, AddressSpace, CostKind,
5163-
UseMaskForCond, UseMaskForGaps);
51645210

51655211
auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) {
51665212
Type *EltTy = cast<VectorType>(VecTy)->getElementType();
@@ -5177,6 +5223,11 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
51775223
Opcode, VecTy, Factor, Indices, Alignment,
51785224
AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
51795225

5226+
if (UseMaskForCond || UseMaskForGaps)
5227+
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5228+
Alignment, AddressSpace, CostKind,
5229+
UseMaskForCond, UseMaskForGaps);
5230+
51805231
// Get estimation for interleaved load/store operations for SSE-AVX2.
51815232
// As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow
51825233
// computing the cost using a generic formula as a function of generic

llvm/test/Analysis/CostModel/X86/interleaved-store-accesses-with-gaps.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,16 @@ target triple = "x86_64-unknown-linux-gnu"
4040
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, i16* %arrayidx7, align 2
4141
;
4242
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, i16* %arrayidx2, align 2
43-
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 9 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2
43+
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 15 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2
4444
;
4545
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, i16* %arrayidx2, align 2
46-
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2
46+
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 21 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2
4747
;
4848
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, i16* %arrayidx2, align 2
49-
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 40 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2
49+
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 33 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2
5050
;
5151
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 16 For instruction: store i16 %0, i16* %arrayidx2, align 2
52-
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 96 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2
52+
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 66 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2
5353

5454
define void @test1(i16* noalias nocapture %points, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y) {
5555
entry:
@@ -107,16 +107,16 @@ for.end:
107107
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, i16* %arrayidx7, align 2
108108
;
109109
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, i16* %arrayidx2, align 2
110-
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 16 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2
110+
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 15 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2
111111
;
112112
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, i16* %arrayidx2, align 2
113-
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 33 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2
113+
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 21 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2
114114
;
115115
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, i16* %arrayidx2, align 2
116-
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2
116+
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 33 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2
117117
;
118118
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 16 For instruction: store i16 %0, i16* %arrayidx2, align 2
119-
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 152 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2
119+
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 66 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2
120120

121121
define void @test2(i16* noalias nocapture %points, i32 %numPoints, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y) {
122122
entry:

0 commit comments

Comments
 (0)