Skip to content

Commit ccfe0de

Browse files
authored
[LV]: Teach LV to recursively (de)interleave. (#89018)
The only intrinsics currently available are ld2/st2, which do not support an interleaving factor greater than 2. This patch teaches the LV to use ld2/st2 recursively in order to support higher (power-of-2) interleaving factors.
1 parent aa2fdc6 commit ccfe0de

File tree

6 files changed

+1387
-671
lines changed

6 files changed

+1387
-671
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3576,10 +3576,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
35763576
if (hasIrregularType(ScalarTy, DL))
35773577
return false;
35783578

3579-
// We currently only know how to emit interleave/deinterleave with
3580-
// Factor=2 for scalable vectors. This is purely an implementation
3581-
// limit.
3582-
if (VF.isScalable() && InterleaveFactor != 2)
3579+
// For scalable vectors, the interleave factor must currently be a
3580+
// power of 2, since we require the (de)interleave2 intrinsics
3581+
// instead of shufflevectors.
3582+
if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
35833583
return false;
35843584

35853585
// If the group involves a non-integral pointer, we may not be able to
@@ -9364,9 +9364,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
93649364
CM.getWideningDecision(IG->getInsertPos(), VF) ==
93659365
LoopVectorizationCostModel::CM_Interleave);
93669366
// For scalable vectors, the only interleave factor currently supported
9367-
// is 2 since we require the (de)interleave2 intrinsics instead of
9368-
// shufflevectors.
9369-
assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
9367+
// must be a power of 2 since we require the (de)interleave2 intrinsics
9368+
// instead of shufflevectors.
9369+
assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
93709370
"Unsupported interleave factor for scalable vectors");
93719371
return Result;
93729372
};

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 56 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2849,10 +2849,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
28492849
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
28502850
// must use intrinsics to interleave.
28512851
if (VecTy->isScalableTy()) {
2852-
VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
2853-
return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
2854-
Vals,
2855-
/*FMFSource=*/nullptr, Name);
2852+
assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for "
2853+
"scalable vectors, must be power of 2");
2854+
SmallVector<Value *> InterleavingValues(Vals);
2855+
// When interleaving, the number of values will be shrunk until we have the
2856+
// single final interleaved value.
2857+
auto *InterleaveTy = cast<VectorType>(InterleavingValues[0]->getType());
2858+
for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
2859+
InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
2860+
for (unsigned I = 0; I < Midpoint; ++I)
2861+
InterleavingValues[I] = Builder.CreateIntrinsic(
2862+
InterleaveTy, Intrinsic::vector_interleave2,
2863+
{InterleavingValues[I], InterleavingValues[Midpoint + I]},
2864+
/*FMFSource=*/nullptr, Name);
2865+
}
2866+
return InterleavingValues[0];
28562867
}
28572868

28582869
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2938,15 +2949,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
29382949
&InterleaveFactor](Value *MaskForGaps) -> Value * {
29392950
if (State.VF.isScalable()) {
29402951
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2941-
assert(InterleaveFactor == 2 &&
2952+
assert(isPowerOf2_32(InterleaveFactor) &&
29422953
"Unsupported deinterleave factor for scalable vectors");
29432954
auto *ResBlockInMask = State.get(BlockInMask);
2944-
SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
2945-
auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
2946-
State.VF.getKnownMinValue() * 2, true);
2947-
return State.Builder.CreateIntrinsic(
2948-
MaskTy, Intrinsic::vector_interleave2, Ops,
2949-
/*FMFSource=*/nullptr, "interleaved.mask");
2955+
SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
2956+
return interleaveVectors(State.Builder, Ops, "interleaved.mask");
29502957
}
29512958

29522959
if (!BlockInMask)
@@ -2986,22 +2993,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
29862993
ArrayRef<VPValue *> VPDefs = definedValues();
29872994
const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
29882995
if (VecTy->isScalableTy()) {
2989-
assert(InterleaveFactor == 2 &&
2996+
assert(isPowerOf2_32(InterleaveFactor) &&
29902997
"Unsupported deinterleave factor for scalable vectors");
29912998

2992-
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
2993-
// so must use intrinsics to deinterleave.
2994-
Value *DI = State.Builder.CreateIntrinsic(
2995-
Intrinsic::vector_deinterleave2, VecTy, NewLoad,
2996-
/*FMFSource=*/nullptr, "strided.vec");
2997-
unsigned J = 0;
2998-
for (unsigned I = 0; I < InterleaveFactor; ++I) {
2999-
Instruction *Member = Group->getMember(I);
2999+
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
3000+
// so must use intrinsics to deinterleave.
3001+
SmallVector<Value *> DeinterleavedValues(InterleaveFactor);
3002+
DeinterleavedValues[0] = NewLoad;
3003+
// For InterleaveFactor > 2 we have to deinterleave recursively,
3004+
// because the currently available deinterleave intrinsic only
3005+
// supports a factor of 2; for InterleaveFactor == 2 the loop exits
3006+
// after its first iteration.
3007+
// When deinterleaving, the number of values will double until we
3008+
// have "InterleaveFactor".
3009+
for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
3010+
NumVectors *= 2) {
3011+
// Deinterleave the elements within the vector
3012+
SmallVector<Value *> TempDeinterleavedValues(NumVectors);
3013+
for (unsigned I = 0; I < NumVectors; ++I) {
3014+
auto *DiTy = DeinterleavedValues[I]->getType();
3015+
TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
3016+
Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
3017+
/*FMFSource=*/nullptr, "strided.vec");
3018+
}
3019+
// Extract the deinterleaved values:
3020+
for (unsigned I = 0; I < 2; ++I)
3021+
for (unsigned J = 0; J < NumVectors; ++J)
3022+
DeinterleavedValues[NumVectors * I + J] =
3023+
State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
3024+
}
30003025

3001-
if (!Member)
3026+
#ifndef NDEBUG
3027+
for (Value *Val : DeinterleavedValues)
3028+
assert(Val && "NULL Deinterleaved Value");
3029+
#endif
3030+
for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
3031+
Instruction *Member = Group->getMember(I);
3032+
Value *StridedVec = DeinterleavedValues[I];
3033+
if (!Member) {
3034+
// No member occupies this index, so the extracted value is unused; erase it.
3035+
static_cast<Instruction *>(StridedVec)->eraseFromParent();
30023036
continue;
3003-
3004-
Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
3037+
}
30053038
// If this member has different type, cast the result type.
30063039
if (Member->getType() != ScalarTy) {
30073040
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);

0 commit comments

Comments
 (0)