[LV]: Teach LV to recursively (de)interleave. #89018
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-transforms Author: Hassnaa Hamdi (hassnaaHamdi) Changes
Patch is 32.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89018.diff 10 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e0ade02959025f..e233d430e98dd5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -59,6 +59,8 @@
#include <string>
#include <utility>
#include <vector>
+#include <stack>
+#include <queue>
namespace llvm {
@@ -3145,6 +3147,7 @@ class TargetLoweringBase {
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const {
return false;
}
@@ -3156,6 +3159,7 @@ class TargetLoweringBase {
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 438ac1c3cc6e2c..73c3a63b61da3b 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -71,6 +71,7 @@
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>
+#include <queue>
using namespace llvm;
@@ -510,12 +511,52 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+ std::stack<IntrinsicInst*> DeinterleaveTreeQueue;
+ std::queue<std::pair<unsigned, Value*>> LeafNodes;
+ std::map<IntrinsicInst*, bool>mp;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ DeinterleaveTreeQueue.push(DI);
+ unsigned DILeafCount = 0;
+ while(!DeinterleaveTreeQueue.empty()) {
+ auto CurrentDI = DeinterleaveTreeQueue.top();
+ DeinterleaveTreeQueue.pop();
+ TempDeadInsts.push_back(CurrentDI);
+ bool RootFound = false;
+ for (auto UserExtract : CurrentDI->users()) { // iterate over extract users of deinterleave
+ Instruction *Extract = dyn_cast<Instruction>(UserExtract);
+ if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
+ continue;
+ bool IsLeaf = true;
+ for (auto UserDI : UserExtract->users()) { // iterate over deinterleave users of extract
+ IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
+ if (!Child_DI ||
+ Child_DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+ continue;
+ IsLeaf = false;
+ if (mp.count(Child_DI) == 0) {
+ DeinterleaveTreeQueue.push(Child_DI);
+ }
+ continue;
+ }
+ if (IsLeaf) {
+ RootFound = true;
+ LeafNodes.push(std::make_pair(DILeafCount, UserExtract));
+ TempDeadInsts.push_back(Extract);
+ }
+ else {
+ TempDeadInsts.push_back(Extract);
+ }
+ }
+ if (RootFound)
+ DILeafCount += CurrentDI->getNumUses();
+ }
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
return false;
// We now have a target-specific load, so delete the old one.
- DeadInsts.push_back(DI);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(), TempDeadInsts.rend());
DeadInsts.push_back(LI);
return true;
}
@@ -531,14 +572,33 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
-
+ std::queue<IntrinsicInst*> IeinterleaveTreeQueue;
+ std::queue<Value*> LeafNodes;
+ SmallVector<Instruction *> TempDeadInsts;
+
+ IeinterleaveTreeQueue.push(II);
+ while(!IeinterleaveTreeQueue.empty()) {
+ auto node = IeinterleaveTreeQueue.front();
+ TempDeadInsts.push_back(node);
+ IeinterleaveTreeQueue.pop();
+ for(unsigned i = 0; i < 2; i++) {
+ auto op = node->getOperand(i);
+ if(auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
+ if (CurrentII->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+ continue;
+ IeinterleaveTreeQueue.push(CurrentII);
+ continue;
+ }
+ LeafNodes.push(op);
+ }
+ }
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
- DeadInsts.push_back(II);
+ DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
return true;
}
@@ -559,7 +619,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ else if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7947d73f9a4dd0..ab8c01e2df5a9a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16345,15 +16345,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, LoadInst *LI) const {
+ IntrinsicInst *DI, std::queue<std::pair<unsigned, llvm::Value*>>& LeafNodes, LoadInst *LI) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ const unsigned Factor = std::max(2, (int)LeafNodes.size());
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *VTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front().second->getType()) :
+ cast<VectorType>(DI->getType()->getContainedType(0));
const DataLayout &DL = DI->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16409,8 +16409,27 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Result = Builder.CreateInsertValue(Result, Left, 0);
Result = Builder.CreateInsertValue(Result, Right, 1);
} else {
- if (UseScalable)
+ if (UseScalable) {
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Result);
+ return true;
+ }
+ while (!LeafNodes.empty()) {
+ unsigned ExtractIndex = LeafNodes.front().first;
+ llvm::Value* CurrentExtract = LeafNodes.front().second;
+ LeafNodes.pop();
+ ExtractValueInst* ExtractValueInst = dyn_cast<llvm::ExtractValueInst>(CurrentExtract);
+
+ SmallVector<unsigned, 4> NewIndices;
+ for (auto index : ExtractValueInst->indices())
+ NewIndices.push_back(index + ExtractIndex);
+
+ Value *extrc =Builder.CreateExtractValue(Result, NewIndices);
+ CurrentExtract->replaceAllUsesWith(extrc);
+ }
+ return true;
+ }
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
}
@@ -16420,15 +16439,15 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
}
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, StoreInst *SI) const {
+ IntrinsicInst *II, std::queue<Value*>& LeafNodes, StoreInst *SI) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ // leaf nodes are the nodes that will be interleaved
+ const unsigned Factor = LeafNodes.size();
- VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
const DataLayout &DL = II->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16473,8 +16492,16 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
}
- if (UseScalable)
- Builder.CreateCall(StNFunc, {L, R, Pred, Address});
+ if (UseScalable) {
+ SmallVector<Value *> Args;
+ while (!LeafNodes.empty()) {
+ Args.push_back(LeafNodes.front());
+ LeafNodes.pop();
+ }
+ Args.push_back(Pred);
+ Args.push_back(Address);
+ Builder.CreateCall(StNFunc, Args);
+ }
else
Builder.CreateCall(StNFunc, {L, R, Address});
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index db6e8a00d2fb5e..85497a1f7ae41a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -683,9 +683,11 @@ class AArch64TargetLowering : public TargetLowering {
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const override;
bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e80931a03f30b6..35150928f0adb0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3315,15 +3315,17 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
assert(Factor >= 2 && "Invalid interleave factor");
auto *VecVTy = cast<VectorType>(VecTy);
- if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
- return InstructionCost::getInvalid();
+ unsigned MaxFactor = TLI->getMaxSupportedInterleaveFactor();
+ if (VecTy->isScalableTy() &&
+ (!ST->hasSVE() || Factor > MaxFactor))
+ return InstructionCost::getInvalid();
// Vectorization for masked interleaved accesses is only enabled for scalable
// VF.
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
return InstructionCost::getInvalid();
- if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+ if (!UseMaskForGaps && Factor <= MaxFactor) {
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
auto *SubVecTy =
VectorType::get(VecVTy->getElementType(),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index dc7c6f83b98579..64e0a2bb1f2942 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21025,6 +21025,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
}
bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21033,10 +21034,11 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = std::max(2, (int)LeafNodes.size());
VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
- VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
+ VectorType *ResVTy = (LeafNodes.size() > 0) ? cast<VectorType>(LeafNodes.front().second->getType()) :
+ cast<VectorType>(DI->getType()->getContainedType(0));
if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(),
@@ -21064,6 +21066,27 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
{ResVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
Ops.append(Factor, PoisonValue::get(ResVTy));
+ Ops.append({LI->getPointerOperand(), VL});
+ Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
+ //-----------
+ if (Factor == 2) {
+ DI->replaceAllUsesWith(Vlseg);
+ return true;
+ }
+ unsigned ExtractIndex = 0;
+ while (!LeafNodes.empty()) {
+ ExtractIndex = LeafNodes.front().first;
+ auto CurrentExtract = LeafNodes.front().second;
+ LeafNodes.pop();
+ ExtractValueInst* ExtractValueInst = dyn_cast<llvm::ExtractValueInst>(CurrentExtract);
+ SmallVector<unsigned, 4> NewIndices;
+ for (auto index : ExtractValueInst->indices()) {
+ NewIndices.push_back(index + ExtractIndex);
+ }
+ Value *extrc = Builder.CreateExtractValue(Vlseg, NewIndices);
+ CurrentExtract->replaceAllUsesWith(extrc);
+ }
+ return true;
}
Ops.append({LI->getPointerOperand(), VL});
@@ -21075,6 +21098,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
}
bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
@@ -21083,10 +21107,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
return false;
- unsigned Factor = 2;
+ unsigned Factor = LeafNodes.size();
VectorType *VTy = cast<VectorType>(II->getType());
- VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
+ VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
SI->getPointerAddressSpace(),
@@ -21112,6 +21136,15 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
{InVTy, XLenTy});
VL = Constant::getAllOnesValue(XLenTy);
+ SmallVector<Value *> Args;
+ while (!LeafNodes.empty()) {
+ Args.push_back(LeafNodes.front());
+ LeafNodes.pop();
+ }
+ Args.push_back(SI->getPointerOperand());
+ Args.push_back(VL);
+ Builder.CreateCall(VssegNFunc, Args);
+ return true;
}
Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index b10da3d40befb7..1f104cf3bc15d5 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -855,10 +855,12 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
+ bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+ std::queue<std::pair<unsigned, Value*>>& LeafNodes,
LoadInst *LI) const override;
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+ std::queue<Value*>& LeafNodes,
StoreInst *SI) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2057cab46135ff..41f8c5a72ce1e7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -154,6 +154,7 @@
#include <string>
#include <tuple>
#include <utility>
+#include <queue>
using namespace llvm;
@@ -459,10 +460,23 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
- VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
- return Builder.CreateIntrinsic(
- WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
- /*FMFSource=*/nullptr, Name);
+ SmallVector<Value *> Vecs(Vals);
+ unsigned AllNodesNum = (2*Vals.size()) - 1;
+ // last element in the vec should be the final interleaved result,
+ // so, skip processing last element.
+ AllNodesNum --;
+ // interleave each 2 consecutive nodes, and push result to the vec,
+ // so that we can interleave the interleaved results again if we have
+ // more than 2 vectors to interleave.
+ for (unsigned i = 0; i < AllNodesNum; i +=2) {
+ VectorType *VecTy = cast<VectorType>(Vecs[i]->getType());
+ VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
+ auto InterleavedVec = Builder.CreateIntrinsic(
+ WideVecTy, Intrinsic::experimental_vector_interleave2,
+ {Vecs[i], Vecs[i+1]}, /*FMFSource=*/nullptr, Name);
+ Vecs.push_back(InterleavedVec);
+ }
+ return Vecs[Vecs.size()-1];
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2519,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
unsigned Part, Value *MaskForGaps) -> Value * {
if (VF.isScalable()) {
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
- assert(InterleaveFactor == 2 &&
+ assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
auto *BlockInMaskPart = State.get(BlockInMask, Part);
SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
@@ -2572,23 +2586,40 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
if (VecTy->isScalableTy()) {
- assert(InterleaveFactor == 2 &&
- "Unsupported deinterleave factor for scalable vectors");
-
+ assert(isPowerOf2_32(InterleaveFactor) &&
+ "Unsupported deinterleave factor for scalable vectors");
for (unsigned Part = 0; Part < UF; ++Part) {
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
- Value *DI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
- /*FMFSource=*/nullptr, "strided.vec");
+
+ std::queue<Value *>Queue;
+ Queue.push(NewLoads[Part]);
+ // NonLeaf represents how many times we will do deinterleaving,
+ // think of it as a tree, each node will be deinterleaved, untill we reach to
+ // the leaf nodes which will be the final results of deinterleaving.
+ unsigned NonLeaf = InterleaveFactor - 1;
+ for (unsigned i = 0; i < NonLeaf; i ++) {
+ auto Node = Queue.front();
+ Queue.pop();
+ auto DeinterleaveType = Node->getType();
+ Value *DI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_deinterleave2, DeinterleaveType, Node,
+ /*FMFSource=*/nullptr, "root.strided.vec");
+ Value *StridedVec1 = Builder.CreateExtractValue(DI, 0);
+ Value *Strid...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
It looks like the TargetLowering, LV and InterleavedAccessPass changes could be decoupled?
@@ -2519,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
        unsigned Part, Value *MaskForGaps) -> Value * {
      if (VF.isScalable()) {
        assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-       assert(InterleaveFactor == 2 &&
+       assert(isPowerOf2_32(InterleaveFactor) &&
               "Unsupported deinterleave factor for scalable vectors");
        auto *BlockInMaskPart = State.get(BlockInMask, Part);
        SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
The mask of masked interleaved accesses also requires an interleave tree to generate the correct mask.
Could you please give an example of a case that uses masked interleaved accesses?
I commented out the code that creates the masked load (the call to the CreateGroupMask lambda) and reran the tests, but all tests ran successfully. It seems that for the interleaved accesses all the loads are aligned, not masked.
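For context on the reviewer's point, here is a minimal standalone sketch (plain C++ with ordinary vectors; an assumption about how the group mask could be built, not code from this patch): the per-iteration block mask has to be replicated across all Factor members of the group, which for the intrinsic-based path means log2(Factor) rounds of self-interleaving rather than the single interleave2 the current code performs.

#include <cstdio>
#include <vector>

using Mask = std::vector<int>; // 0/1 lanes standing in for an <n x i1> mask

// Emulate llvm.vector.interleave2 applied to two masks.
static Mask interleave2(const Mask &A, const Mask &B) {
  Mask R;
  for (size_t I = 0; I < A.size(); ++I) {
    R.push_back(A[I]);
    R.push_back(B[I]);
  }
  return R;
}

int main() {
  Mask Block = {1, 0, 1}; // block mask for three iterations
  unsigned Factor = 4;    // interleave group factor (power of 2 assumed)

  // Each self-interleave duplicates every lane once, so log2(Factor)
  // rounds repeat each lane Factor times:
  // 1 0 1 -> 1 1 0 0 1 1 -> 1 1 1 1 0 0 0 0 1 1 1 1.
  Mask Group = Block;
  for (unsigned F = 1; F < Factor; F *= 2)
    Group = interleave2(Group, Group);

  for (int X : Group)
    printf("%d ", X);
  printf("\n");
}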
// think of it as a tree, each node will be deinterleaved, untill we reach to
// the leaf nodes which will be the final results of deinterleaving.
unsigned NonLeaf = InterleaveFactor - 1;
for (unsigned i = 0; i < NonLeaf; i ++) {
i --> I
i ++ --> I++
auto StridedVec = Queue.front();
Queue.pop();
Here is an example:
Take the vector 0 1 2 3 4 5 6 7.
If we deinterleave the vector with a factor of 4, we should get:
member 0: 0 4
member 1: 1 5
member 2: 2 6
member 3: 3 7
But the Queue in your change may look like: 0 4, 2 6, 1 5, 3 7.
Please confirm the Queue is sorted into the correct order.
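To make the ordering concern concrete, here is a minimal standalone sketch (plain C++, not LLVM code; the helper name is made up) showing that a recursive even/odd split emits the leaves in bit-reversed member order:

#include <cstdio>
#include <vector>

// Recursively split a sequence into even and odd lanes (Factor must be a
// power of 2), collecting the Factor leaf sequences in tree order.
static void deinterleaveTree(const std::vector<int> &V, unsigned Factor,
                             std::vector<std::vector<int>> &Leaves) {
  if (Factor == 1) {
    Leaves.push_back(V);
    return;
  }
  std::vector<int> Even, Odd;
  for (size_t I = 0; I < V.size(); ++I)
    (I % 2 ? Odd : Even).push_back(V[I]);
  deinterleaveTree(Even, Factor / 2, Leaves); // even members: 0, 2, ...
  deinterleaveTree(Odd, Factor / 2, Leaves);  // odd members: 1, 3, ...
}

int main() {
  std::vector<std::vector<int>> Leaves;
  deinterleaveTree({0, 1, 2, 3, 4, 5, 6, 7}, 4, Leaves);
  // Prints "04 26 15 37": the leaves arrive as members 0, 2, 1, 3, the
  // bit-reversal of 0, 1, 2, 3, so they must be remapped to recover the
  // members in order, which is exactly the reviewer's point.
  for (const auto &L : Leaves) {
    for (int X : L)
      printf("%d", X);
    printf(" ");
  }
  printf("\n");
}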
@@ -2681,6 +2712,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(

      // Interleave all the smaller vectors into one wider vector.
      Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
+     //LLVM_DEBUG(dbgs() << "interleaved vec: "; IVec->dump());
Please remove it.
I agree. Please can you split this PR into two PRs, with the first concerned only with the correct lowering of the new emulated ld4/st4 IR sequence. The second PR then teaches LoopVectorize how to use/generate them.
Hi - Is there a plan for how to handle ld3? We have seen a lot of issues recently with the canonical shuffle representation for fixed-vector ld2/ld3/ld4, and I was wondering if it made sense to move away from shuffles for fixed-length too.
There is, but that will require a new intrinsic. My hope is that rather than having an intrinsic per interleave factor we could model them all using interleave2 and interleave3 (once it's created). This is why we've started with ld4/st4 support, to see if there are any pitfalls to this approach. Personally I'd love us to move to using these intrinsics for all vector types because it will streamline several code paths.
Doing deinterleaving as trees sort of makes sense for high interleaving factors... I've seen loops that benefit from deinterleaving with interleave factors as high as 12. I'm a little concerned the abstraction layers here are going to make cost modeling less accurate, though; ideally, the vectorizer should be able to estimate the cost of an ld4.
RISC-V has interleave loads for up to 8. So I guess we would need interleave5 and interleave7?
Yes, sorry. I guess I meant "Hopefully we can emulate all required interleave factors by only implementing specific intrinsics for factors that are prime numbers"? An alternative proposal is to have intrinsics for all factors but then lower them to sequences of fewer intrinsics within the InterleavedAccess pass, or perhaps even SelectionDAGBuilder. I suppose this really depends on how awkward cost modelling the sequences turns out to be. @efriedma-quic - Is your concern related to vectorisation or the costing of already vectorised code?
Given the way the pass pipeline is structured, cost modeling in the vectorizer itself tends to be more important than modeling in subsequent passes. I guess maybe it's not a big deal what the vectorizer generates if the vectorizer itself has some way to get the correct numbers.
The loop vectorizer will produce costs via getInterleavedMemoryOpCost so should be fine as far as I understand. If there are combines later on (either uncosted in instcombine or costed in vector-combine) that don't understand vector.interleave/vector.deinterleave, then they can break the canonical patterns that the backend is expecting to generate ld2/ld4 from. I'm hoping that if we can move to interleave/deinterleave, that should fix some of the problems we have at the moment. I have recently been adding costs for the existing shuffles we find for fixed-length vectors, in an attempt to reduce the number of times we break apart the load+shuffle (or store+shuffle) and have to either attempt to repair it or fall back to worse generation in the backend. I would say that in general costing for single instructions is fine, two instructions making a pattern (like shuffle(load) or store(shuffle)) are do-able but start to get unreliable, and three instructions plus becomes difficult to cost well.
Force-pushed from 0931f25 to ef3a8ea.
unsigned MaxFactor = TLI->getMaxSupportedInterleaveFactor();
if (VecTy->isScalableTy() &&
    (!ST->hasSVE() || !isPowerOf2_32(Factor) || Factor > MaxFactor))
Whilst this works I think it's much clearer to simply say !ST->hasSVE() || (Factor != 2 && Factor != 4).
For what it's worth, I don't see getMaxSupportedInterleaveFactor() being a good function because it doesn't provide enough context for the question it is asking (i.e. it assumes the vector type does not matter). The only reason we don't run into trouble is that, other than this function, all other uses are specific to fixed-length vector types.
for (unsigned I = 0, J = InterleaveFactor / 2, K = 0; K < InterleaveFactor;
     K++) {
  if (K % 2 == 0) {
    InterleavingValues[K] = Vals[I];
    I++;
  } else {
    InterleavingValues[K] = Vals[J];
    J++;
  }
}
Would the following simplification work?

for (unsigned I = 0; I < InterleaveFactor/2; ++I) {
  InterleavingValues[2*I] = Vals[I];
  InterleavingValues[2*I+1] = Vals[I + InterleaveFactor/2];
}

Simplification aside, does this two-stage algorithm work? Or rather, I'm pretty sure it doesn't work, but I'm unsure if there are intentional restrictions that mean it is only supposed to work for specific factors.
I could be wrong, but I think the algorithm works for InterleaveFactor==2 and InterleaveFactor==4 but fails for InterleaveFactor==8. This would be kind of OK given the original code only worked for InterleaveFactor==2, but the other changes in this PR (and the new code's complexity) imply you expect the algorithm to support all powers of two?
It would be good to know your intent here because then I can either suggest simplifying the code or help fix the algorithm if my observation is valid.
Yes, my intent is to make it generic.
I think that to make it generic there will have to be multiple reorderings during the interleave/deinterleave, not only at the end. Correct?
Yes I believe so. It'll be a continuous process of "ordering the operands and then interleaving them" until you have only one vector (or continuous "deinterleave and then order the results" until you have the required N vectors).
Done
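To see why a single up-front reorder is not enough at factor 8, here is a minimal standalone sketch (plain C++; it assumes, per the discussion above, one initial pairing of I with I + Factor/2 followed by purely pairwise interleave passes):

#include <cstdio>
#include <vector>

using Vec = std::vector<int>;

// Emulate llvm.vector.interleave2: alternate the elements of A and B.
static Vec interleave2(const Vec &A, const Vec &B) {
  Vec R;
  for (size_t I = 0; I < A.size(); ++I) {
    R.push_back(A[I]);
    R.push_back(B[I]);
  }
  return R;
}

int main() {
  // Eight member vectors; every lane carries its member index.
  std::vector<Vec> Vals;
  for (int M = 0; M < 8; ++M)
    Vals.push_back(Vec(2, M)); // two lanes per member suffice

  // One up-front reorder pairing I with I + Factor/2 (member order
  // 0,4,1,5,2,6,3,7), then nothing but pairwise interleaving:
  std::vector<Vec> Work = {Vals[0], Vals[4], Vals[1], Vals[5],
                           Vals[2], Vals[6], Vals[3], Vals[7]};
  while (Work.size() > 1) {
    std::vector<Vec> Next;
    for (size_t I = 0; I < Work.size(); I += 2)
      Next.push_back(interleave2(Work[I], Work[I + 1]));
    Work = Next;
  }

  // Prints 0 2 1 3 4 6 5 7 ... rather than 0 1 2 3 4 5 6 7 ...: correct
  // for factors 2 and 4, wrong for 8 unless the operands are reordered
  // again before every level, matching the "order the operands and then
  // interleave them" process described above.
  for (int X : Work[0])
    printf("%d ", X);
  printf("\n");
}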
#ifndef NDEBUG
  for (Value *Val : InterleavingValues)
    assert(Val && "NULL Interleaving Value");
#endif
Does this assert add any value?
From the code it can be seen that InterleavingValues has InterleaveFactor elements, which itself is the size of Vals, and the loop goes from 0 to InterleaveFactor. This means the only way InterleavingValues can have a NULL entry is if it came from Vals, which cannot happen because there's already an assert above where the type of each element of Vals is checked (i.e. all the Value* have been dereferenced by this point anyway).
Done
Would it be possible to directly add (de)interleave4 intrinsics to achieve this? Such an implementation should be simpler and more maintainable.
A significant risk of not being able to identify the larger interleave factors would be the main reason to introduce dedicated intrinsics. However, I'd expect the IR to emulate an 8-way interleave to be pretty fixed, so I'd rather wait to see if this is proved incorrect before going down that route. At the end of the day, the code this PR will introduce will be required anyway to lower (de)interleave intrinsics a target does not support, so there shouldn't be much wasted effort.
Thanks, Paul, for reviewing the patch.
Thanks for your contribution. Here are some questions about this patch:
- I believe we require InterleavedAccessPass (IAP) support before this patch. Does IAP now have the ability to convert the power-of-2 factors?
- Do you have lit test cases for masked interleaved accesses and reversed interleaved accesses with a power-of-2 factor?
- The last question is about future work. RISC-V supports factor 6. If we have interleave3/deinterleave3 intrinsics, can the approach in this patch support factor 6? Or will it require a lot of modifications to support factor 6?
// single final interleaved value.
VectorType *InterleaveTy =
    cast<VectorType>(InterleavingValues[0]->getType());
for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
Add an assertion confirming that Factor is a power of 2.
Hi @Mel-Chen
Thanks for looking at the patch.
The assert statement is already added before calling the interleaveVectors(..) function.
About your questions above:
- Yes, I have landed a patch adding support to the InterleavedAccessPass for recognising the (de)interleave tree pattern.
- Adding them.
- If we have (de)interleave3 intrinsics, then we will apply the same logic for recursive (de)interleave3; the extra work needed will be representing the interleave factor as a product of 2s and 3s. So for the case of factor 6, we will do a single round of (de)interleave2 and then a single round of (de)interleave3, as sketched below. The same logic applies to all factors that are products of only 2 and 3.
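As a small sketch of that 2-and-3 decomposition (plain C++; interleave3 here is a hypothetical stand-in for a not-yet-existing llvm.vector.interleave3), factor 6 falls out as one pairwise round followed by one 3-way round:

#include <cstdio>
#include <vector>

using Vec = std::vector<int>;

static Vec interleave2(const Vec &A, const Vec &B) {
  Vec R;
  for (size_t I = 0; I < A.size(); ++I) {
    R.push_back(A[I]);
    R.push_back(B[I]);
  }
  return R;
}

// Hypothetical 3-way interleave: round-robin over A, B, C.
static Vec interleave3(const Vec &A, const Vec &B, const Vec &C) {
  Vec R;
  for (size_t I = 0; I < A.size(); ++I) {
    R.push_back(A[I]);
    R.push_back(B[I]);
    R.push_back(C[I]);
  }
  return R;
}

int main() {
  // Six member vectors, two lanes each, tagged with their member index.
  Vec V[6];
  for (int M = 0; M < 6; ++M)
    V[M] = Vec(2, M);

  // Pair member I with member I + 3, then 3-way interleave the pairs:
  Vec W0 = interleave2(V[0], V[3]); // 0 3 0 3
  Vec W1 = interleave2(V[1], V[4]); // 1 4 1 4
  Vec W2 = interleave2(V[2], V[5]); // 2 5 2 5
  Vec R = interleave3(W0, W1, W2);  // 0 1 2 3 4 5 0 1 2 3 4 5

  for (int X : R)
    printf("%d ", X);
  printf("\n");
}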
Hi @Mel-Chen
Are you satisfied with the latest changes?
The assert statement is already added before calling the interleaveVectors(..) function.
I think we still need an assert before the for loop, as this is a standalone function. This will ensure that no caller inadvertently passes an invalid factor in the future.
@@ -136,3 +136,4 @@ define void @negative_deinterleave4_test(ptr %src) {
   ret void
 }
+
Please remove this blank line.
Hi @Mel-Chen
This PR appears to be breaking the LLVM build on AArch64 SVE Linux buildbots.
This reverts commit ccfe0de. This breaks the LLVM build on AArch64 SVE Linux buildbots: https://lab.llvm.org/buildbot/#/builders/143/builds/4462 https://lab.llvm.org/buildbot/#/builders/17/builds/4902 https://lab.llvm.org/buildbot/#/builders/4/builds/4399 https://lab.llvm.org/buildbot/#/builders/41/builds/4299
I have reverted this change temporarily to fix the buildbots. Please review the change. Thanks!
This patch relands the changes from "[LV]: Teach LV to recursively (de)interleave. #122989".
Reason for revert:
- The patch exposed an assert in the vectorizer related to a VF difference between the legacy cost model and the VPlan-based cost model, caused by an uncosted VPInstruction that VPlanTransforms creates as a replacement for an 'or disjoint' instruction. VPlanTransforms makes that change when there are memory interleaving and predicated blocks; it hadn't caused problems before because in most cases the cost difference between the legacy/new models is not noticeable.
- The issue is fixed by #125434.
Original patch: #89018
Reviewed-by: paulwalker-arm, Mel-Chen
This adds [de]interleave intrinsics for factors of 4, 6, 8, so that every interleaved memory operation supported by the in-tree targets can be represented by a single intrinsic. For context, [de]interleaves of fixed-length vectors are represented by a series of shufflevectors. The intrinsics are needed for scalable vectors, and we don't currently scalably vectorize all possible factors of interleave groups supported by RISC-V/AArch64.
The underlying reason for this is that higher factors are currently represented by interleaving multiple interleaves themselves, which made sense at the time in the discussion in #89018. But after trying to integrate these for higher factors on RISC-V, I think we should revisit this design choice:
- Matching these in InterleavedAccessPass is non-trivial: we currently only support factors that are a power of 2, and detecting this requires a good chunk of code.
- The shufflevector masks used for [de]interleaves of fixed-length vectors are much easier to pattern match as they are strided patterns, but for the intrinsics it's much more complicated to match as the structure is a tree.
- Unlike shufflevectors, there's no optimisation that happens on [de]interleave2 intrinsics.
- For non-power-of-2 factors, e.g. 6, there are multiple possible ways a [de]interleave could be represented; see the discussion in #139373.
- We already have intrinsics for 2, 3, 5 and 7, so by avoiding 4, 6 and 8 we're not really saving much.
By representing these higher factors as interleaved interleaves, we can in theory support arbitrarily high interleave factors. However, I'm not sure this is actually needed in practice: SVE only has instructions for factors 2, 3, 4, whilst RVV only supports up to factor 8. This patch would make it much easier to support scalable interleaved accesses in the loop vectorizer for RISC-V for factors 3, 5, 6 and 7, as the loop vectorizer and InterleavedAccessPass wouldn't need to construct and match trees of interleaves. For interleave factors above 8, for which there are no hardware memory operations to match in the InterleavedAccessPass, we can still keep the wide load + recursive interleaving in the loop vectorizer.
Currently the available intrinsics are only ld2/st2, which don't support an interleaving factor > 2.
This patch teaches the LV to use ld2/st2 recursively to support higher interleaving factors.
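As a rough illustration of the emulation this description refers to (plain C++ over ordinary arrays, not the patch's LLVM code): an st4 is built from three interleave2 steps and an ld4 from three deinterleave2 steps, giving a lossless round trip.

#include <cassert>
#include <utility>
#include <vector>

using Vec = std::vector<int>;

// Emulations of llvm.vector.interleave2 / llvm.vector.deinterleave2.
static Vec interleave2(const Vec &A, const Vec &B) {
  Vec R;
  for (size_t I = 0; I < A.size(); ++I) {
    R.push_back(A[I]);
    R.push_back(B[I]);
  }
  return R;
}

static std::pair<Vec, Vec> deinterleave2(const Vec &V) {
  Vec Even, Odd;
  for (size_t I = 0; I < V.size(); ++I)
    (I % 2 ? Odd : Even).push_back(V[I]);
  return {Even, Odd};
}

int main() {
  Vec A = {1, 2}, B = {3, 4}, C = {5, 6}, D = {7, 8};

  // Emulated st4: note the operand order pairs A with C and B with D so
  // that memory ends up as A0 B0 C0 D0 A1 B1 C1 D1.
  Vec Wide = interleave2(interleave2(A, C), interleave2(B, D));
  assert((Wide == Vec{1, 3, 5, 7, 2, 4, 6, 8}));

  // Emulated ld4: two levels of deinterleave2 recover the four members
  // (the even/odd splits separate {A,C} from {B,D} again).
  auto [AC, BD] = deinterleave2(Wide);
  auto [RA, RC] = deinterleave2(AC);
  auto [RB, RD] = deinterleave2(BD);
  assert(RA == A && RB == B && RC == C && RD == D);
  return 0;
}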