Skip to content

Commit dec7c6b

Browse files
committed
[AArch64]: Use PatternMatch to spot (de)interleave accesses
Change-Id: Id7639dcb125a2f642b2fea78ea884b74be1c6b74
1 parent 71120ec commit dec7c6b

File tree

9 files changed

+143
-366
lines changed

9 files changed

+143
-366
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,6 @@
5656
#include <cstdint>
5757
#include <iterator>
5858
#include <map>
59-
#include <queue>
60-
#include <stack>
6159
#include <string>
6260
#include <utility>
6361
#include <vector>
@@ -3159,7 +3157,6 @@ class TargetLoweringBase {
31593157
/// \p DI is the deinterleave intrinsic.
31603158
/// \p LI is the accompanying load instruction
31613159
virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
3162-
SmallVector<Value *> &LeafNodes,
31633160
LoadInst *LI) const {
31643161
return false;
31653162
}
@@ -3171,7 +3168,6 @@ class TargetLoweringBase {
31713168
/// \p II is the interleave intrinsic.
31723169
/// \p SI is the accompanying store instruction
31733170
virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
3174-
SmallVector<Value *> &LeafNodes,
31753171
StoreInst *SI) const {
31763172
return false;
31773173
}

llvm/lib/CodeGen/InterleavedAccessPass.cpp

Lines changed: 6 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@
7070
#include "llvm/Target/TargetMachine.h"
7171
#include "llvm/Transforms/Utils/Local.h"
7272
#include <cassert>
73-
#include <queue>
7473
#include <utility>
7574

7675
using namespace llvm;
@@ -489,57 +488,12 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
489488

490489
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
491490

492-
std::stack<IntrinsicInst *> DeinterleaveTreeQueue;
493-
SmallVector<Value *> TempLeafNodes, LeafNodes;
494-
std::map<IntrinsicInst *, bool> mp;
495-
SmallVector<Instruction *> TempDeadInsts;
496-
497-
DeinterleaveTreeQueue.push(DI);
498-
while (!DeinterleaveTreeQueue.empty()) {
499-
auto CurrentDI = DeinterleaveTreeQueue.top();
500-
DeinterleaveTreeQueue.pop();
501-
TempDeadInsts.push_back(CurrentDI);
502-
// iterate over extract users of deinterleave
503-
for (auto UserExtract : CurrentDI->users()) {
504-
Instruction *Extract = dyn_cast<Instruction>(UserExtract);
505-
if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
506-
continue;
507-
bool IsLeaf = true;
508-
// iterate over deinterleave users of extract
509-
for (auto UserDI : UserExtract->users()) {
510-
IntrinsicInst *Child_DI = dyn_cast<IntrinsicInst>(UserDI);
511-
if (!Child_DI || Child_DI->getIntrinsicID() !=
512-
Intrinsic::experimental_vector_deinterleave2)
513-
continue;
514-
IsLeaf = false;
515-
if (mp.count(Child_DI) == 0) {
516-
DeinterleaveTreeQueue.push(Child_DI);
517-
}
518-
continue;
519-
}
520-
if (IsLeaf) {
521-
TempLeafNodes.push_back(UserExtract);
522-
TempDeadInsts.push_back(Extract);
523-
} else {
524-
TempDeadInsts.push_back(Extract);
525-
}
526-
}
527-
}
528-
// sort the deinterleaved nodes in the order that
529-
// they will be extracted from the target-specific intrinsic.
530-
for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
531-
LeafNodes.push_back(TempLeafNodes[I]);
532-
533-
for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
534-
LeafNodes.push_back(TempLeafNodes[I]);
535-
536491
// Try and match this with target specific intrinsics.
537-
if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
492+
if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
538493
return false;
539494

540495
// We now have a target-specific load, so delete the old one.
541-
DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(),
542-
TempDeadInsts.rend());
496+
DeadInsts.push_back(DI);
543497
DeadInsts.push_back(LI);
544498
return true;
545499
}
@@ -555,38 +509,14 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
555509
return false;
556510

557511
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
558-
std::queue<IntrinsicInst *> IeinterleaveTreeQueue;
559-
SmallVector<Value *> TempLeafNodes, LeafNodes;
560-
SmallVector<Instruction *> TempDeadInsts;
561-
562-
IeinterleaveTreeQueue.push(II);
563-
while (!IeinterleaveTreeQueue.empty()) {
564-
auto node = IeinterleaveTreeQueue.front();
565-
TempDeadInsts.push_back(node);
566-
IeinterleaveTreeQueue.pop();
567-
for (unsigned i = 0; i < 2; i++) {
568-
auto op = node->getOperand(i);
569-
if (auto CurrentII = dyn_cast<IntrinsicInst>(op)) {
570-
if (CurrentII->getIntrinsicID() !=
571-
Intrinsic::experimental_vector_interleave2)
572-
continue;
573-
IeinterleaveTreeQueue.push(CurrentII);
574-
continue;
575-
}
576-
TempLeafNodes.push_back(op);
577-
}
578-
}
579-
for (unsigned I = 0; I < TempLeafNodes.size(); I += 2)
580-
LeafNodes.push_back(TempLeafNodes[I]);
581-
for (unsigned I = 1; I < TempLeafNodes.size(); I += 2)
582-
LeafNodes.push_back(TempLeafNodes[I]);
512+
583513
// Try and match this with target specific intrinsics.
584-
if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
514+
if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
585515
return false;
586516

587517
// We now have a target-specific store, so delete the old one.
588518
DeadInsts.push_back(SI);
589-
DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
519+
DeadInsts.push_back(II);
590520
return true;
591521
}
592522

@@ -607,8 +537,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
607537
// with a factor of 2.
608538
if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
609539
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
610-
611-
else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
540+
if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
612541
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
613542
}
614543
}

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 104 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -16441,18 +16441,56 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
1644116441
return true;
1644216442
}
1644316443

16444+
/// Walk a tree of deinterleave2 intrinsics rooted at \p DI and collect its
/// leaf extractvalue instructions.
///
/// \p DI must have exactly two uses: an `extractvalue ..., 0` and an
/// `extractvalue ..., 1` of a deinterleave2 result. For each extract, if its
/// only user is itself the root of such a tree, that subtree is walked
/// recursively; otherwise the extract is a leaf.
///
/// \param DeinterleaveUsers receives the leaf extracts, field 0 subtree first.
/// \param DeadInsts receives the interior nodes (nested deinterleave calls and
///        the extracts feeding them). A nested call is recorded before the
///        extract it consumes, so erasing in vector order removes users before
///        their operands.
/// \returns true if \p DI matched; on false, neither vector has been modified.
///
/// NOTE(review): leaves are appended in depth-first order. Confirm that callers
/// expect this order when mapping leaves onto the fields of an ldN result.
bool GetDeinterleaveLeaves(Value *DI,
                           SmallVectorImpl<Value *> &DeinterleaveUsers,
                           SmallVectorImpl<Instruction *> &DeadInsts) {
  if (!DI->hasNUses(2))
    return false;

  // Identify both extracts by their index rather than by their position in
  // the use list, and insist that *both* users are the expected extracts.
  Value *Extr0 = nullptr;
  Value *Extr1 = nullptr;
  for (auto *U : DI->users()) {
    if (match(U, m_ExtractValue<0>(m_Deinterleave2(m_Value()))))
      Extr0 = U;
    else if (match(U, m_ExtractValue<1>(m_Deinterleave2(m_Value()))))
      Extr1 = U;
  }
  if (!Extr0 || !Extr1)
    return false;

  // All bail-outs happen above, so nothing is pushed on a failed match.
  auto CollectFrom = [&](Value *Extract) {
    // Only descend when the extract has a single user: an unused extract has
    // nothing to inspect, and an extract with additional users must stay alive
    // and is therefore treated as a leaf.
    Value *Child = Extract->hasOneUse() ? *Extract->user_begin() : nullptr;
    if (Child && GetDeinterleaveLeaves(Child, DeinterleaveUsers, DeadInsts)) {
      // Interior node: the nested deinterleave and the extract feeding it
      // will not be used anymore.
      DeadInsts.push_back(cast<Instruction>(Child));
      DeadInsts.push_back(cast<Instruction>(Extract));
      return;
    }
    // Leaf extract.
    DeinterleaveUsers.push_back(Extract);
  };

  CollectFrom(Extr0);
  CollectFrom(Extr1);
  return true;
}
16475+
1644416476
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
16445-
IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
16477+
IntrinsicInst *DI, LoadInst *LI) const {
1644616478
// Only deinterleave2 supported at present.
1644716479
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
1644816480
return false;
1644916481

16450-
const unsigned Factor = std::max(2, (int)LeafNodes.size());
16451-
16452-
VectorType *VTy = (LeafNodes.size() > 0)
16453-
? cast<VectorType>(LeafNodes.front()->getType())
16454-
: cast<VectorType>(DI->getType()->getContainedType(0));
16482+
SmallVector<Value *, 4> ValuesToDeinterleave;
16483+
SmallVector<Instruction *, 10> DeadInsts;
1645516484
const DataLayout &DL = DI->getModule()->getDataLayout();
16485+
unsigned Factor = 2;
16486+
VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16487+
if (GetDeinterleaveLeaves(DI, ValuesToDeinterleave, DeadInsts)) {
16488+
Factor = ValuesToDeinterleave.size();
16489+
VTy = cast<VectorType>(ValuesToDeinterleave[0]->getType());
16490+
}
16491+
16492+
assert(Factor && "Expected Interleave Factor >= 2");
16493+
1645616494
bool UseScalable;
1645716495
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
1645816496
return false;
@@ -16463,7 +16501,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
1646316501
return false;
1646416502

1646516503
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16466-
1646716504
VectorType *LdTy =
1646816505
VectorType::get(VTy->getElementType(),
1646916506
VTy->getElementCount().divideCoefficientBy(NumLoads));
@@ -16473,7 +16510,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
1647316510
UseScalable, LdTy, PtrTy);
1647416511

1647516512
IRBuilder<> Builder(LI);
16476-
1647716513
Value *Pred = nullptr;
1647816514
if (UseScalable)
1647916515
Pred =
@@ -16482,9 +16518,8 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
1648216518
Value *BaseAddr = LI->getPointerOperand();
1648316519
Value *Result;
1648416520
if (NumLoads > 1) {
16485-
Value *Left = PoisonValue::get(VTy);
16486-
Value *Right = PoisonValue::get(VTy);
16487-
16521+
// Create multiple legal small ldN instead of a wide one.
16522+
SmallVector<Value *, 4> WideValues(Factor, (PoisonValue::get(VTy)));
1648816523
for (unsigned I = 0; I < NumLoads; ++I) {
1648916524
Value *Offset = Builder.getInt64(I * Factor);
1649016525

@@ -16494,49 +16529,71 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
1649416529
LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
1649516530
else
1649616531
LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16497-
1649816532
Value *Idx =
1649916533
Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16500-
Left = Builder.CreateInsertVector(
16501-
VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16502-
Right = Builder.CreateInsertVector(
16503-
VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16534+
for (int J = 0; J < Factor; ++J) {
16535+
WideValues[J] = Builder.CreateInsertVector(
16536+
VTy, WideValues[J], Builder.CreateExtractValue(LdN, J), Idx);
16537+
}
16538+
}
16539+
// FIXME: the types should NOT be added manually.
16540+
if (2 == Factor)
16541+
Result = PoisonValue::get(StructType::get(VTy, VTy));
16542+
else
16543+
Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy));
16544+
// Construct the wide result out of the small results.
16545+
for (int J = 0; J < Factor; ++J) {
16546+
Result = Builder.CreateInsertValue(Result, WideValues[J], J);
1650416547
}
16505-
16506-
Result = PoisonValue::get(DI->getType());
16507-
Result = Builder.CreateInsertValue(Result, Left, 0);
16508-
Result = Builder.CreateInsertValue(Result, Right, 1);
1650916548
} else {
16510-
if (UseScalable) {
16549+
if (UseScalable)
1651116550
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16512-
if (Factor == 2) {
16513-
DI->replaceAllUsesWith(Result);
16514-
return true;
16515-
}
16516-
for (unsigned I = 0; I < LeafNodes.size(); I++) {
16517-
llvm::Value *CurrentExtract = LeafNodes[I];
16518-
Value *Newextrct = Builder.CreateExtractValue(Result, I);
16519-
CurrentExtract->replaceAllUsesWith(Newextrct);
16520-
}
16521-
return true;
16522-
} else
16551+
else
1652316552
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
1652416553
}
16554+
if (Factor > 2) {
16555+
for (unsigned I = 0; I < ValuesToDeinterleave.size(); I++) {
16556+
llvm::Value *CurrentExtract = ValuesToDeinterleave[I];
16557+
Value *NewExtract = Builder.CreateExtractValue(Result, I);
16558+
CurrentExtract->replaceAllUsesWith(NewExtract);
16559+
dyn_cast<Instruction>(CurrentExtract)->eraseFromParent();
16560+
}
1652516561

16562+
for (auto &dead : DeadInsts)
16563+
dead->eraseFromParent();
16564+
return true;
16565+
}
1652616566
DI->replaceAllUsesWith(Result);
1652716567
return true;
1652816568
}
1652916569

16570+
/// Collect the leaf operands of a tree of interleave2 intrinsics rooted at
/// \p II.
///
/// If \p II is not an interleave2, nothing is appended and false is returned.
/// Otherwise each of its two operands is visited in order: an operand that is
/// itself an interleave2 is expanded recursively, any other operand is
/// appended to \p InterleaveOps as a leaf. Returns true in that case.
bool GetInterleaveLeaves(Value *II, SmallVectorImpl<Value *> &InterleaveOps) {
  Value *Lhs = nullptr;
  Value *Rhs = nullptr;
  if (match(II, m_Interleave2(m_Value(Lhs), m_Value(Rhs)))) {
    Value *Operands[] = {Lhs, Rhs};
    for (Value *Operand : Operands) {
      // A nested interleave contributes its own leaves; anything else is a
      // leaf in its own right.
      const bool IsNestedInterleave =
          GetInterleaveLeaves(Operand, InterleaveOps);
      if (!IsNestedInterleave)
        InterleaveOps.push_back(Operand);
    }
    return true;
  }
  return false;
}
16584+
1653016585
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
16531-
IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
16586+
IntrinsicInst *II, StoreInst *SI) const {
1653216587
// Only interleave2 supported at present.
1653316588
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
1653416589
return false;
1653516590

16536-
// leaf nodes are the nodes that will be interleaved
16537-
const unsigned Factor = LeafNodes.size();
16591+
SmallVector<Value *, 4> ValuesToInterleave;
16592+
GetInterleaveLeaves(II, ValuesToInterleave);
16593+
unsigned Factor = ValuesToInterleave.size();
16594+
assert(Factor >= 2 && "Expected Interleave Factor >= 2");
16595+
VectorType *VTy = cast<VectorType>(ValuesToInterleave[0]->getType());
1653816596

16539-
VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
1654016597
const DataLayout &DL = II->getModule()->getDataLayout();
1654116598
bool UseScalable;
1654216599
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -16566,28 +16623,26 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
1656616623
Pred =
1656716624
Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
1656816625

16569-
Value *L = II->getOperand(0);
16570-
Value *R = II->getOperand(1);
16571-
16626+
auto InterleaveOps = ValuesToInterleave;
16627+
if (UseScalable)
16628+
ValuesToInterleave.push_back(Pred);
16629+
ValuesToInterleave.push_back(BaseAddr);
1657216630
for (unsigned I = 0; I < NumStores; ++I) {
1657316631
Value *Address = BaseAddr;
1657416632
if (NumStores > 1) {
1657516633
Value *Offset = Builder.getInt64(I * Factor);
1657616634
Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16577-
1657816635
Value *Idx =
1657916636
Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16580-
L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16581-
R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16637+
for (int J = 0; J < Factor; J++) {
16638+
ValuesToInterleave[J] =
16639+
Builder.CreateExtractVector(StTy, InterleaveOps[J], Idx);
16640+
}
16641+
// update the address
16642+
ValuesToInterleave[ValuesToInterleave.size() - 1] = Address;
1658216643
}
1658316644

16584-
if (UseScalable) {
16585-
SmallVector<Value *> Args(LeafNodes);
16586-
Args.push_back(Pred);
16587-
Args.push_back(Address);
16588-
Builder.CreateCall(StNFunc, Args);
16589-
} else
16590-
Builder.CreateCall(StNFunc, {L, R, Address});
16645+
Builder.CreateCall(StNFunc, ValuesToInterleave);
1659116646
}
1659216647

1659316648
return true;

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -686,11 +686,9 @@ class AArch64TargetLowering : public TargetLowering {
686686
unsigned Factor) const override;
687687

688688
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
689-
SmallVector<Value *> &LeafNodes,
690689
LoadInst *LI) const override;
691690

692691
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
693-
SmallVector<Value *> &LeafNodes,
694692
StoreInst *SI) const override;
695693

696694
bool isLegalAddImmediate(int64_t) const override;

0 commit comments

Comments
 (0)