@@ -16906,71 +16906,120 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
16906
16906
return true;
16907
16907
}
16908
16908
16909
- bool getDeinterleavedValues (
16909
+ bool getDeinterleave2Values (
16910
16910
Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues) {
16911
- if (!DI->hasNUsesOrMore (2))
16911
+ if (!DI->hasNUses (2))
16912
16912
return false;
16913
16913
auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
16914
16914
auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
16915
16915
if (!Extr1 || !Extr2)
16916
16916
return false;
16917
16917
16918
- if (!Extr1->hasNUsesOrMore(1) || !Extr2->hasNUsesOrMore(1))
16918
+ DeinterleavedValues.resize(2);
16919
+ // Place the values into the vector in the order of extraction:
16920
+ DeinterleavedValues[0x1 & (Extr1->getIndices()[0])] = Extr1;
16921
+ DeinterleavedValues[0x1 & (Extr2->getIndices()[0])] = Extr2;
16922
+ if (!DeinterleavedValues[0] || !DeinterleavedValues[1])
16923
+ return false;
16924
+
16925
+ // Make sure that the extracted values match the deinterleave tree pattern
16926
+ if (!match(DeinterleavedValues[0], m_ExtractValue<0>((m_Specific(DI)))) ||
16927
+ !match(DeinterleavedValues[1], m_ExtractValue<1>((m_Specific(DI))))) {
16928
+ LLVM_DEBUG(dbgs() << "matching deinterleave2 failed\n");
16929
+ return false;
16930
+ }
16931
+ return true;
16932
+ }
16933
+
16934
+ /*
16935
+ Diagram for DI tree.
16936
+ [LOAD]
16937
+ |
16938
+ [DI]
16939
+ / \
16940
+ [Extr<0>] [Extr<1>]
16941
+ | |
16942
+ [DI] [DI]
16943
+ / \ / \
16944
+ [Extr<0>][Extr<1>] [Extr<0>][Extr<1>]
16945
+ | | | |
16946
+ roots: A C B D
16947
+ roots in correct order of DI4: A B C D.
16948
+ If a pattern matches the deinterleave tree above, then we can construct
16949
+ DI4 out of that pattern. This function tries to match the deinterleave tree
16950
+ pattern, and fetch the tree roots, so that in further steps they can be replaced
16951
+ by the output of DI4.
16952
+ */
16953
+ bool getDeinterleave4Values(Value *DI,
16954
+ SmallVectorImpl<Instruction *> &DeinterleavedValues,
16955
+ SmallVectorImpl<Instruction *> &DeadInstructions) {
16956
+ if (!DI->hasNUses(2))
16957
+ return false;
16958
+ auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
16959
+ auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
16960
+ if (!Extr1 || !Extr2)
16961
+ return false;
16962
+
16963
+ if (!Extr1->hasNUses(1) || !Extr2->hasNUses(1))
16919
16964
return false;
16920
16965
auto *DI1 = *(Extr1->user_begin());
16921
16966
auto *DI2 = *(Extr2->user_begin());
16922
16967
16923
- if (!DI1->hasNUsesOrMore (2) || !DI2->hasNUsesOrMore (2))
16968
+ if (!DI1->hasNUses (2) || !DI2->hasNUses (2))
16924
16969
return false;
16925
16970
// Leaf nodes of the deinterleave tree:
16926
16971
auto *A = dyn_cast<ExtractValueInst>(*(DI1->user_begin()));
16927
- auto *B = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
16928
- auto *C = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
16972
+ auto *C = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
16973
+ auto *B = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
16929
16974
auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
16930
16975
// Make sure that the A,B,C,D are instructions of ExtractValue,
16931
16976
// before getting the extract index
16932
16977
if (!A || !B || !C || !D)
16933
16978
return false;
16934
16979
16935
16980
DeinterleavedValues.resize(4);
16936
- // Place the values into the vector in the order of extraction:
16937
- DeinterleavedValues[A->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = A;
16938
- DeinterleavedValues[B->getIndices()[0] + (Extr1->getIndices()[0] * 2)] = B;
16939
- DeinterleavedValues[C->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = C;
16940
- DeinterleavedValues[D->getIndices()[0] + (Extr2->getIndices()[0] * 2)] = D;
16981
+ // Place the values into the vector in the order of deinterleave4:
16982
+ DeinterleavedValues[0x3 &
16983
+ ((A->getIndices()[0] * 2) + Extr1->getIndices()[0])] = A;
16984
+ DeinterleavedValues[0x3 &
16985
+ ((B->getIndices()[0] * 2) + Extr2->getIndices()[0])] = B;
16986
+ DeinterleavedValues[0x3 &
16987
+ ((C->getIndices()[0] * 2) + Extr1->getIndices()[0])] = C;
16988
+ DeinterleavedValues[0x3 &
16989
+ ((D->getIndices()[0] * 2) + Extr2->getIndices()[0])] = D;
16990
+ if (!DeinterleavedValues[0] || !DeinterleavedValues[1] ||
16991
+ !DeinterleavedValues[2] || !DeinterleavedValues[3])
16992
+ return false;
16941
16993
16942
16994
// Make sure that A,B,C,D match the deinterleave tree pattern
16943
- if (!match(DeinterleavedValues[0],
16944
- m_ExtractValue<0>(m_Deinterleave2(
16945
- m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
16946
- !match(DeinterleavedValues[1],
16947
- m_ExtractValue<1>(m_Deinterleave2(
16948
- m_ExtractValue<0>(m_Deinterleave2(m_Value()))))) ||
16949
- !match(DeinterleavedValues[2],
16950
- m_ExtractValue<0>(m_Deinterleave2(
16951
- m_ExtractValue<1>(m_Deinterleave2(m_Value()))))) ||
16952
- !match(DeinterleavedValues[3],
16953
- m_ExtractValue<1>(m_Deinterleave2(
16954
- m_ExtractValue<1>(m_Deinterleave2(m_Value())))))) {
16995
+ if (!match(DeinterleavedValues[0], m_ExtractValue<0>(m_Deinterleave2(
16996
+ m_ExtractValue<0>(m_Specific(DI))))) ||
16997
+ !match(DeinterleavedValues[1], m_ExtractValue<0>(m_Deinterleave2(
16998
+ m_ExtractValue<1>(m_Specific(DI))))) ||
16999
+ !match(DeinterleavedValues[2], m_ExtractValue<1>(m_Deinterleave2(
17000
+ m_ExtractValue<0>(m_Specific(DI))))) ||
17001
+ !match(DeinterleavedValues[3], m_ExtractValue<1>(m_Deinterleave2(
17002
+ m_ExtractValue<1>(m_Specific(DI)))))) {
16955
17003
LLVM_DEBUG(dbgs() << "matching deinterleave4 failed\n");
16956
17004
return false;
16957
17005
}
16958
- // Order the values according to the deinterleaving order.
16959
- std::swap(DeinterleavedValues[1], DeinterleavedValues[2]);
17006
+
17007
+ // These Values will not be used anymore,
17008
+ // DI4 will be created instead of nested DI1 and DI2
17009
+ DeadInstructions.push_back(cast<Instruction>(DI1));
17010
+ DeadInstructions.push_back(cast<Instruction>(Extr1));
17011
+ DeadInstructions.push_back(cast<Instruction>(DI2));
17012
+ DeadInstructions.push_back(cast<Instruction>(Extr2));
17013
+
16960
17014
return true;
16961
17015
}
16962
17016
16963
- void deleteDeadDeinterleaveInstructions(Instruction *DeadRoot) {
16964
- Value *DeadDeinterleave = nullptr, *DeadExtract = nullptr;
16965
- match(DeadRoot, m_ExtractValue(m_Value(DeadDeinterleave)));
16966
- assert(DeadDeinterleave != nullptr && "Match is expected to succeed");
16967
- match(DeadDeinterleave, m_Deinterleave2(m_Value(DeadExtract)));
16968
- assert(DeadExtract != nullptr && "Match is expected to succeed");
16969
- DeadRoot->eraseFromParent();
16970
- if (DeadDeinterleave->getNumUses() == 0)
16971
- cast<Instruction>(DeadDeinterleave)->eraseFromParent();
16972
- if (DeadExtract->getNumUses() == 0)
16973
- cast<Instruction>(DeadExtract)->eraseFromParent();
17017
+ bool getDeinterleavedValues(Value *DI,
17018
+ SmallVectorImpl<Instruction *> &DeinterleavedValues,
17019
+ SmallVectorImpl<Instruction *> &DeadInstructions) {
17020
+ if (getDeinterleave4Values(DI, DeinterleavedValues, DeadInstructions))
17021
+ return true;
17022
+ return getDeinterleave2Values(DI, DeinterleavedValues);
16974
17023
}
16975
17024
16976
17025
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
@@ -16980,16 +17029,17 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
16980
17029
return false;
16981
17030
16982
17031
SmallVector<Instruction *, 4> DeinterleavedValues;
17032
+ SmallVector<Instruction *, 4> DeadInstructions;
16983
17033
const DataLayout &DL = DI->getModule()->getDataLayout();
16984
- unsigned Factor = 2;
16985
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16986
17034
16987
- if (getDeinterleavedValues(DI, DeinterleavedValues)) {
16988
- Factor = DeinterleavedValues.size( );
16989
- VTy = cast<VectorType>(DeinterleavedValues[0]->getType()) ;
17035
+ if (! getDeinterleavedValues(DI, DeinterleavedValues, DeadInstructions )) {
17036
+ LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n" );
17037
+ return false ;
16990
17038
}
17039
+ unsigned Factor = DeinterleavedValues.size();
16991
17040
assert((Factor == 2 || Factor == 4) &&
16992
- "Currently supported Factors are 2 or 4");
17041
+ "Currently supported Factor is 2 or 4 only");
17042
+ VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
16993
17043
16994
17044
bool UseScalable;
16995
17045
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
@@ -17050,23 +17100,35 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
17050
17100
else
17051
17101
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17052
17102
}
17053
- if (Factor > 2) {
17054
- // Itereate over old deinterleaved values to replace it by
17055
- // the new deinterleaved values.
17056
- for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
17057
- Value *NewExtract = Builder.CreateExtractValue(Result, I);
17058
- DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
17059
- }
17060
- for (unsigned I = 0; I < DeinterleavedValues.size(); I++)
17061
- deleteDeadDeinterleaveInstructions(DeinterleavedValues[I]);
17062
- return true;
17103
+ // Iterate over the old deinterleaved values to replace them by
17104
+ // the new values.
17105
+ for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
17106
+ Value *NewExtract = Builder.CreateExtractValue(Result, I);
17107
+ DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
17108
+ cast<Instruction>(DeinterleavedValues[I])->eraseFromParent();
17063
17109
}
17064
- DI->replaceAllUsesWith(Result);
17110
+ for (auto &dead : DeadInstructions)
17111
+ dead->eraseFromParent();
17065
17112
return true;
17066
17113
}
17067
17114
17068
- bool getValuesToInterleaved(Value *II,
17069
- SmallVectorImpl<Value *> &ValuesToInterleave) {
17115
+ /*
17116
+ Diagram for Interleave tree.
17117
+ A C B D
17118
+ \ / \ /
17119
+ [Interleave] [Interleave]
17120
+ \ /
17121
+ [Interleave]
17122
+ |
17123
+ [Store]
17124
+ values in correct order of interleave4: A B C D.
17125
+ If a pattern matches the interleave tree above, then we can construct
17126
+ Interleave4 out of that pattern. This function tries to match the interleave
17127
+ tree pattern, and fetch the values that we want to interleave, so that in
17128
+ further steps they can be replaced by the output of Interleave4.
17129
+ */
17130
+ bool getValuesToInterleave(Value *II,
17131
+ SmallVectorImpl<Value *> &ValuesToInterleave) {
17070
17132
Value *A, *B, *C, *D;
17071
17133
// Try to match interleave of Factor 4
17072
17134
if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
@@ -17090,14 +17152,18 @@ bool getValuesToInterleaved(Value *II,
17090
17152
17091
17153
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
17092
17154
IntrinsicInst *II, StoreInst *SI) const {
17093
- LLVM_DEBUG(dbgs() << "lowerInterleaveIntrinsicToStore\n");
17155
+ // Only interleave2 supported at present.
17156
+ if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
17157
+ return false;
17094
17158
17095
17159
SmallVector<Value *, 4> ValuesToInterleave;
17096
- if (!getValuesToInterleaved(II, ValuesToInterleave))
17160
+ if (!getValuesToInterleave(II, ValuesToInterleave)) {
17161
+ LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
17097
17162
return false;
17163
+ }
17098
17164
unsigned Factor = ValuesToInterleave.size();
17099
17165
assert((Factor == 2 || Factor == 4) &&
17100
- "Currently supported Factors are 2 or 4");
17166
+ "Currently supported Factor is 2 or 4 only ");
17101
17167
VectorType *VTy = cast<VectorType>(ValuesToInterleave[0]->getType());
17102
17168
const DataLayout &DL = II->getModule()->getDataLayout();
17103
17169
0 commit comments