@@ -16907,7 +16907,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
 }
 
 bool getDeinterleave2Values(
-    Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues) {
+    Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
+    SmallVectorImpl<Instruction *> &DeadInsts) {
   if (!DI->hasNUses(2))
     return false;
   auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
@@ -16928,13 +16929,13 @@ bool getDeinterleave2Values(
     LLVM_DEBUG(dbgs() << "matching deinterleave2 failed\n");
     return false;
   }
+  // DeinterleavedValues will be replaced by the output of ld2
+  DeadInsts.insert(DeadInsts.end(), DeinterleavedValues.begin(),
+                   DeinterleavedValues.end());
   return true;
 }
 
 /*
-Diagram for DI tree.
-                [LOAD]
-                   |
+DeinterleaveIntrinsic tree:
                  [DI]
                 /    \
        [Extr<0>]      [Extr<1>]
@@ -16944,23 +16945,22 @@ Diagram for DI tree.
   [Extr<0>][Extr<1>] [Extr<0>][Extr<1>]
       |        |         |        |
 roots:   A        C         B        D
-roots in correct order of DI4: A B C D.
-If there is a pattern matches the deinterleave tree above, then we can construct
-DI4 out of that pattern. This function tries to match the deinterleave tree
-pattern, and fetch the tree roots, so that in further steps they can be replaced
-by the output of DI4.
+roots in correct order of DI4 will be: A B C D.
+Returns true if `DI` is the top of an IR tree that represents a theoretical
+vector.deinterleave4 intrinsic. When true is returned, `DeinterleavedValues`
+is populated with the results such an intrinsic would return:
+(i.e. {A, B, C, D} = vector.deinterleave4(...))
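+For illustration, a hypothetical IR shape this function is intended to match
+(types and value names are made up for the example):
+  %DI  = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide)
+  %AC  = extractvalue {<4 x i32>, <4 x i32>} %DI, 0    ; Extr<0>
+  %BD  = extractvalue {<4 x i32>, <4 x i32>} %DI, 1    ; Extr<1>
+  %DI1 = call {<2 x i32>, <2 x i32>} @llvm.vector.deinterleave2.v4i32(<4 x i32> %AC)
+  %DI2 = call {<2 x i32>, <2 x i32>} @llvm.vector.deinterleave2.v4i32(<4 x i32> %BD)
+  %A = extractvalue {<2 x i32>, <2 x i32>} %DI1, 0     ; root A
+  %C = extractvalue {<2 x i32>, <2 x i32>} %DI1, 1     ; root C
+  %B = extractvalue {<2 x i32>, <2 x i32>} %DI2, 0     ; root B
+  %D = extractvalue {<2 x i32>, <2 x i32>} %DI2, 1     ; root D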
 */
 bool getDeinterleave4Values(Value *DI,
                             SmallVectorImpl<Instruction *> &DeinterleavedValues,
-                            SmallVectorImpl<Instruction *> &DeadInstructions) {
+                            SmallVectorImpl<Instruction *> &DeadInsts) {
   if (!DI->hasNUses(2))
     return false;
   auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
   auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
   if (!Extr1 || !Extr2)
     return false;
 
-  if (!Extr1->hasNUses(1) || !Extr2->hasNUses(1))
+  if (!Extr1->hasOneUse() || !Extr2->hasOneUse())
     return false;
   auto *DI1 = *(Extr1->user_begin());
   auto *DI2 = *(Extr2->user_begin());
@@ -16972,8 +16972,7 @@ bool getDeinterleave4Values(Value *DI,
   auto *C = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
   auto *B = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
   auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
-  // Make sure that the A,B,C,D are instructions of ExtractValue,
-  // before getting the extract index
+  // Make sure that A, B, C and D are ExtractValue instructions
+  // before getting the extract index.
   if (!A || !B || !C || !D)
     return false;
@@ -17004,35 +17003,35 @@ bool getDeinterleave4Values(Value *DI,
     return false;
   }
 
-  // These Values will not be used anymre,
+  // These Values will not be used anymore,
   // DI4 will be created instead of nested DI1 and DI2
-  DeadInstructions.push_back(cast<Instruction>(DI1));
-  DeadInstructions.push_back(cast<Instruction>(Extr1));
-  DeadInstructions.push_back(cast<Instruction>(DI2));
-  DeadInstructions.push_back(cast<Instruction>(Extr2));
+  DeadInsts.insert(DeadInsts.end(), DeinterleavedValues.begin(),
+                   DeinterleavedValues.end());
+  DeadInsts.push_back(cast<Instruction>(DI1));
+  DeadInsts.push_back(cast<Instruction>(Extr1));
+  DeadInsts.push_back(cast<Instruction>(DI2));
+  DeadInsts.push_back(cast<Instruction>(Extr2));
 
   return true;
 }
 
 bool getDeinterleavedValues(Value *DI,
                             SmallVectorImpl<Instruction *> &DeinterleavedValues,
-                            SmallVectorImpl<Instruction *> &DeadInstructions) {
+                            SmallVectorImpl<Instruction *> &DeadInsts) {
-  if (getDeinterleave4Values(DI, DeinterleavedValues, DeadInstructions))
+  if (getDeinterleave4Values(DI, DeinterleavedValues, DeadInsts))
     return true;
-  return getDeinterleave2Values(DI, DeinterleavedValues);
+  return getDeinterleave2Values(DI, DeinterleavedValues, DeadInsts);
 }
 
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
-    IntrinsicInst *DI, LoadInst *LI) const {
+    IntrinsicInst *DI, LoadInst *LI,
+    SmallVectorImpl<Instruction *> &DeadInsts) const {
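+  // On success the matched deinterleave tree is returned in DeadInsts;
+  // erasing those instructions is left to the caller.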
   // Only deinterleave2 supported at present.
   if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
     return false;
 
   SmallVector<Instruction *, 4> DeinterleavedValues;
-  SmallVector<Instruction *, 4> DeadInstructions;
   const DataLayout &DL = DI->getModule()->getDataLayout();
 
-  if (!getDeinterleavedValues(DI, DeinterleavedValues, DeadInstructions)) {
+  if (!getDeinterleavedValues(DI, DeinterleavedValues, DeadInsts)) {
     LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
     return false;
   }
@@ -17042,13 +17041,17 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
   VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
 
   bool UseScalable;
-  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
+  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) {
+ DeadInsts.clear();
17046
17046
return false;
17047
+ }
17047
17048
17048
17049
// TODO: Add support for using SVE instructions with fixed types later, using
17049
17050
// the code from lowerInterleavedLoad to obtain the correct container type.
17050
- if (UseScalable && !VTy->isScalableTy())
17051
+ if (UseScalable && !VTy->isScalableTy()) {
17052
+ DeadInsts.clear();
17051
17053
return false;
17054
+ }
17052
17055
17053
17056
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17054
17057
VectorType *LdTy =
@@ -17066,10 +17069,9 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
       Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
 
   Value *BaseAddr = LI->getPointerOperand();
-  Value *Result;
   if (NumLoads > 1) {
-    // Create multiple legal small ldN instead of a wide one.
-    SmallVector<Value *, 4> WideValues(Factor, (PoisonValue::get(VTy)));
+    // Create multiple legal small ldN.
+    SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
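+    // Each small ldN fills one slice of every deinterleaved value; the slices
+    // are stitched back into the wide VTy values by the vector inserts below.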
     for (unsigned I = 0; I < NumLoads; ++I) {
       Value *Offset = Builder.getInt64(I * Factor);
@@ -17082,53 +17084,45 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
       Value *Idx =
           Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
       for (unsigned J = 0; J < Factor; ++J) {
-        WideValues[J] = Builder.CreateInsertVector(
-            VTy, WideValues[J], Builder.CreateExtractValue(LdN, J), Idx);
+        ExtractedLdValues[J] = Builder.CreateInsertVector(
+            VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
       }
+      LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
     }
-    if (Factor == 2)
-      Result = PoisonValue::get(StructType::get(VTy, VTy));
-    else
-      Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy));
-    // Construct the wide result out of the small results.
-    for (unsigned J = 0; J < Factor; ++J) {
-      Result = Builder.CreateInsertValue(Result, WideValues[J], J);
-    }
+    // Replace output of deinterleave2 intrinsic by output of ld2/ld4
+    for (unsigned J = 0; J < Factor; ++J)
+      DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
   } else {
+    Value *Result;
     if (UseScalable)
       Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
     else
       Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
+    // Replace output of deinterleave2 intrinsic by output of ld2/ld4
+    for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
+      Value *NewExtract = Builder.CreateExtractValue(Result, I);
+      DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
+    }
   }
-  // Itereate over old deinterleaved values to replace it by
-  // the new values.
-  for (unsigned I = 0; I < DeinterleavedValues.size(); I++) {
-    Value *NewExtract = Builder.CreateExtractValue(Result, I);
-    DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
-    cast<Instruction>(DeinterleavedValues[I])->eraseFromParent();
-  }
-  for (auto &dead : DeadInstructions)
-    dead->eraseFromParent();
   return true;
 }
 
 /*
-Diagram for Interleave tree.
+InterleaveIntrinsic tree.
          A    C         B    D
           \  /           \  /
-      [Interleave]   [Interleave]
+           [II]           [II]
                \         /
-               [Interleave]
-                    |
-                 [Store]
+                  [II]
+
 values in correct order of interleave4: A B C D.
-If there is a pattern matches the interleave tree above, then we can construct
-Interleave4 out of that pattern. This function tries to match the interleave
-tree pattern, and fetch the values that we want to interleave, so that in
-further steps they can be replaced by the output of Inteleave4.
+Returns true if `II` is the root of an IR tree that represents a theoretical
+vector.interleave4 intrinsic. When true is returned, `ValuesToInterleave` is
+populated with the inputs such an intrinsic would take:
+(i.e. vector.interleave4(A, B, C, D)).
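+For illustration, a hypothetical IR shape this function is intended to match
+(types and value names are made up for the example):
+  %II1 = call <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32> %A, <2 x i32> %C)
+  %II2 = call <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32> %B, <2 x i32> %D)
+  %II  = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %II1, <4 x i32> %II2)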
 */
 bool getValuesToInterleave(Value *II,
-                           SmallVectorImpl<Value *> &ValuesToInterleave) {
+                           SmallVectorImpl<Value *> &ValuesToInterleave,
+                           SmallVectorImpl<Instruction *> &DeadInsts) {
   Value *A, *B, *C, *D;
   // Try to match interleave of Factor 4
   if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
@@ -17137,6 +17131,11 @@ bool getValuesToInterleave(Value *II,
     ValuesToInterleave.push_back(B);
     ValuesToInterleave.push_back(C);
     ValuesToInterleave.push_back(D);
+    // The intermediate IIs will not be needed anymore.
+    Value *II1, *II2;
+    bool IsMatch = match(II, m_Interleave2(m_Value(II1), m_Value(II2)));
+    assert(IsMatch && "II tree is expected");
+    (void)IsMatch;
+    DeadInsts.push_back(cast<Instruction>(II1));
+    DeadInsts.push_back(cast<Instruction>(II2));
     return true;
   }
@@ -17151,13 +17150,13 @@ bool getValuesToInterleave(Value *II,
 }
 
 bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
-    IntrinsicInst *II, StoreInst *SI) const {
+    IntrinsicInst *II, StoreInst *SI,
+    SmallVectorImpl<Instruction *> &DeadInsts) const {
   // Only interleave2 supported at present.
   if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
     return false;
 
   SmallVector<Value *, 4> ValuesToInterleave;
-  if (!getValuesToInterleave(II, ValuesToInterleave)) {
+  if (!getValuesToInterleave(II, ValuesToInterleave, DeadInsts)) {
     LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
     return false;
   }
@@ -17168,13 +17167,17 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
   const DataLayout &DL = II->getModule()->getDataLayout();
 
   bool UseScalable;
-  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
+  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable)) {
+    DeadInsts.clear();
     return false;
+  }
 
   // TODO: Add support for using SVE instructions with fixed types later, using
   // the code from lowerInterleavedStore to obtain the correct container type.
-  if (UseScalable && !VTy->isScalableTy())
+  if (UseScalable && !VTy->isScalableTy()) {
+    DeadInsts.clear();
     return false;
+  }
 
   unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);