Skip to content

Commit 71be020

Browse files
Ting WangTing Wang
authored and committed
[SelectionDAG][PowerPC] Memset reuse vector element for tail store
On PPC there are instructions that store an element directly from a vector register (e.g. stxsdx/stxsiwx), and these instructions can be leveraged to avoid materializing a separate constant for the tail store in memset lowering and in constant-splat array initialization. This patch explores these opportunities. Reviewed By: shchenz. Differential Revision: https://reviews.llvm.org/D138883
1 parent ebf0169 commit 71be020

File tree

6 files changed

+149
-166
lines changed

6 files changed

+149
-166
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,15 @@ class TargetLoweringBase {
845845
return false;
846846
}
847847

848+
/// Return true if the target wants to extract a vector element and store it,
849+
/// given that the vector is known to be a splat of a constant.
850+
/// \p Index [out] receives the index of the vector element to extract when
851+
/// this returns true. This default implementation always returns false.
852+
virtual bool shallExtractConstSplatVectorElementToStore(
853+
Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
854+
return false;
855+
}
856+
848857
/// Return true if inserting a scalar into a variable element of an undef
849858
/// vector is more efficiently handled by splatting the scalar instead.
850859
virtual bool shouldSplatInsEltVarIndex(EVT) const {

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7757,13 +7757,28 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
77577757
}
77587758

77597759
// If this store is smaller than the largest store see whether we can get
7760-
// the smaller value for free with a truncate.
7760+
// the smaller value for free with a truncate or extract vector element and
7761+
// then store.
77617762
SDValue Value = MemSetValue;
77627763
if (VT.bitsLT(LargestVT)) {
7764+
unsigned Index;
7765+
unsigned NElts = LargestVT.getSizeInBits() / VT.getSizeInBits();
7766+
EVT SVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), NElts);
77637767
if (!LargestVT.isVector() && !VT.isVector() &&
77647768
TLI.isTruncateFree(LargestVT, VT))
77657769
Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue);
7766-
else
7770+
else if (LargestVT.isVector() && !VT.isVector() &&
7771+
TLI.shallExtractConstSplatVectorElementToStore(
7772+
LargestVT.getTypeForEVT(*DAG.getContext()),
7773+
VT.getSizeInBits(), Index) &&
7774+
TLI.isTypeLegal(SVT) &&
7775+
LargestVT.getSizeInBits() == SVT.getSizeInBits()) {
7776+
// Target which can combine store(extractelement VectorTy, Idx) can get
7777+
// the smaller value for free.
7778+
SDValue TailValue = DAG.getNode(ISD::BITCAST, dl, SVT, MemSetValue);
7779+
Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, TailValue,
7780+
DAG.getVectorIdxConstant(Index, dl));
7781+
} else
77677782
Value = getMemsetValue(Src, VT, DAG, dl);
77687783
}
77697784
assert(Value.getValueType() == VT && "Value with wrong type.");

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1635,6 +1635,27 @@ bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
16351635
return VT.isScalarInteger();
16361636
}
16371637

1638+
/// PowerPC implementation of the extract-splat-element-and-store hook.
///
/// Returns true, and writes the element index to extract into \p Index, only
/// when all of the following hold:
///   - the subtarget is 64-bit PowerPC with VSX,
///   - \p VectorTy is a vector type whose scalar type is an integer, and
///   - \p ElemSizeInBits is 32 or 64.
/// In every other case it returns false and leaves \p Index unmodified.
///
/// NOTE(review): the chosen indices presumably name the element that the
/// vector-element store instructions (stxsiwx / stxsdx) can store directly
/// for each endianness -- confirm against the Power ISA.
bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore(
1639+
Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
1640+
// Opt out unless this is a 64-bit subtarget with VSX.
if (!Subtarget.isPPC64() || !Subtarget.hasVSX())
1641+
return false;
1642+
1643+
// Only vector types with integer elements are considered.
if (auto *VTy = dyn_cast<VectorType>(VectorTy)) {
1644+
if (VTy->getScalarType()->isIntegerTy()) {
1645+
// 8- and 16-bit element sizes are deliberately not handled: such values
// can fit in an immediate field, so the extract-and-store is not needed.
1646+
if (ElemSizeInBits == 32) {
1647+
// 32-bit element: index 2 on little-endian, index 1 on big-endian.
Index = Subtarget.isLittleEndian() ? 2 : 1;
1648+
return true;
1649+
}
1650+
if (ElemSizeInBits == 64) {
1651+
// 64-bit element: index 1 on little-endian, index 0 on big-endian.
Index = Subtarget.isLittleEndian() ? 1 : 0;
1652+
return true;
1653+
}
1654+
}
1655+
}
1656+
// Non-vector type, non-integer elements, or an unsupported element size.
return false;
1657+
}
1658+
16381659
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
16391660
switch ((PPCISD::NodeType)Opcode) {
16401661
case PPCISD::FIRST_NUMBER: break;
@@ -17086,10 +17107,20 @@ EVT PPCTargetLowering::getOptimalMemOpType(
1708617107
if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
1708717108
// We should use Altivec/VSX loads and stores when available. For unaligned
1708817109
// addresses, unaligned VSX loads are only fast starting with the P8.
17089-
if (Subtarget.hasAltivec() && Op.size() >= 16 &&
17090-
(Op.isAligned(Align(16)) ||
17091-
((Op.isMemset() && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
17092-
return MVT::v4i32;
17110+
if (Subtarget.hasAltivec() && Op.size() >= 16) {
17111+
if (Op.isMemset() && Subtarget.hasVSX()) {
17112+
uint64_t TailSize = Op.size() % 16;
17113+
// For memset lowering, EXTRACT_VECTOR_ELT tries to return constant
17114+
// element if vector element type matches tail store. For tail size
17115+
// 3/4, the tail store is i32, v4i32 cannot be used, need a legal one.
17116+
if (TailSize > 2 && TailSize <= 4) {
17117+
return MVT::v8i16;
17118+
}
17119+
return MVT::v4i32;
17120+
}
17121+
if (Op.isAligned(Align(16)) || Subtarget.hasP8Vector())
17122+
return MVT::v4i32;
17123+
}
1709317124
}
1709417125

1709517126
if (Subtarget.isPPC64()) {

llvm/lib/Target/PowerPC/PPCISelLowering.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,11 @@ namespace llvm {
791791
return true;
792792
}
793793

794+
/// PowerPC override of the TargetLoweringBase hook of the same name. When
/// it returns true, \p Index holds the index of the vector element to
/// extract. Declared here only; the definition is out of line.
bool
795+
shallExtractConstSplatVectorElementToStore(Type *VectorTy,
796+
unsigned ElemSizeInBits,
797+
unsigned &Index) const override;
798+
794799
bool isCtlzFast() const override {
795800
return true;
796801
}

llvm/lib/Target/PowerPC/PPCInstrP10.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2031,8 +2031,15 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, IsLittleEndian] in {
20312031
(v8i16 (COPY_TO_REGCLASS (LXVRHX ForceXForm:$src), VSRC))>;
20322032
def : Pat<(v16i8 (scalar_to_vector (i32 (extloadi8 ForceXForm:$src)))),
20332033
(v16i8 (COPY_TO_REGCLASS (LXVRBX ForceXForm:$src), VSRC))>;
2034+
def : Pat<(store (i64 (extractelt v2i64:$A, 1)), ForceXForm:$src),
2035+
(XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
20342036
}
20352037

2038+
// Big-endian, ISA 3.1: select a store of element 0 extracted from a v2i64
// to XFSTOREf64 applied to the sub_64 subregister of $A.
let Predicates = [IsISA3_1, IsBigEndian] in {
2039+
def : Pat<(store (i64 (extractelt v2i64:$A, 0)), ForceXForm:$src),
2040+
(XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), ForceXForm:$src)>;
2041+
}
2042+
20362043
// FIXME: The swap is overkill when the shift amount is a constant.
20372044
// We should just fix the constant in the DAG.
20382045
let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in {

0 commit comments

Comments
 (0)