@@ -10834,7 +10834,8 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
10834
10834
if (Depth >= 6)
10835
10835
return std::nullopt;
10836
10836
10837
- if (Op.getValueSizeInBits() < 8)
10837
+ auto ValueSize = Op.getValueSizeInBits();
10838
+ if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
10838
10839
return std::nullopt;
10839
10840
10840
10841
switch (Op->getOpcode()) {
@@ -11125,6 +11126,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11125
11126
auto VecIdx = IdxOp->getZExtValue();
11126
11127
auto ScalarSize = Op.getScalarValueSizeInBits();
11127
11128
if (ScalarSize != 32) {
11129
+ if ((VecIdx + 1) * ScalarSize > 32)
11130
+ return std::nullopt;
11128
11131
Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
11129
11132
}
11130
11133
@@ -11210,6 +11213,9 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
11210
11213
int Low16 = PermMask & 0xffff;
11211
11214
int Hi16 = (PermMask & 0xffff0000) >> 16;
11212
11215
11216
+ assert(Op.getValueType().isByteSized());
11217
+ assert(OtherOp.getValueType().isByteSized());
11218
+
11213
11219
auto TempOp = peekThroughBitcasts(Op);
11214
11220
auto TempOtherOp = peekThroughBitcasts(OtherOp);
11215
11221
@@ -11227,38 +11233,15 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
11227
11233
return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
11228
11234
}
11229
11235
11230
- static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
11231
- unsigned DWordOffset) {
11232
- SDValue Ret;
11233
- if (Src.getValueSizeInBits() <= 32)
11234
- return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
11235
-
11236
- if (Src.getValueSizeInBits() >= 256) {
11237
- assert(!(Src.getValueSizeInBits() % 32));
11238
- Ret = DAG.getBitcast(
11239
- MVT::getVectorVT(MVT::i32, Src.getValueSizeInBits() / 32), Src);
11240
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ret,
11241
- DAG.getConstant(DWordOffset, SL, MVT::i32));
11242
- }
11243
-
11244
- Ret = DAG.getBitcastedAnyExtOrTrunc(
11245
- Src, SL, MVT::getIntegerVT(Src.getValueSizeInBits()));
11246
- if (DWordOffset) {
11247
- auto Shifted = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
11248
- DAG.getConstant(DWordOffset * 32, SL, MVT::i32));
11249
- return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Shifted);
11250
- }
11251
-
11252
- return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
11253
- }
11254
-
11255
11236
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
11256
11237
SelectionDAG &DAG = DCI.DAG;
11257
- [[maybe_unused]] EVT VT = N->getValueType(0);
11258
- SmallVector<ByteProvider<SDValue>, 8> PermNodes;
11238
+ EVT VT = N->getValueType(0);
11239
+
11240
+ if (VT != MVT::i32)
11241
+ return SDValue();
11259
11242
11260
11243
// VT is known to be MVT::i32, so we need to provide 4 bytes.
11261
- assert(VT == MVT::i32) ;
11244
+ SmallVector<ByteProvider<SDValue>, 8> PermNodes ;
11262
11245
for (int i = 0; i < 4; i++) {
11263
11246
// Find the ByteProvider that provides the ith byte of the result of OR
11264
11247
std::optional<ByteProvider<SDValue>> P =
@@ -11272,40 +11255,42 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
11272
11255
if (PermNodes.size() != 4)
11273
11256
return SDValue();
11274
11257
11275
- std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4) ;
11276
- std::optional<std::pair<unsigned, unsigned> > SecondSrc;
11258
+ int FirstSrc = 0 ;
11259
+ std::optional<int > SecondSrc;
11277
11260
uint64_t PermMask = 0x00000000;
11278
11261
for (size_t i = 0; i < PermNodes.size(); i++) {
11279
11262
auto PermOp = PermNodes[i];
11280
11263
// Since the mask is applied to Src1:Src2, Src1 bytes must be offset
11281
11264
// by sizeof(Src2) = 4
11282
11265
int SrcByteAdjust = 4;
11283
11266
11284
- // If the Src uses a byte from a different DWORD, then it corresponds
11285
- // with a difference source
11286
- if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
11287
- ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
11288
- if (SecondSrc)
11289
- if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
11290
- ((PermOp.SrcOffset / 4) != SecondSrc->second))
11267
+ if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
11268
+ if (SecondSrc.has_value())
11269
+ if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
11291
11270
return SDValue();
11292
11271
11293
11272
// Set the index of the second distinct Src node
11294
- SecondSrc = {i, PermNodes[i].SrcOffset / 4} ;
11295
- assert(!(PermNodes[SecondSrc->first ].Src->getValueSizeInBits() % 8));
11273
+ SecondSrc = i ;
11274
+ assert(!(PermNodes[* SecondSrc].Src->getValueSizeInBits() % 8));
11296
11275
SrcByteAdjust = 0;
11297
11276
}
11298
- assert(( PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
11277
+ assert(PermOp.SrcOffset + SrcByteAdjust < 8);
11299
11278
assert(!DAG.getDataLayout().isBigEndian());
11300
- PermMask |= (( PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
11279
+ PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
11301
11280
}
11302
- SDLoc DL(N);
11303
- SDValue Op = *PermNodes[FirstSrc.first].Src;
11304
- Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
11305
- assert(Op.getValueSizeInBits() == 32);
11281
+
11282
+ SDValue Op = *PermNodes[FirstSrc].Src;
11283
+ SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
11284
+ : *PermNodes[FirstSrc].Src;
11285
+
11286
+ // Check that we haven't just recreated the same FSHR node.
11287
+ if (N->getOpcode() == ISD::FSHR &&
11288
+ (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
11289
+ (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
11290
+ return SDValue();
11306
11291
11307
11292
// Check that we are not just extracting the bytes in order from an op
11308
- if (!SecondSrc ) {
11293
+ if (Op == OtherOp && Op.getValueSizeInBits() == 32 ) {
11309
11294
int Low16 = PermMask & 0xffff;
11310
11295
int Hi16 = (PermMask & 0xffff0000) >> 16;
11311
11296
@@ -11317,16 +11302,8 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
11317
11302
return DAG.getBitcast(MVT::getIntegerVT(32), Op);
11318
11303
}
11319
11304
11320
- SDValue OtherOp =
11321
- SecondSrc.has_value() ? *PermNodes[SecondSrc->first].Src : Op;
11322
-
11323
- if (SecondSrc)
11324
- OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
11325
-
11326
- assert(Op.getValueSizeInBits() == 32);
11327
-
11328
11305
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
11329
-
11306
+ SDLoc DL(N);
11330
11307
assert(Op.getValueType().isByteSized() &&
11331
11308
OtherOp.getValueType().isByteSized());
11332
11309
@@ -11341,6 +11318,7 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
11341
11318
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
11342
11319
DAG.getConstant(PermMask, DL, MVT::i32));
11343
11320
}
11321
+
11344
11322
return SDValue();
11345
11323
}
11346
11324
@@ -12816,24 +12794,17 @@ static unsigned addPermMasks(unsigned First, unsigned Second) {
12816
12794
return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
12817
12795
}
12818
12796
12819
- struct DotSrc {
12820
- SDValue SrcOp;
12821
- int64_t PermMask;
12822
- int64_t DWordOffset;
12823
- };
12824
-
12825
12797
static void placeSources(ByteProvider<SDValue> &Src0,
12826
12798
ByteProvider<SDValue> &Src1,
12827
- SmallVectorImpl<DotSrc> &Src0s,
12828
- SmallVectorImpl<DotSrc> &Src1s, int Step) {
12799
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Src0s,
12800
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Src1s,
12801
+ int Step) {
12829
12802
12830
12803
assert(Src0.Src.has_value() && Src1.Src.has_value());
12831
12804
// Src0s and Src1s are empty, just place arbitrarily.
12832
12805
if (Step == 0) {
12833
- Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
12834
- Src0.SrcOffset / 4});
12835
- Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
12836
- Src1.SrcOffset / 4});
12806
+ Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c});
12807
+ Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c});
12837
12808
return;
12838
12809
}
12839
12810
@@ -12846,38 +12817,38 @@ static void placeSources(ByteProvider<SDValue> &Src0,
12846
12817
unsigned FMask = 0xFF << (8 * (3 - Step));
12847
12818
12848
12819
unsigned FirstMask =
12849
- ( BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
12820
+ BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
12850
12821
unsigned SecondMask =
12851
- ( BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
12822
+ BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
12852
12823
// Attempt to find Src vector which contains our SDValue, if so, add our
12853
12824
// perm mask to the existing one. If we are unable to find a match for the
12854
12825
// first SDValue, attempt to find match for the second.
12855
12826
int FirstGroup = -1;
12856
12827
for (int I = 0; I < 2; I++) {
12857
- SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
12858
- auto MatchesFirst = [&BPP](DotSrc &IterElt) {
12859
- return IterElt.SrcOp == * BPP.first.Src &&
12860
- ( IterElt.DWordOffset == ( BPP.first.SrcOffset / 4)) ;
12828
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
12829
+ I == 0 ? Src0s : Src1s;
12830
+ auto MatchesFirst = [& BPP](std::pair<SDValue, unsigned> IterElt) {
12831
+ return IterElt.first == * BPP.first.Src ;
12861
12832
};
12862
12833
12863
12834
auto Match = llvm::find_if(Srcs, MatchesFirst);
12864
12835
if (Match != Srcs.end()) {
12865
- Match->PermMask = addPermMasks(FirstMask, Match->PermMask );
12836
+ Match->second = addPermMasks(FirstMask, Match->second );
12866
12837
FirstGroup = I;
12867
12838
break;
12868
12839
}
12869
12840
}
12870
12841
if (FirstGroup != -1) {
12871
- SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
12872
- auto MatchesSecond = [&BPP](DotSrc &IterElt) {
12873
- return IterElt.SrcOp == * BPP.second.Src &&
12874
- ( IterElt.DWordOffset == ( BPP.second.SrcOffset / 4)) ;
12842
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
12843
+ FirstGroup == 1 ? Src0s : Src1s;
12844
+ auto MatchesSecond = [& BPP](std::pair<SDValue, unsigned> IterElt) {
12845
+ return IterElt.first == * BPP.second.Src ;
12875
12846
};
12876
12847
auto Match = llvm::find_if(Srcs, MatchesSecond);
12877
12848
if (Match != Srcs.end()) {
12878
- Match->PermMask = addPermMasks(SecondMask, Match->PermMask );
12849
+ Match->second = addPermMasks(SecondMask, Match->second );
12879
12850
} else
12880
- Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4 });
12851
+ Srcs.push_back({*BPP.second.Src, SecondMask});
12881
12852
return;
12882
12853
}
12883
12854
}
@@ -12889,32 +12860,29 @@ static void placeSources(ByteProvider<SDValue> &Src0,
12889
12860
unsigned FMask = 0xFF << (8 * (3 - Step));
12890
12861
12891
12862
Src0s.push_back(
12892
- {*Src0.Src,
12893
- ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
12894
- Src1.SrcOffset / 4});
12863
+ {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
12895
12864
Src1s.push_back(
12896
- {*Src1.Src,
12897
- ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
12898
- Src1.SrcOffset / 4});
12865
+ {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
12899
12866
12900
12867
return;
12901
12868
}
12902
12869
12903
- static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
12904
- SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
12905
- bool IsAny) {
12870
+ static SDValue
12871
+ resolveSources(SelectionDAG &DAG, SDLoc SL,
12872
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
12873
+ bool IsSigned, bool IsAny) {
12906
12874
12907
12875
// If we just have one source, just permute it accordingly.
12908
12876
if (Srcs.size() == 1) {
12909
12877
auto Elt = Srcs.begin();
12910
- auto EltOp = getDWordFromOffset( DAG, SL, Elt->SrcOp, Elt->DWordOffset );
12878
+ auto EltVal = DAG.getBitcastedAnyExtOrTrunc( Elt->first, SL, MVT::i32 );
12911
12879
12912
- // v_perm will produce the original value
12913
- if (Elt->PermMask == 0x3020100)
12914
- return EltOp ;
12880
+ // v_perm will produce the original value.
12881
+ if (Elt->second == 0x3020100)
12882
+ return EltVal ;
12915
12883
12916
- return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp ,
12917
- DAG.getConstant(Elt->PermMask , SL, MVT::i32));
12884
+ return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal ,
12885
+ DAG.getConstant(Elt->second , SL, MVT::i32));
12918
12886
}
12919
12887
12920
12888
auto FirstElt = Srcs.begin();
@@ -12925,8 +12893,8 @@ static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
12925
12893
// If we have multiple sources in the chain, combine them via perms (using
12926
12894
// calculated perm mask) and Ors.
12927
12895
while (true) {
12928
- auto FirstMask = FirstElt->PermMask ;
12929
- auto SecondMask = SecondElt->PermMask ;
12896
+ auto FirstMask = FirstElt->second ;
12897
+ auto SecondMask = SecondElt->second ;
12930
12898
12931
12899
unsigned FirstCs = FirstMask & 0x0c0c0c0c;
12932
12900
unsigned FirstPlusFour = FirstMask | 0x04040404;
@@ -12936,9 +12904,9 @@ static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
12936
12904
12937
12905
auto PermMask = addPermMasks(FirstMask, SecondMask);
12938
12906
auto FirstVal =
12939
- getDWordFromOffset( DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset );
12907
+ DAG.getBitcastedAnyExtOrTrunc( FirstElt->first, SL, MVT::i32 );
12940
12908
auto SecondVal =
12941
- getDWordFromOffset( DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset );
12909
+ DAG.getBitcastedAnyExtOrTrunc( SecondElt->first, SL, MVT::i32 );
12942
12910
12943
12911
Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
12944
12912
SecondVal,
@@ -12952,12 +12920,12 @@ static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
12952
12920
// If we only have a FirstElt, then just combine that into the cumulative
12953
12921
// source node.
12954
12922
if (SecondElt == Srcs.end()) {
12955
- auto EltOp =
12956
- getDWordFromOffset( DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset );
12923
+ auto EltVal =
12924
+ DAG.getBitcastedAnyExtOrTrunc( FirstElt->first, SL, MVT::i32 );
12957
12925
12958
12926
Perms.push_back(
12959
- DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp ,
12960
- DAG.getConstant(FirstElt->PermMask , SL, MVT::i32)));
12927
+ DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal ,
12928
+ DAG.getConstant(FirstElt->second , SL, MVT::i32)));
12961
12929
break;
12962
12930
}
12963
12931
}
@@ -12968,8 +12936,9 @@ static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
12968
12936
: Perms[0];
12969
12937
}
12970
12938
12971
- static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
12972
- for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
12939
+ static void fixMasks(SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
12940
+ unsigned ChainLength) {
12941
+ for (auto &[EntryVal, EntryMask] : Srcs) {
12973
12942
EntryMask = EntryMask >> ((4 - ChainLength) * 8);
12974
12943
auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
12975
12944
EntryMask += ZeroMask;
@@ -13034,8 +13003,8 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
13034
13003
(Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
13035
13004
SDValue TempNode(N, 0);
13036
13005
std::optional<bool> IsSigned;
13037
- SmallVector<DotSrc , 4> Src0s;
13038
- SmallVector<DotSrc , 4> Src1s;
13006
+ SmallVector<std::pair<SDValue, unsigned> , 4> Src0s;
13007
+ SmallVector<std::pair<SDValue, unsigned> , 4> Src1s;
13039
13008
SmallVector<SDValue, 4> Src2s;
13040
13009
13041
13010
// Match the v_dot4 tree, while collecting src nodes.
@@ -13113,11 +13082,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
13113
13082
// (commutation).
13114
13083
bool UseOriginalSrc = false;
13115
13084
if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
13116
- Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
13117
- Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
13118
- Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
13085
+ Src0s.begin()->second == Src1s.begin()->second &&
13086
+ Src0s.begin()->first.getValueSizeInBits() == 32 &&
13087
+ Src1s.begin()->first.getValueSizeInBits() == 32) {
13119
13088
SmallVector<unsigned, 4> SrcBytes;
13120
- auto Src0Mask = Src0s.begin()->PermMask ;
13089
+ auto Src0Mask = Src0s.begin()->second ;
13121
13090
SrcBytes.push_back(Src0Mask & 0xFF000000);
13122
13091
bool UniqueEntries = true;
13123
13092
for (auto I = 1; I < 4; I++) {
@@ -13132,19 +13101,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
13132
13101
13133
13102
if (UniqueEntries) {
13134
13103
UseOriginalSrc = true;
13135
-
13136
- auto FirstElt = Src0s.begin();
13137
- auto FirstEltOp =
13138
- getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13139
-
13140
- auto SecondElt = Src1s.begin();
13141
- auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
13142
- SecondElt->DWordOffset);
13143
-
13144
- Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
13145
- MVT::getIntegerVT(32));
13146
- Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
13147
- MVT::getIntegerVT(32));
13104
+ // Must be 32 bits to enter above conditional.
13105
+ assert(Src0s.begin()->first.getValueSizeInBits() == 32);
13106
+ assert(Src1s.begin()->first.getValueSizeInBits() == 32);
13107
+ Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first);
13108
+ Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first);
13148
13109
}
13149
13110
}
13150
13111
0 commit comments