
Commit 95ab426

[X86] Attempt to canonicalize vXf64 SHUFPD shuffle masks with undef elts to improve further folding (#116419)
Currently, when creating a SHUFPD immediate mask, any undef shuffle elements are set to 0, which can limit options for further shuffle combining. This patch attempts to canonicalize the mask to improve folding: first by detecting a per-lane broadcast-style mask (which can allow us to fold to UNPCK instead), and second by ensuring any undef elements are set to an 'inplace' value, improving the chances of the SHUFPD later folding to a BLENDPD (or being bypassed in a SimplifyMultipleUseDemandedVectorElts call). This is very similar to the canonicalization we already attempt in getV4X86ShuffleImm for vXi32/vXf32 SHUFPS/SHUFD shuffles.
1 parent 6f53ae6 commit 95ab426

9 files changed: +279 additions, -244 deletions
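As a rough illustration of the canonicalization described above, here is a standalone sketch. It is not the patch's code: it uses plain STL containers instead of llvm::ArrayRef, and the function and driver names are invented for this example. It applies the same two rules to a v4f64 SHUFPD mask:

#include <algorithm>
#include <cstdio>
#include <vector>

// Mask elements: -1 = undef, 0 = low element of the lane, 1 = high element.
// Assumes at least one non-undef element, mirroring the assert in the patch.
static unsigned canonicalizeSHUFPDImm(const std::vector<int> &Mask) {
  auto First = std::find_if(Mask.begin(), Mask.end(),
                            [](int M) { return M >= 0; });
  int FirstElt = *First;
  bool AllSame = std::all_of(Mask.begin(), Mask.end(),
                             [&](int M) { return M < 0 || M == FirstElt; });
  auto Count = std::count(Mask.begin(), Mask.end(), FirstElt);

  unsigned Imm = 0;
  if (AllSame && Count > 1) {
    // Rule 1: a per-lane broadcast-style mask - splat the single referenced
    // element across every bit so the shuffle can later fold to UNPCKL/H.
    for (unsigned I = 0; I != Mask.size(); ++I)
      Imm |= unsigned(FirstElt) << I;
    return Imm;
  }

  // Rule 2: keep undef elements 'in place' (element I defaults to I & 1) so
  // the immediate has the best chance of matching a BLENDPD-style pattern.
  for (unsigned I = 0; I != Mask.size(); ++I)
    Imm |= unsigned(Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
  return Imm;
}

int main() {
  // Every non-undef entry selects the high lane element -> splat to 0xf,
  // which matches UNPCKHPD exactly.
  std::printf("%#x\n", canonicalizeSHUFPDImm({1, -1, -1, 1}));
  // Mixed mask with an undef at element 1 -> 0xa, the blend-friendly pattern
  // (previously the undef would have become 0, giving 0x8 instead).
  std::printf("%#x\n", canonicalizeSHUFPDImm({0, -1, 0, 1}));
  return 0;
}

With the splat rule, a mask such as {1, -1, -1, 1} becomes immediate 0xf, which the combiner can match as UNPCKHPD; with the in-place rule, {0, -1, 0, 1} becomes 0xa, the selection pattern a BLENDPD can absorb.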

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 48 additions & 11 deletions
@@ -9926,6 +9926,42 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
   return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
 }
 
+// Canonicalize SHUFPD mask to improve chances of further folding.
+// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
+static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
+  assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
+         "Unexpected SHUFPD mask size");
+  assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
+         "Unexpected SHUFPD mask elements");
+
+  // If the mask only uses one non-undef element, then fully 'splat' it to
+  // improve later broadcast matching.
+  int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
+  assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
+
+  int FirstElt = Mask[FirstIndex];
+  if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
+      count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
+    unsigned Imm = 0;
+    for (unsigned I = 0, E = Mask.size(); I != E; ++I)
+      Imm |= FirstElt << I;
+    return Imm;
+  }
+
+  // Attempt to keep any undef elements in place to improve chances of the
+  // shuffle becoming a (commutative) blend.
+  unsigned Imm = 0;
+  for (unsigned I = 0, E = Mask.size(); I != E; ++I)
+    Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
+
+  return Imm;
+}
+
+static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
+                                   SelectionDAG &DAG) {
+  return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
+}
+
 // The Shuffle result is as follow:
 // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
 // Each Zeroable's element correspond to a particular Mask's element.
@@ -14871,7 +14907,7 @@ static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
 
   int LHSMask[4] = {-1, -1, -1, -1};
   int RHSMask[4] = {-1, -1, -1, -1};
-  unsigned SHUFPMask = 0;
+  int SHUFPDMask[4] = {-1, -1, -1, -1};
 
   // As SHUFPD uses a single LHS/RHS element per lane, we can always
   // perform the shuffle once the lanes have been shuffled in place.
@@ -14882,13 +14918,13 @@ static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
     int LaneBase = i & ~1;
     auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
     LaneMask[LaneBase + (M & 1)] = M;
-    SHUFPMask |= (M & 1) << i;
+    SHUFPDMask[i] = M & 1;
   }
 
   SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
   SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
   return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
-                     DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
+                     getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
 }
 
 /// Lower a vector shuffle crossing multiple 128-bit lanes as
@@ -15800,9 +15836,9 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
 
   // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
   // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
-  ShuffleImm = 0;
-  bool ShufpdMask = true;
-  bool CommutableMask = true;
+  bool IsSHUFPD = true;
+  bool IsCommutable = true;
+  SmallVector<int, 8> SHUFPDMask(NumElts, -1);
   for (int i = 0; i < NumElts; ++i) {
     if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
       continue;
@@ -15811,20 +15847,21 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
     int Val = (i & 6) + NumElts * (i & 1);
     int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
     if (Mask[i] < Val || Mask[i] > Val + 1)
-      ShufpdMask = false;
+      IsSHUFPD = false;
     if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
-      CommutableMask = false;
-    ShuffleImm |= (Mask[i] % 2) << i;
+      IsCommutable = false;
+    SHUFPDMask[i] = Mask[i] % 2;
   }
 
-  if (!ShufpdMask && !CommutableMask)
+  if (!IsSHUFPD && !IsCommutable)
     return false;
 
-  if (!ShufpdMask && CommutableMask)
+  if (!IsSHUFPD && IsCommutable)
     std::swap(V1, V2);
 
   ForceV1Zero = ZeroLane[0];
   ForceV2Zero = ZeroLane[1];
+  ShuffleImm = getSHUFPDImm(SHUFPDMask);
   return true;
 }
 
llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 1 addition & 1 deletion
@@ -4756,7 +4756,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
 ; AVX-NEXT:    vpaddb 48(%rsi), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm3
-; AVX-NEXT:    vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[2]
+; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
 ; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2

llvm/test/CodeGen/X86/subvector-broadcast.ll

Lines changed: 2 additions & 2 deletions
@@ -1667,13 +1667,13 @@ define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(ptr %vp, <8 x float> %default
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vbroadcastsd (%eax), %ymm1
-; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
+; X86-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
-; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
+; X64-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
 ; X64-NEXT:    retq
   %vec = load <2 x float>, ptr %vp
   %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
