Skip to content

Commit bff9aaf

Browse files
author
git apple-llvm automerger
committed
Merge commit 'e34b1d3c3f7f' from apple/master into swift/master-next
2 parents 7de4189 + e34b1d3 commit bff9aaf

File tree

7 files changed

+628
-632
lines changed

7 files changed

+628
-632
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 80 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -15550,53 +15550,94 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
1555015550
int NumElts = VT.getVectorNumElements();
1555115551
int NumLanes = VT.getSizeInBits() / 128;
1555215552
int NumEltsPerLane = NumElts / NumLanes;
15553+
bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15554+
15555+
/// Attempts to find a sublane permute with the given size
15556+
/// that gets all elements into their target lanes.
15557+
///
15558+
/// If successful, fills CrossLaneMask and InLaneMask and returns the new
15560+
/// shuffle; if unsuccessful, returns SDValue() and may overwrite InLaneMask.
15560+
auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15561+
int NumSublanesPerLane = NumSublanes / NumLanes;
15562+
int NumEltsPerSublane = NumElts / NumSublanes;
15563+
15564+
SmallVector<int, 16> CrossLaneMask;
15565+
SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15566+
// CrossLaneMask but one entry == one sublane.
15567+
SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
1555315568

15554-
SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
15555-
SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
15556-
15557-
for (int i = 0; i != NumElts; ++i) {
15558-
int M = Mask[i];
15559-
if (M < 0)
15560-
continue;
15569+
for (int i = 0; i != NumElts; ++i) {
15570+
int M = Mask[i];
15571+
if (M < 0)
15572+
continue;
1556115573

15562-
// Ensure that each lane comes from a single source lane.
15563-
int SrcLane = M / NumEltsPerLane;
15564-
int DstLane = i / NumEltsPerLane;
15565-
if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
15566-
return SDValue();
15567-
SrcLaneMask[DstLane] = SrcLane;
15574+
int SrcSublane = M / NumEltsPerSublane;
15575+
int DstLane = i / NumEltsPerLane;
1556815576

15569-
PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
15570-
}
15577+
// We only need to get the elements into the right lane, not sublane.
15578+
// So search all sublanes that make up the destination lane.
15579+
bool Found = false;
15580+
int DstSubStart = DstLane * NumSublanesPerLane;
15581+
int DstSubEnd = DstSubStart + NumSublanesPerLane;
15582+
for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15583+
if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15584+
continue;
1557115585

15572-
// Make sure we set all elements of the lane mask, to avoid undef propagation.
15573-
SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
15574-
for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
15575-
int SrcLane = SrcLaneMask[DstLane];
15576-
if (0 <= SrcLane)
15577-
for (int j = 0; j != NumEltsPerLane; ++j) {
15578-
LaneMask[(DstLane * NumEltsPerLane) + j] =
15579-
(SrcLane * NumEltsPerLane) + j;
15586+
Found = true;
15587+
CrossLaneMaskLarge[DstSublane] = SrcSublane;
15588+
int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15589+
InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15590+
break;
1558015591
}
15581-
}
15592+
if (!Found)
15593+
return SDValue();
15594+
}
1558215595

15583-
// If we're only shuffling a single lowest lane and the rest are identity
15584-
// then don't bother.
15585-
// TODO - isShuffleMaskInputInPlace could be extended to something like this.
15586-
int NumIdentityLanes = 0;
15587-
bool OnlyShuffleLowestLane = true;
15588-
for (int i = 0; i != NumLanes; ++i) {
15589-
if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
15590-
i * NumEltsPerLane))
15591-
NumIdentityLanes++;
15592-
else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
15593-
OnlyShuffleLowestLane = false;
15594-
}
15595-
if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15596+
// Fill CrossLaneMask using CrossLaneMaskLarge.
15597+
narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15598+
15599+
if (!CanUseSublanes) {
15600+
// If we're only shuffling a single lowest lane and the rest are identity
15601+
// then don't bother.
15602+
// TODO - isShuffleMaskInputInPlace could be extended to something like
15603+
// this.
15604+
int NumIdentityLanes = 0;
15605+
bool OnlyShuffleLowestLane = true;
15606+
for (int i = 0; i != NumLanes; ++i) {
15607+
int LaneOffset = i * NumEltsPerLane;
15608+
if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15609+
i * NumEltsPerLane))
15610+
NumIdentityLanes++;
15611+
else if (CrossLaneMask[LaneOffset] != 0)
15612+
OnlyShuffleLowestLane = false;
15613+
}
15614+
if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15615+
return SDValue();
15616+
}
15617+
15618+
SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15619+
return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15620+
InLaneMask);
15621+
};
15622+
15623+
// First attempt a solution with full lanes.
15624+
if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15625+
return V;
15626+
15627+
// The rest of the solutions use sublanes.
15628+
if (!CanUseSublanes)
15629+
return SDValue();
15630+
15631+
// Then attempt a solution with 64-bit sublanes (vpermq).
15632+
if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15633+
return V;
15634+
15635+
// If that doesn't work and we have fast variable shuffle,
15636+
// attempt 32-bit sublanes (vpermd).
15637+
if (!Subtarget.hasFastVariableShuffle())
1559615638
return SDValue();
1559715639

15598-
SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
15599-
return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
15640+
return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
1560015641
}
1560115642

1560215643
/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one

llvm/test/CodeGen/X86/oddshuffles.ll

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1132,28 +1132,50 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
11321132
; AVX1-NEXT: vmovdqu %xmm2, 16(%rdi)
11331133
; AVX1-NEXT: retq
11341134
;
1135-
; AVX2-LABEL: interleave_24i16_in:
1136-
; AVX2: # %bb.0:
1137-
; AVX2-NEXT: vmovdqu (%rsi), %xmm0
1138-
; AVX2-NEXT: vmovdqu (%rdx), %xmm1
1139-
; AVX2-NEXT: vmovdqu (%rcx), %xmm2
1140-
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
1141-
; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
1142-
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
1143-
; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
1144-
; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
1145-
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
1146-
; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm4
1147-
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
1148-
; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
1149-
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1150-
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
1151-
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
1152-
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1153-
; AVX2-NEXT: vmovdqu %xmm0, 32(%rdi)
1154-
; AVX2-NEXT: vmovdqu %ymm3, (%rdi)
1155-
; AVX2-NEXT: vzeroupper
1156-
; AVX2-NEXT: retq
1135+
; AVX2-SLOW-LABEL: interleave_24i16_in:
1136+
; AVX2-SLOW: # %bb.0:
1137+
; AVX2-SLOW-NEXT: vmovdqu (%rsi), %xmm0
1138+
; AVX2-SLOW-NEXT: vmovdqu (%rdx), %xmm1
1139+
; AVX2-SLOW-NEXT: vmovdqu (%rcx), %xmm2
1140+
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
1141+
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
1142+
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
1143+
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
1144+
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
1145+
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
1146+
; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4
1147+
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
1148+
; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
1149+
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1150+
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
1151+
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
1152+
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1153+
; AVX2-SLOW-NEXT: vmovdqu %xmm0, 32(%rdi)
1154+
; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rdi)
1155+
; AVX2-SLOW-NEXT: vzeroupper
1156+
; AVX2-SLOW-NEXT: retq
1157+
;
1158+
; AVX2-FAST-LABEL: interleave_24i16_in:
1159+
; AVX2-FAST: # %bb.0:
1160+
; AVX2-FAST-NEXT: vmovdqu (%rsi), %xmm0
1161+
; AVX2-FAST-NEXT: vmovdqu (%rdx), %xmm1
1162+
; AVX2-FAST-NEXT: vmovdqu (%rcx), %xmm2
1163+
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
1164+
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
1165+
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm4
1166+
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6]
1167+
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3
1168+
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27]
1169+
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
1170+
; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
1171+
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1172+
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
1173+
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
1174+
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
1175+
; AVX2-FAST-NEXT: vmovdqu %xmm0, 32(%rdi)
1176+
; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rdi)
1177+
; AVX2-FAST-NEXT: vzeroupper
1178+
; AVX2-FAST-NEXT: retq
11571179
;
11581180
; XOP-LABEL: interleave_24i16_in:
11591181
; XOP: # %bb.0:

0 commit comments

Comments
 (0)