Skip to content

Commit 72e242a

Browse files
committed
[X86][AVX] canonicalizeShuffleMaskWithHorizOp - improve support for 256/512-bit vectors
Extend the HOP(HOP(X,Y),HOP(Z,W)) and SHUFFLE(HOP(X,Y),HOP(Z,W)) folds to handle 256/512-bit vectors whose shuffle mask repeats in every 128-bit lane. This allows us to drop the UNPACK(HOP(),HOP()) custom fold in combineTargetShuffle. To make this possible, isRepeatedTargetShuffleMask was tweaked to support target shuffle masks that take more than 2 inputs.
1 parent 81900dc commit 72e242a

File tree

2 files changed

+17
-42
lines changed

2 files changed

+17
-42
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 13 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -10964,10 +10964,10 @@ static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
1096410964
// This entry crosses lanes, so there is no way to model this shuffle.
1096510965
return false;
1096610966

10967-
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
10968-
// Adjust second vector indices to start at LaneSize instead of Size.
10969-
int LocalM =
10970-
Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
10967+
// Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10968+
// later vector indices to start at multiples of LaneSize instead of Size.
10969+
int LaneM = Mask[i] / Size;
10970+
int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
1097110971
if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
1097210972
// This is the first non-undef entry in this slot of a 128-bit lane.
1097310973
RepeatedMask[i % LaneSize] = LocalM;
@@ -36225,24 +36225,25 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
3622536225
int NumEltsPerLane = NumElts / NumLanes;
3622636226
int NumHalfEltsPerLane = NumEltsPerLane / 2;
3622736227
MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
36228+
unsigned EltSizeInBits = RootSizeInBits / Mask.size();
3622836229

36229-
// TODO: Add support for 256/512-bit vectors.
36230-
if (RootSizeInBits == 128 && NumEltsPerLane >= 4 &&
36230+
if (NumEltsPerLane >= 4 &&
3623136231
(isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
36232-
SmallVector<int> ScaledMask;
36233-
if (scaleShuffleElements(Mask, 4, ScaledMask)) {
36232+
SmallVector<int> LaneMask, ScaledMask;
36233+
if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
36234+
scaleShuffleElements(LaneMask, 4, ScaledMask)) {
3623436235
// See if we can remove the shuffle by resorting the HOP chain so that
3623536236
// the HOP args are pre-shuffled.
3623636237
// TODO: Generalize to any sized/depth chain.
3623736238
// TODO: Add support for PACKSS/PACKUS.
36238-
if (isHoriz && NumEltsPerLane == 4) {
36239+
if (isHoriz) {
3623936240
// Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
3624036241
auto GetHOpSrc = [&](int M) {
3624136242
if (M == SM_SentinelUndef)
3624236243
return DAG.getUNDEF(VT0);
3624336244
if (M == SM_SentinelZero)
3624436245
return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
36245-
SDValue Src0 = BC[M / NumElts];
36246+
SDValue Src0 = BC[M / 4];
3624636247
SDValue Src1 = Src0.getOperand((M % 4) >= 2);
3624736248
if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
3624836249
return Src1.getOperand(M % 2);
@@ -36253,8 +36254,8 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
3625336254
SDValue M2 = GetHOpSrc(ScaledMask[2]);
3625436255
SDValue M3 = GetHOpSrc(ScaledMask[3]);
3625536256
if (M0 && M1 && M2 && M3) {
36256-
SDValue LHS = DAG.getNode(Opcode0, DL, VT0, M0, M1);
36257-
SDValue RHS = DAG.getNode(Opcode0, DL, VT0, M2, M3);
36257+
SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
36258+
SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
3625836259
return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
3625936260
}
3626036261
}
@@ -36348,7 +36349,6 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
3634836349
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
3634936350
// single instruction. Attempt to match a v2X64 repeating shuffle pattern that
3635036351
// represents the LHS/RHS inputs for the lower/upper halves.
36351-
unsigned EltSizeInBits = RootSizeInBits / Mask.size();
3635236352
SmallVector<int, 16> TargetMask128, WideMask128;
3635336353
if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
3635436354
scaleShuffleElements(TargetMask128, 2, WideMask128)) {
@@ -37564,29 +37564,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
3756437564
}
3756537565
return SDValue();
3756637566
}
37567-
case X86ISD::UNPCKL:
37568-
case X86ISD::UNPCKH: {
37569-
// unpcklo(hop(x,y),hop(z,w)) -> permute(hop(x,z)).
37570-
// unpckhi(hop(x,y),hop(z,w)) -> permute(hop(y,w)).
37571-
// Don't fold if hop(x,y) == hop(z,w).
37572-
// TODO: Merge this into canonicalizeShuffleMaskWithHorizOp?
37573-
SDValue N0 = N.getOperand(0);
37574-
SDValue N1 = N.getOperand(1);
37575-
if (VT.getScalarSizeInBits() == 32 && N0 != N1 &&
37576-
N0.getOpcode() == N1.getOpcode() && isHorizOp(N0.getOpcode())) {
37577-
unsigned LoHi = Opcode == X86ISD::UNPCKL ? 0 : 1;
37578-
SDValue Res = DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(LoHi),
37579-
N1.getOperand(LoHi));
37580-
// Use SHUFPS for the permute so this will work on SSE3 targets, shuffle
37581-
// combining and domain handling will simplify this later on.
37582-
EVT ShuffleVT = VT.changeVectorElementType(MVT::f32);
37583-
Res = DAG.getBitcast(ShuffleVT, Res);
37584-
Res = DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
37585-
getV4X86ShuffleImm8ForMask({0, 2, 1, 3}, DL, DAG));
37586-
return DAG.getBitcast(VT, Res);
37587-
}
37588-
return SDValue();
37589-
}
3759037567
case X86ISD::VPERMI: {
3759137568
// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
3759237569
// TODO: Remove when we have preferred domains in combineX86ShuffleChain.

llvm/test/CodeGen/X86/horizontal-shuffle.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -305,9 +305,8 @@ define <32 x i8> @test_unpackh_packus_256(<16 x i16> %a0, <16 x i16> %a1, <16 x
305305
define <8 x float> @test_shufps_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
306306
; CHECK-LABEL: test_shufps_packss_256:
307307
; CHECK: ## %bb.0:
308-
; CHECK-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
309-
; CHECK-NEXT: vpackssdw %ymm3, %ymm0, %ymm1
310-
; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,2],ymm0[4,5],ymm1[6,6]
308+
; CHECK-NEXT: vpackssdw %ymm3, %ymm0, %ymm0
309+
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
311310
; CHECK-NEXT: ret{{[l|q]}}
312311
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
313312
%2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
@@ -320,9 +319,8 @@ define <8 x float> @test_shufps_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
320319
define <8 x float> @test_shufps_packus_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
321320
; CHECK-LABEL: test_shufps_packus_256:
322321
; CHECK: ## %bb.0:
323-
; CHECK-NEXT: vpackuswb %ymm0, %ymm0, %ymm0
324-
; CHECK-NEXT: vpackuswb %ymm0, %ymm2, %ymm1
325-
; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4]
322+
; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
323+
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
326324
; CHECK-NEXT: ret{{[l|q]}}
327325
%1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
328326
%2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a2, <16 x i16> %a3)

0 commit comments

Comments
 (0)