Skip to content

Commit 2655bd5

Browse files
committed
[X86][SSE] combineShuffleWithHorizOp - canonicalize SHUFFLE(HOP(X,Y),HOP(Y,X)) -> SHUFFLE(HOP(X,Y))
Attempt to canonicalize binary shuffles of horizontal ops (HOPs) with commuted operands into a unary shuffle.
1 parent 8dd2eb1 commit 2655bd5

File tree

3 files changed

+29
-11
lines changed

3 files changed

+29
-11
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35886,12 +35886,40 @@ static SDValue combineShuffleWithHorizOp(SDValue N, MVT VT, const SDLoc &DL,
3588635886
if (!isHoriz && !isPack)
3588735887
return SDValue();
3588835888

35889-
// Canonicalize unary horizontal ops to only refer to lower halves.
3589035889
if (TargetMask.size() == VT0.getVectorNumElements()) {
3589135890
int NumElts = VT0.getVectorNumElements();
3589235891
int NumLanes = VT0.getSizeInBits() / 128;
3589335892
int NumEltsPerLane = NumElts / NumLanes;
3589435893
int NumHalfEltsPerLane = NumEltsPerLane / 2;
35894+
35895+
// Canonicalize binary shuffles of horizontal ops that use the
35896+
// same sources to a unary shuffle.
35897+
// TODO: Try to perform this fold even if the shuffle remains.
35898+
if (BC0 != BC1) {
35899+
auto ContainsOps = [](SDValue HOp, SDValue Op) {
35900+
return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
35901+
};
35902+
// Commute if all BC0's ops are contained in BC1.
35903+
if (ContainsOps(BC1, BC0.getOperand(0)) &&
35904+
ContainsOps(BC1, BC0.getOperand(1))) {
35905+
ShuffleVectorSDNode::commuteMask(TargetMask);
35906+
std::swap(BC0, BC1);
35907+
}
35908+
// If BC1 can be represented by BC0, then convert to unary shuffle.
35909+
if (ContainsOps(BC0, BC1.getOperand(0)) &&
35910+
ContainsOps(BC0, BC1.getOperand(1))) {
35911+
for (int &M : TargetMask) {
35912+
if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
35913+
continue;
35914+
int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
35915+
M -= NumElts + (SubLane * NumHalfEltsPerLane);
35916+
if (BC1.getOperand(SubLane) != BC0.getOperand(0))
35917+
M += NumHalfEltsPerLane;
35918+
}
35919+
}
35920+
}
35921+
35922+
// Canonicalize unary horizontal ops to only refer to lower halves.
3589535923
for (int i = 0; i != NumElts; ++i) {
3589635924
int &M = TargetMask[i];
3589735925
if (isUndefOrZero(M))

llvm/test/CodeGen/X86/haddsub-shuf.ll

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -910,8 +910,6 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
910910
; AVX1_FAST: # %bb.0:
911911
; AVX1_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
912912
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
913-
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
914-
; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
915913
; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
916914
; AVX1_FAST-NEXT: retq
917915
;
@@ -929,8 +927,6 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
929927
; AVX2_FAST: # %bb.0:
930928
; AVX2_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
931929
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
932-
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
933-
; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
934930
; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
935931
; AVX2_FAST-NEXT: retq
936932
%t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>
@@ -972,8 +968,6 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
972968
; AVX1_FAST-LABEL: PR34724_2:
973969
; AVX1_FAST: # %bb.0:
974970
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
975-
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
976-
; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
977971
; AVX1_FAST-NEXT: retq
978972
;
979973
; AVX2_SLOW-LABEL: PR34724_2:
@@ -987,8 +981,6 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
987981
; AVX2_FAST-LABEL: PR34724_2:
988982
; AVX2_FAST: # %bb.0:
989983
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
990-
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
991-
; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
992984
; AVX2_FAST-NEXT: retq
993985
%t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
994986
%t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 5, i32 undef, i32 undef>

llvm/test/CodeGen/X86/haddsub-undef.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -986,8 +986,6 @@ define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
986986
; AVX-FAST: # %bb.0:
987987
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
988988
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
989-
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
990-
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
991989
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
992990
; AVX-FAST-NEXT: retq
993991
%3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 4>

0 commit comments

Comments
 (0)