-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[X86] combineConcatVectorOps - concat(shuffle(x,y,m1),shuffle(x,y,m2)) -> shuffle(concat(x,x),concat(y,y),m3) on VBMI targets #130134
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
…) -> shuffle(concat(x,x),concat(y,y),m3) on VBMI targets With VBMI we are guaranteed to support cross-lane 256-bit shuffles, so subvector splats should always be cheap. Fixes llvm#116931
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) ChangesWith VBMI we are guaranteed to support cross-lane 256-bit shuffles, so subvector splats should always be cheap. Fixes #116931 Full diff: https://github.com/llvm/llvm-project/pull/130134.diff 4 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index deab638b7e546..68dac675e0c20 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57948,9 +57948,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
switch (Op0.getOpcode()) {
case ISD::VECTOR_SHUFFLE: {
- if (NumOps == 2 && VT.is256BitVector() &&
+ // TODO: Relax VBMI requirement for repeated shuffle ops - currently
+ // limited to targets that should always have good cross lane shuffles.
+ if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
(EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
- (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
+ (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1) ||
+ (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
+ Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
+ Subtarget.hasVBMI()))) {
int NumSubElts = Op0.getValueType().getVectorNumElements();
SmallVector<int> NewMask;
for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index d830834a83ab4..a56b0a6351a3b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1566,11 +1566,12 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512VLVBMI2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
+; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3
+; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %ymm3, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 399bdb7a248b8..f45405d885377 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1218,9 +1218,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512VLVBMI2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VLVBMI2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index d08bfc6e2d7ea..28d637a50109d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -5059,18 +5059,39 @@ define void @shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
-; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2OR512VL-NEXT: vmovdqa %xmm0, 16(%rdi)
-; AVX2OR512VL-NEXT: vmovdqa %xmm2, (%rdi)
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vmovdqa %xmm0, 16(%rdi)
+; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
+; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VLBW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512VLBW-NEXT: vmovdqa %xmm0, 16(%rdi)
+; AVX512VLBW-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX512VLBW-NEXT: vzeroupper
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,16,24,1,9,17,25,2,10,18,26,3,11,19,27,4,12,20,28,5,13,21,29,6,14,22,30,7,15,23,31]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: vmovdqa %ymm0, (%rdi)
+; AVX512VLVBMI-NEXT: vzeroupper
+; AVX512VLVBMI-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v32i8_store_00_08_16_24_01_09_17_25_02_10_18_26_03_11_19_27_04_12_20_28_05_13_21_29_06_14_22_30_07_15_23_31:
; XOPAVX1: # %bb.0:
|
(Ops[0].getOperand(0) == Ops[1].getOperand(0) && | ||
Ops[0].getOperand(1) == Ops[1].getOperand(1) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How do we know Ops[1]
is VECTOR_SHUFFLE?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This switch only gets called if all subvectors have the same opcode. Undefs are not allowed. This is to handle whenever we are concatenating subvectors all with the same instruction.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
…) -> shuffle(concat(x,x),concat(y,y),m3) on VBMI targets (llvm#130134) With VBMI we are guaranteed to support cross-lane 256-bit shuffles, so subvector splats should always be cheap. Fixes llvm#116931
With VBMI we are guaranteed to support cross-lane 256-bit shuffles, so subvector splats should always be cheap.
Fixes #116931