Skip to content

Commit 5867b13

Browse files
committed
[WIP][X86] combineX86ShufflesRecursively - attempt to combine shuffles with larger types from EXTRACT_SUBVECTOR nodes
This replaces the rather limited combineX86ShuffleChainWithExtract function with handling for EXTRACT_SUBVECTOR nodes as we recurse down the shuffle chain, widening the shuffle mask to accommodate the larger value type. This will mainly help AVX2/AVX512 cases with cross-lane shuffles, but it also helps collapse some cases where the same subvector has been reused in multiple lanes. This change also exposed missing DemandedElts handling for ISD::TRUNCATE nodes inside ComputeNumSignBits.
1 parent 3dc9f2d commit 5867b13

File tree

95 files changed

+49461
-48366
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

95 files changed

+49461
-48366
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5109,7 +5109,8 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
51095109
case ISD::TRUNCATE: {
51105110
// Check if the sign bits of source go down as far as the truncated value.
51115111
unsigned NumSrcBits = Op.getOperand(0).getScalarValueSizeInBits();
5112-
unsigned NumSrcSignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5112+
unsigned NumSrcSignBits =
5113+
ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
51135114
if (NumSrcSignBits > (NumSrcBits - VTBits))
51145115
return NumSrcSignBits - (NumSrcBits - VTBits);
51155116
break;

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 88 additions & 223 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 179 additions & 253 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 22 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -641,25 +641,16 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
641641
; AVX512DQ-NEXT: vzeroupper
642642
; AVX512DQ-NEXT: retq
643643
;
644-
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
645-
; AVX512BW-SLOW: # %bb.0:
646-
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
647-
; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
648-
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
649-
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
650-
; AVX512BW-SLOW-NEXT: vzeroupper
651-
; AVX512BW-SLOW-NEXT: retq
652-
;
653-
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
654-
; AVX512BW-FAST: # %bb.0:
655-
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
656-
; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
657-
; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0
658-
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
659-
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
660-
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
661-
; AVX512BW-FAST-NEXT: vzeroupper
662-
; AVX512BW-FAST-NEXT: retq
644+
; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
645+
; AVX512BW: # %bb.0:
646+
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [16,25,16,27,16,29,0,23]
647+
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
648+
; AVX512BW-NEXT: vpermt2w (%rdi), %ymm0, %ymm1
649+
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],mem[7]
650+
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
651+
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
652+
; AVX512BW-NEXT: vzeroupper
653+
; AVX512BW-NEXT: retq
663654
%in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64
664655
%in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
665656
%broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
@@ -735,25 +726,18 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
735726
; AVX512DQ-NEXT: vzeroupper
736727
; AVX512DQ-NEXT: retq
737728
;
738-
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
739-
; AVX512BW-SLOW: # %bb.0:
740-
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
741-
; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
742-
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
743-
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
744-
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
745-
; AVX512BW-SLOW-NEXT: vzeroupper
746-
; AVX512BW-SLOW-NEXT: retq
747-
;
748-
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
749-
; AVX512BW-FAST: # %bb.0:
750-
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
751-
; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
752-
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
753-
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
754-
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
755-
; AVX512BW-FAST-NEXT: vzeroupper
756-
; AVX512BW-FAST-NEXT: retq
729+
; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
730+
; AVX512BW: # %bb.0:
731+
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
732+
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
733+
; AVX512BW-NEXT: vmovd %xmm0, %eax
734+
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
735+
; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
736+
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
737+
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
738+
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
739+
; AVX512BW-NEXT: vzeroupper
740+
; AVX512BW-NEXT: retq
757741
%in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64
758742
%in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
759743
%broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>

llvm/test/CodeGen/X86/avx512-cvt.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,11 +179,11 @@ define <4 x i64> @f64to4sl(<4 x double> %a) {
179179
; NODQ: # %bb.0:
180180
; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1
181181
; NODQ-NEXT: vcvttsd2si %xmm1, %rax
182-
; NODQ-NEXT: vmovq %rax, %xmm2
183-
; NODQ-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
184-
; NODQ-NEXT: vcvttsd2si %xmm1, %rax
185182
; NODQ-NEXT: vmovq %rax, %xmm1
186-
; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
183+
; NODQ-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
184+
; NODQ-NEXT: vcvttsd2si %xmm2, %rax
185+
; NODQ-NEXT: vmovq %rax, %xmm2
186+
; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
187187
; NODQ-NEXT: vcvttsd2si %xmm0, %rax
188188
; NODQ-NEXT: vmovq %rax, %xmm2
189189
; NODQ-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]

llvm/test/CodeGen/X86/avx512-hadd-hsub.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,9 +221,10 @@ define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) {
221221
;
222222
; SKX-LABEL: fsub_noundef_ee:
223223
; SKX: # %bb.0:
224-
; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm0
225-
; SKX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
226-
; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
224+
; SKX-NEXT: vmovapd {{.*#+}} xmm0 = [5,4]
225+
; SKX-NEXT: vpermpd %zmm1, %zmm0, %zmm0
226+
; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm1
227+
; SKX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
227228
; SKX-NEXT: vzeroupper
228229
; SKX-NEXT: retq
229230
%x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>

0 commit comments

Comments
 (0)