Skip to content

Commit 5211af4

Browse files
committed
[X86][AVX] combineExtractWithShuffle - combine extracts from 256/512-bit vector shuffles.
Element extraction is only legal from the lowest 128-bit subvector, so for 256/512-bit vectors we first extract the 128-bit subvector that contains the requested element and then extract the element from it using the index rebased to that subvector.
1 parent 6ab792b commit 5211af4

File tree

3 files changed

+68
-63
lines changed

3 files changed

+68
-63
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40277,10 +40277,21 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
4027740277

4027840278
// We can only legally extract other elements from 128-bit vectors and in
4027940279
// certain circumstances, depending on SSE-level.
40280-
// TODO: Investigate using extract_subvector for larger vectors.
4028140280
// TODO: Investigate float/double extraction if it will be just stored.
4028240281
auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
4028340282
unsigned Idx) {
40283+
EVT VecSVT = VecVT.getScalarType();
40284+
if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
40285+
(VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
40286+
VecSVT == MVT::i64)) {
40287+
unsigned EltSizeInBits = VecSVT.getSizeInBits();
40288+
unsigned NumEltsPerLane = 128 / EltSizeInBits;
40289+
unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
40290+
unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
40291+
VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
40292+
Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
40293+
Idx &= (NumEltsPerLane - 1);
40294+
}
4028440295
if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
4028540296
((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
4028640297
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),

llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll

Lines changed: 37 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -800,34 +800,32 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
800800
;
801801
; AVX1-64-LABEL: uitofp_v4i64_v4f64:
802802
; AVX1-64: # %bb.0:
803-
; AVX1-64-NEXT: vxorps %xmm1, %xmm1, %xmm1
804-
; AVX1-64-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
805-
; AVX1-64-NEXT: vextractf128 $1, %ymm1, %xmm2
806-
; AVX1-64-NEXT: vpextrq $1, %xmm2, %rax
803+
; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm1
804+
; AVX1-64-NEXT: vpextrd $2, %xmm1, %eax
805+
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
806+
; AVX1-64-NEXT: vmovd %xmm1, %eax
807807
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
808-
; AVX1-64-NEXT: vmovq %xmm2, %rax
809-
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
810-
; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
811-
; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax
808+
; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0]
809+
; AVX1-64-NEXT: vextractps $2, %xmm0, %eax
812810
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
813-
; AVX1-64-NEXT: vmovq %xmm1, %rax
814-
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm1
811+
; AVX1-64-NEXT: vmovq %xmm0, %rax
812+
; AVX1-64-NEXT: movl %eax, %eax
813+
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4
814+
; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0]
815+
; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
816+
; AVX1-64-NEXT: vpextrd $3, %xmm1, %eax
817+
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3
818+
; AVX1-64-NEXT: vpextrd $1, %xmm1, %eax
819+
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1
815820
; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
816-
; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
817-
; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm2
818-
; AVX1-64-NEXT: vextractps $3, %xmm2, %eax
819-
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
820-
; AVX1-64-NEXT: vextractps $1, %xmm2, %eax
821-
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
822-
; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
823821
; AVX1-64-NEXT: vpextrd $3, %xmm0, %eax
824-
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
822+
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3
825823
; AVX1-64-NEXT: vpextrd $1, %xmm0, %eax
826-
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0
824+
; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0
827825
; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
828-
; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
826+
; AVX1-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
829827
; AVX1-64-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
830-
; AVX1-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
828+
; AVX1-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
831829
; AVX1-64-NEXT: retq
832830
;
833831
; AVX2-64-LABEL: uitofp_v4i64_v4f64:
@@ -836,31 +834,29 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
836834
; AVX2-64-NEXT: vextractps $3, %xmm1, %eax
837835
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
838836
; AVX2-64-NEXT: vextractps $1, %xmm1, %eax
839-
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
840-
; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
841-
; AVX2-64-NEXT: vextractps $3, %xmm0, %eax
842-
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
843-
; AVX2-64-NEXT: vextractps $1, %xmm0, %eax
844837
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
845838
; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0]
846-
; AVX2-64-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
847-
; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.294967296E+9,4.294967296E+9,4.294967296E+9,4.294967296E+9]
848-
; AVX2-64-NEXT: vmulpd %ymm2, %ymm1, %ymm1
849-
; AVX2-64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
850-
; AVX2-64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
851-
; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm2
852-
; AVX2-64-NEXT: vpextrq $1, %xmm2, %rax
853-
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
854-
; AVX2-64-NEXT: vmovq %xmm2, %rax
855-
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
856-
; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
857-
; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax
839+
; AVX2-64-NEXT: vextractps $3, %xmm0, %eax
858840
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
841+
; AVX2-64-NEXT: vextractps $1, %xmm0, %eax
842+
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4
843+
; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0]
844+
; AVX2-64-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
845+
; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm3 = [4.294967296E+9,4.294967296E+9,4.294967296E+9,4.294967296E+9]
846+
; AVX2-64-NEXT: vmulpd %ymm3, %ymm2, %ymm2
847+
; AVX2-64-NEXT: vextractps $2, %xmm1, %eax
848+
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3
849+
; AVX2-64-NEXT: vmovd %xmm1, %eax
850+
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1
851+
; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
852+
; AVX2-64-NEXT: vextractps $2, %xmm0, %eax
853+
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3
859854
; AVX2-64-NEXT: vmovq %xmm0, %rax
860-
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0
855+
; AVX2-64-NEXT: movl %eax, %eax
856+
; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0
861857
; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
862-
; AVX2-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
863-
; AVX2-64-NEXT: vaddpd %ymm0, %ymm1, %ymm0
858+
; AVX2-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
859+
; AVX2-64-NEXT: vaddpd %ymm0, %ymm2, %ymm0
864860
; AVX2-64-NEXT: retq
865861
;
866862
; AVX512F-64-LABEL: uitofp_v4i64_v4f64:

llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7541,34 +7541,32 @@ define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 {
75417541
;
75427542
; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i64:
75437543
; AVX1: # %bb.0: # %entry
7544-
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
7545-
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
7546-
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
7547-
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
7544+
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
7545+
; AVX1-NEXT: vpextrd $2, %xmm1, %eax
7546+
; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
7547+
; AVX1-NEXT: vmovd %xmm1, %eax
75487548
; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
7549-
; AVX1-NEXT: vmovq %xmm2, %rax
7550-
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
7551-
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
7552-
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
7549+
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0]
7550+
; AVX1-NEXT: vextractps $2, %xmm0, %eax
75537551
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
7554-
; AVX1-NEXT: vmovq %xmm1, %rax
7555-
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm1
7552+
; AVX1-NEXT: vmovq %xmm0, %rax
7553+
; AVX1-NEXT: movl %eax, %eax
7554+
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4
7555+
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0]
7556+
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
7557+
; AVX1-NEXT: vpextrd $3, %xmm1, %eax
7558+
; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3
7559+
; AVX1-NEXT: vpextrd $1, %xmm1, %eax
7560+
; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1
75567561
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
7557-
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
7558-
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
7559-
; AVX1-NEXT: vextractps $3, %xmm2, %eax
7560-
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
7561-
; AVX1-NEXT: vextractps $1, %xmm2, %eax
7562-
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
7563-
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
75647562
; AVX1-NEXT: vpextrd $3, %xmm0, %eax
7565-
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
7563+
; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3
75667564
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
7567-
; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0
7565+
; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0
75687566
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
7569-
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
7567+
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
75707568
; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
7571-
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
7569+
; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0
75727570
; AVX1-NEXT: retq
75737571
;
75747572
; AVX512F-LABEL: constrained_vector_uitofp_v4f64_v4i64:

0 commit comments

Comments
 (0)