Commit 19d0284

[X86][AVX] Fold extract_subvector(VSRLI/VSHLI(x,32)) -> VSRLI/VSHLI(extract_subvector(x),32)
As discussed on D56387, if we're shifting to extract the upper/lower half of a vXi64 vector, then we're better off performing the shift at the subvector level, as it's very likely to fold into something. combineConcatVectorOps can perform this in reverse if necessary.
1 parent c056f82 commit 19d0284
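To illustrate the equivalence the fold relies on (not part of the commit): for 64-bit lanes, extracting a 128-bit subvector of a shifted vXi64 vector yields the same lanes as shifting the extracted subvector. Below is a minimal standalone C++ sketch modelling the vectors as plain arrays; the helper names srli/extractSubvector are illustrative only and are not LLVM APIs.

#include <array>
#include <cassert>
#include <cstdint>

using V4i64 = std::array<uint64_t, 4>; // models a 256-bit vXi64 vector
using V2i64 = std::array<uint64_t, 2>; // models a 128-bit subvector

// Per-lane logical right shift (VSRLI-style) on the full vector.
static V4i64 srli(V4i64 V, unsigned Amt) {
  for (uint64_t &Lane : V)
    Lane >>= Amt;
  return V;
}

// Per-lane logical right shift on a 128-bit subvector.
static V2i64 srli(V2i64 V, unsigned Amt) {
  for (uint64_t &Lane : V)
    Lane >>= Amt;
  return V;
}

// extract_subvector: two consecutive lanes starting at Idx (0 or 2).
static V2i64 extractSubvector(const V4i64 &V, unsigned Idx) {
  return {V[Idx], V[Idx + 1]};
}

int main() {
  V4i64 X = {0x1111222233334444ULL, 0x5555666677778888ULL,
             0x9999aaaabbbbccccULL, 0xddddeeeeffff0123ULL};
  for (unsigned Idx : {0u, 2u}) {
    // extract_subvector(VSRLI(X, 32)) == VSRLI(extract_subvector(X), 32)
    assert(extractSubvector(srli(X, 32), Idx) ==
           srli(extractSubvector(X, Idx), 32));
  }
  return 0;
}

The payoff is visible in the test diffs below: once the shift is performed at 128-bit width it can fold into the adjacent shuffle/truncation, e.g. in combine-sra.ll the vpsrlq by 32 disappears entirely because the vshufps now selects the odd 32-bit elements ([1,3]) instead of the even ones.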

4 files changed: 24 additions, 14 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 12 additions & 1 deletion
@@ -49799,8 +49799,8 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
 
   // If we're extracting the lowest subvector and we're the only user,
   // we may be able to perform this with a smaller vector width.
+  unsigned InOpcode = InVec.getOpcode();
   if (IdxVal == 0 && InVec.hasOneUse()) {
-    unsigned InOpcode = InVec.getOpcode();
     if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
       // v2f64 CVTDQ2PD(v4i32).
       if (InOpcode == ISD::SINT_TO_FP &&
@@ -49853,6 +49853,17 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // Always split vXi64 logical shifts where we're extracting the upper 32-bits
+  // as this is very likely to fold into a shuffle/truncation.
+  if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
+      InVecVT.getScalarSizeInBits() == 64 &&
+      InVec.getConstantOperandAPInt(1) == 32) {
+    SDLoc DL(N);
+    SDValue Ext =
+        extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
+    return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
+  }
+
   return SDValue();
 }
 
llvm/test/CodeGen/X86/combine-sra.ll

Lines changed: 2 additions & 3 deletions
@@ -207,9 +207,8 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
 ;
 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
 ; AVX2-SLOW-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq

llvm/test/CodeGen/X86/pmul.ll

Lines changed: 1 addition & 2 deletions
@@ -1150,9 +1150,8 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
 ; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm1
 ; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vpsllq $32, %ymm0, %ymm0
 ; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 entry:

llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll

Lines changed: 9 additions & 8 deletions
@@ -834,19 +834,20 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ;
 ; AVX2-64-LABEL: uitofp_v4i64_v4f64:
 ; AVX2-64:       # %bb.0:
-; AVX2-64-NEXT:    vpsrlq $32, %ymm0, %ymm1
-; AVX2-64-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-64-NEXT:    vpsrlq $32, %xmm1, %xmm1
+; AVX2-64-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX2-64-NEXT:    vmovq %xmm1, %rax
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX2-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-64-NEXT:    vpsrlq $32, %xmm0, %xmm2
 ; AVX2-64-NEXT:    vpextrq $1, %xmm2, %rax
 ; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm3
 ; AVX2-64-NEXT:    vmovq %xmm2, %rax
 ; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
 ; AVX2-64-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
-; AVX2-64-NEXT:    vmovq %xmm1, %rax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm1
-; AVX2-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX2-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [4.294967296E+9,4.294967296E+9,4.294967296E+9,4.294967296E+9]
 ; AVX2-64-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
