Commit e71dd7c
[X86][SSE] getFauxShuffle - don't combine shuffles with small truncated scalars (PR45604)
getFauxShuffle attempts to combine INSERT_VECTOR_ELT(TRUNCATE/EXTEND(EXTRACT_VECTOR_ELT(x))) patterns into a target shuffle chain. PR45604 identified an issue where the scalar was truncated to a size smaller than the destination vector element and then zero extended back, which requires the upper bits to be zeroed, something we don't currently do. To avoid the bug I've added an early-out for these truncation cases; a future commit should allow us to handle this by inserting the necessary SM_SentinelZero padding.
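For illustration, a minimal IR sketch of the problematic pattern (a hypothetical reduction, not the actual PR45604 reproducer; the function name is made up): the scalar is truncated below the destination element width and then zero extended back, so the upper bits of the inserted element must be zeroed, which a plain shuffle of the source vector does not guarantee:

define <4 x i32> @zext_inreg_elt0(<4 x i32> %x) {
  ; trunc makes the scalar narrower than the i32 vector element
  %e = extractelement <4 x i32> %x, i32 0
  %t = trunc i32 %e to i16
  ; zext back to i32: the upper 16 bits of the element must be zero
  %z = zext i16 %t to i32
  %r = insertelement <4 x i32> undef, i32 %z, i32 0
  ret <4 x i32> %r
}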
1 parent: 098e40e

5 files changed, 68 insertions(+), 124 deletions(-)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 5 additions & 1 deletion
@@ -7462,12 +7462,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     }
 
     // Peek through trunc/aext/zext.
+    // TODO: handle elements smaller than VT.
     // TODO: aext shouldn't require SM_SentinelZero padding.
     // TODO: handle shift of scalars.
     while (Scl.getOpcode() == ISD::TRUNCATE ||
            Scl.getOpcode() == ISD::ANY_EXTEND ||
-           Scl.getOpcode() == ISD::ZERO_EXTEND)
+           Scl.getOpcode() == ISD::ZERO_EXTEND) {
       Scl = Scl.getOperand(0);
+      if (Scl.getScalarValueSizeInBits() < NumBitsPerElt)
+        return false;
+    }
 
     // Attempt to find the source vector the scalar was extracted from.
     SDValue SrcExtract;
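The early-out is conservative: after peeking through the cast chain it bails whenever the remaining scalar is narrower than the destination element, even for plain zero-extends that were previously folded into a blend, which is why the test updates below trade blends for extract/reinsert sequences. A sketch of the shape of the regressed cases, assuming the test bodies match the IR fragments visible in the diffs:

define <2 x i64> @extract2_i32_zext_insert1_i64_zero(<4 x i32> %x) {
  ; a 32-bit scalar zero extended into a 64-bit element: getFauxShuffleMask
  ; now returns false because the peeked scalar (32 bits) < NumBitsPerElt (64)
  %e = extractelement <4 x i32> %x, i32 2
  %z = zext i32 %e to i64
  %r = insertelement <2 x i64> zeroinitializer, i64 %z, i32 1
  ret <2 x i64> %r
}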

llvm/test/CodeGen/X86/buildvec-extract.ll

Lines changed: 17 additions & 18 deletions
@@ -293,19 +293,24 @@ define <2 x i64> @extract2_i32_zext_insert1_i64_undef(<4 x i32> %x) {
 define <2 x i64> @extract2_i32_zext_insert1_i64_zero(<4 x i32> %x) {
 ; SSE2-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; SSE41:      # %bb.0:
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
+; SSE41-NEXT:    extractps $2, %xmm0, %eax
+; SSE41-NEXT:    movq %rax, %xmm0
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
+; AVX-NEXT:    vextractps $2, %xmm0, %eax
+; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 2
   %z = zext i32 %e to i64
@@ -381,22 +386,16 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
 }
 
 define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
-; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero:
-; SSE41:      # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE41-NEXT:    retq
+; SSE-LABEL: extract0_i16_zext_insert0_i64_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrw $0, %xmm0, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract0_i16_zext_insert0_i64_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
 ; AVX-NEXT:    retq
   %e = extractelement <8 x i16> %x, i32 0
   %z = zext i16 %e to i64

llvm/test/CodeGen/X86/buildvec-insertvec.ll

Lines changed: 4 additions & 1 deletion
@@ -21,7 +21,10 @@ define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
 ; SSE41-LABEL: foo:
 ; SSE41:      # %bb.0:
 ; SSE41-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,3,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    pextrb $8, %xmm0, %eax
+; SSE41-NEXT:    pextrb $4, %xmm0, %ecx
+; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
+; SSE41-NEXT:    pinsrb $2, %eax, %xmm0
 ; SSE41-NEXT:    movl $255, %eax
 ; SSE41-NEXT:    pinsrb $3, %eax, %xmm0
 ; SSE41-NEXT:    movd %xmm0, (%rdi)

llvm/test/CodeGen/X86/extract-concat.ll

Lines changed: 8 additions & 2 deletions
@@ -8,7 +8,10 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
 ; SSE42-LABEL: foo:
 ; SSE42:      # %bb.0:
 ; SSE42-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,3,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT:    pextrb $8, %xmm0, %eax
+; SSE42-NEXT:    pextrb $4, %xmm0, %ecx
+; SSE42-NEXT:    pinsrb $1, %ecx, %xmm0
+; SSE42-NEXT:    pinsrb $2, %eax, %xmm0
 ; SSE42-NEXT:    movl $255, %eax
 ; SSE42-NEXT:    pinsrb $3, %eax, %xmm0
 ; SSE42-NEXT:    movd %xmm0, (%rdi)
@@ -17,7 +20,10 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
 ; AVX-LABEL: foo:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,3,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpextrb $8, %xmm0, %eax
+; AVX-NEXT:    vpextrb $4, %xmm0, %ecx
+; AVX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; AVX-NEXT:    movl $255, %eax
 ; AVX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, (%rdi)

llvm/test/CodeGen/X86/vector-shuffle-combining.ll

Lines changed: 34 additions & 102 deletions
@@ -3027,109 +3027,41 @@ define void @PR43024() {
   ret void
 }
 
-; TODO - we're ignoring the i32->i16->i32 'ZERO_EXTEND_INREG' pattern, resulting in an bad movss .
 define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
-; SSE2-LABEL: PR45604:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rsi), %xmm1
-; SSE2-NEXT:    pextrw $2, %xmm1, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movl $11, %eax
-; SSE2-NEXT:    pinsrw $2, %eax, %xmm0
-; SSE2-NEXT:    pextrw $3, %xmm1, %ecx
-; SSE2-NEXT:    pinsrw $4, %ecx, %xmm0
-; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
-; SSE2-NEXT:    pextrw $4, %xmm1, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm2
-; SSE2-NEXT:    pinsrw $2, %eax, %xmm2
-; SSE2-NEXT:    pextrw $5, %xmm1, %ecx
-; SSE2-NEXT:    pinsrw $4, %ecx, %xmm2
-; SSE2-NEXT:    pinsrw $6, %eax, %xmm2
-; SSE2-NEXT:    pextrw $6, %xmm1, %ecx
-; SSE2-NEXT:    movd %ecx, %xmm3
-; SSE2-NEXT:    pinsrw $2, %eax, %xmm3
-; SSE2-NEXT:    pextrw $7, %xmm1, %ecx
-; SSE2-NEXT:    pinsrw $4, %ecx, %xmm3
-; SSE2-NEXT:    pinsrw $6, %eax, %xmm3
-; SSE2-NEXT:    xorps %xmm4, %xmm4
-; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
-; SSE2-NEXT:    pinsrw $2, %eax, %xmm4
-; SSE2-NEXT:    pextrw $1, %xmm1, %ecx
-; SSE2-NEXT:    pinsrw $4, %ecx, %xmm4
-; SSE2-NEXT:    pinsrw $6, %eax, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, (%rdi)
-; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
-; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: PR45604:
-; SSSE3:      # %bb.0:
-; SSSE3-NEXT:    movdqa (%rsi), %xmm1
-; SSSE3-NEXT:    pextrw $2, %xmm1, %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    movl $11, %eax
-; SSSE3-NEXT:    pinsrw $2, %eax, %xmm0
-; SSSE3-NEXT:    pextrw $3, %xmm1, %ecx
-; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm0
-; SSSE3-NEXT:    pinsrw $6, %eax, %xmm0
-; SSSE3-NEXT:    pextrw $4, %xmm1, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    pinsrw $2, %eax, %xmm2
-; SSSE3-NEXT:    pextrw $5, %xmm1, %ecx
-; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm2
-; SSSE3-NEXT:    pinsrw $6, %eax, %xmm2
-; SSSE3-NEXT:    pextrw $6, %xmm1, %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm3
-; SSSE3-NEXT:    pinsrw $2, %eax, %xmm3
-; SSSE3-NEXT:    pextrw $7, %xmm1, %ecx
-; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm3
-; SSSE3-NEXT:    pinsrw $6, %eax, %xmm3
-; SSSE3-NEXT:    xorps %xmm4, %xmm4
-; SSSE3-NEXT:    movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
-; SSSE3-NEXT:    pinsrw $2, %eax, %xmm4
-; SSSE3-NEXT:    pextrw $1, %xmm1, %ecx
-; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm4
-; SSSE3-NEXT:    pinsrw $6, %eax, %xmm4
-; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
-; SSSE3-NEXT:    movdqa %xmm3, 48(%rdi)
-; SSSE3-NEXT:    movdqa %xmm2, 32(%rdi)
-; SSSE3-NEXT:    movdqa %xmm0, 16(%rdi)
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: PR45604:
-; SSE41:      # %bb.0:
-; SSE41-NEXT:    movdqa (%rsi), %xmm1
-; SSE41-NEXT:    pextrw $2, %xmm1, %eax
-; SSE41-NEXT:    movd %eax, %xmm0
-; SSE41-NEXT:    movl $11, %eax
-; SSE41-NEXT:    pinsrw $2, %eax, %xmm0
-; SSE41-NEXT:    pextrw $3, %xmm1, %ecx
-; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
-; SSE41-NEXT:    pinsrw $6, %eax, %xmm0
-; SSE41-NEXT:    pextrw $4, %xmm1, %ecx
-; SSE41-NEXT:    movd %ecx, %xmm2
-; SSE41-NEXT:    pinsrw $2, %eax, %xmm2
-; SSE41-NEXT:    pextrw $5, %xmm1, %ecx
-; SSE41-NEXT:    pinsrw $4, %ecx, %xmm2
-; SSE41-NEXT:    pinsrw $6, %eax, %xmm2
-; SSE41-NEXT:    pextrw $6, %xmm1, %ecx
-; SSE41-NEXT:    movd %ecx, %xmm3
-; SSE41-NEXT:    pinsrw $2, %eax, %xmm3
-; SSE41-NEXT:    pextrw $7, %xmm1, %ecx
-; SSE41-NEXT:    pinsrw $4, %ecx, %xmm3
-; SSE41-NEXT:    pinsrw $6, %eax, %xmm3
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3,4,5,6,7]
-; SSE41-NEXT:    pinsrw $2, %eax, %xmm4
-; SSE41-NEXT:    pextrw $1, %xmm1, %ecx
-; SSE41-NEXT:    pinsrw $4, %ecx, %xmm4
-; SSE41-NEXT:    pinsrw $6, %eax, %xmm4
-; SSE41-NEXT:    movdqa %xmm4, (%rdi)
-; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
-; SSE41-NEXT:    movdqa %xmm2, 32(%rdi)
-; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
-; SSE41-NEXT:    retq
+; SSE-LABEL: PR45604:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa (%rsi), %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    movzwl %ax, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movl $11, %eax
+; SSE-NEXT:    pinsrw $2, %eax, %xmm0
+; SSE-NEXT:    pextrw $1, %xmm1, %ecx
+; SSE-NEXT:    pinsrw $4, %ecx, %xmm0
+; SSE-NEXT:    pinsrw $6, %eax, %xmm0
+; SSE-NEXT:    pextrw $2, %xmm1, %ecx
+; SSE-NEXT:    movd %ecx, %xmm2
+; SSE-NEXT:    pinsrw $2, %eax, %xmm2
+; SSE-NEXT:    pextrw $3, %xmm1, %ecx
+; SSE-NEXT:    pinsrw $4, %ecx, %xmm2
+; SSE-NEXT:    pinsrw $6, %eax, %xmm2
+; SSE-NEXT:    pextrw $4, %xmm1, %ecx
+; SSE-NEXT:    movd %ecx, %xmm3
+; SSE-NEXT:    pinsrw $2, %eax, %xmm3
+; SSE-NEXT:    pextrw $5, %xmm1, %ecx
+; SSE-NEXT:    pinsrw $4, %ecx, %xmm3
+; SSE-NEXT:    pinsrw $6, %eax, %xmm3
+; SSE-NEXT:    pextrw $6, %xmm1, %ecx
+; SSE-NEXT:    movd %ecx, %xmm4
+; SSE-NEXT:    pinsrw $2, %eax, %xmm4
+; SSE-NEXT:    pextrw $7, %xmm1, %ecx
+; SSE-NEXT:    pinsrw $4, %ecx, %xmm4
+; SSE-NEXT:    pinsrw $6, %eax, %xmm4
+; SSE-NEXT:    movdqa %xmm4, 48(%rdi)
+; SSE-NEXT:    movdqa %xmm3, 32(%rdi)
+; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm0, (%rdi)
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: PR45604:
 ; AVX1:      # %bb.0:
