Commit d6db919

[X86][SSE] Add test case for PR45604
1 parent 46de0d5 commit d6db919

1 file changed

llvm/test/CodeGen/X86/vector-shuffle-combining.ll

Lines changed: 184 additions & 0 deletions
@@ -3026,3 +3026,187 @@ define void @PR43024() {
  store float %8, float* undef, align 8
  ret void
}

; TODO - we're ignoring the i32->i16->i32 'ZERO_EXTEND_INREG' pattern, resulting in a bad movss.
define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
; SSE2-LABEL: PR45604:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm1
; SSE2-NEXT: pextrw $2, %xmm1, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movl $11, %eax
; SSE2-NEXT: pinsrw $2, %eax, %xmm0
; SSE2-NEXT: pextrw $3, %xmm1, %ecx
; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-NEXT: pinsrw $6, %eax, %xmm0
; SSE2-NEXT: pextrw $4, %xmm1, %ecx
; SSE2-NEXT: movd %ecx, %xmm2
; SSE2-NEXT: pinsrw $2, %eax, %xmm2
; SSE2-NEXT: pextrw $5, %xmm1, %ecx
; SSE2-NEXT: pinsrw $4, %ecx, %xmm2
; SSE2-NEXT: pinsrw $6, %eax, %xmm2
; SSE2-NEXT: pextrw $6, %xmm1, %ecx
; SSE2-NEXT: movd %ecx, %xmm3
; SSE2-NEXT: pinsrw $2, %eax, %xmm3
; SSE2-NEXT: pextrw $7, %xmm1, %ecx
; SSE2-NEXT: pinsrw $4, %ecx, %xmm3
; SSE2-NEXT: pinsrw $6, %eax, %xmm3
; SSE2-NEXT: xorps %xmm4, %xmm4
; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
; SSE2-NEXT: pinsrw $2, %eax, %xmm4
; SSE2-NEXT: pextrw $1, %xmm1, %ecx
; SSE2-NEXT: pinsrw $4, %ecx, %xmm4
; SSE2-NEXT: pinsrw $6, %eax, %xmm4
; SSE2-NEXT: movdqa %xmm4, (%rdi)
; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR45604:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rsi), %xmm1
; SSSE3-NEXT: pextrw $2, %xmm1, %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: movl $11, %eax
; SSSE3-NEXT: pinsrw $2, %eax, %xmm0
; SSSE3-NEXT: pextrw $3, %xmm1, %ecx
; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
; SSSE3-NEXT: pextrw $4, %xmm1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm2
; SSSE3-NEXT: pinsrw $2, %eax, %xmm2
; SSSE3-NEXT: pextrw $5, %xmm1, %ecx
; SSSE3-NEXT: pinsrw $4, %ecx, %xmm2
; SSSE3-NEXT: pinsrw $6, %eax, %xmm2
; SSSE3-NEXT: pextrw $6, %xmm1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm3
; SSSE3-NEXT: pinsrw $2, %eax, %xmm3
; SSSE3-NEXT: pextrw $7, %xmm1, %ecx
; SSSE3-NEXT: pinsrw $4, %ecx, %xmm3
; SSSE3-NEXT: pinsrw $6, %eax, %xmm3
; SSSE3-NEXT: xorps %xmm4, %xmm4
; SSSE3-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
; SSSE3-NEXT: pinsrw $2, %eax, %xmm4
; SSSE3-NEXT: pextrw $1, %xmm1, %ecx
; SSSE3-NEXT: pinsrw $4, %ecx, %xmm4
; SSSE3-NEXT: pinsrw $6, %eax, %xmm4
; SSSE3-NEXT: movdqa %xmm4, (%rdi)
; SSSE3-NEXT: movdqa %xmm3, 48(%rdi)
; SSSE3-NEXT: movdqa %xmm2, 32(%rdi)
; SSSE3-NEXT: movdqa %xmm0, 16(%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR45604:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rsi), %xmm1
; SSE41-NEXT: pextrw $2, %xmm1, %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: movl $11, %eax
; SSE41-NEXT: pinsrw $2, %eax, %xmm0
; SSE41-NEXT: pextrw $3, %xmm1, %ecx
; SSE41-NEXT: pinsrw $4, %ecx, %xmm0
; SSE41-NEXT: pinsrw $6, %eax, %xmm0
; SSE41-NEXT: pextrw $4, %xmm1, %ecx
; SSE41-NEXT: movd %ecx, %xmm2
; SSE41-NEXT: pinsrw $2, %eax, %xmm2
; SSE41-NEXT: pextrw $5, %xmm1, %ecx
; SSE41-NEXT: pinsrw $4, %ecx, %xmm2
; SSE41-NEXT: pinsrw $6, %eax, %xmm2
; SSE41-NEXT: pextrw $6, %xmm1, %ecx
; SSE41-NEXT: movd %ecx, %xmm3
; SSE41-NEXT: pinsrw $2, %eax, %xmm3
; SSE41-NEXT: pextrw $7, %xmm1, %ecx
; SSE41-NEXT: pinsrw $4, %ecx, %xmm3
; SSE41-NEXT: pinsrw $6, %eax, %xmm3
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3,4,5,6,7]
; SSE41-NEXT: pinsrw $2, %eax, %xmm4
; SSE41-NEXT: pextrw $1, %xmm1, %ecx
; SSE41-NEXT: pinsrw $4, %ecx, %xmm4
; SSE41-NEXT: pinsrw $6, %eax, %xmm4
; SSE41-NEXT: movdqa %xmm4, (%rdi)
; SSE41-NEXT: movdqa %xmm3, 48(%rdi)
; SSE41-NEXT: movdqa %xmm2, 32(%rdi)
; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: PR45604:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rsi), %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vmovups %ymm1, 32(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: PR45604:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255>
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,0,0,11,11,11,11,11,11,11,11]
; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,u,u>
; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, 32(%rdi)
; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rdi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: PR45604:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,2,3,6,7,2,3,6,7,12,13,14,15,16,17,20,21,18,19,22,23,18,19,22,23,28,29,30,31]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[4,5,0,1,6,7,2,3,6,7,2,3,12,13,14,15,20,21,16,17,22,23,18,19,22,23,18,19,28,29,30,31]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255>
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,0,0,0,0,0,0,11,11,11,11,11,11,11,11]
; AVX2-FAST-NEXT: vpblendvb %ymm4, {{.*}}(%rip), %ymm5, %ymm4
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,u,u>
; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,4,5,6,7,10,11,14,15,10,11,14,15,24,25,28,29,20,21,22,23,26,27,30,31,26,27,30,31]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[12,13,8,9,4,5,6,7,14,15,10,11,14,15,10,11,28,29,24,25,20,21,22,23,30,31,26,27,30,31,26,27]
; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, 32(%rdi)
; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rdi)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
  %v1 = load <8 x i16>, <8 x i16>* %src, align 16
  %v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
  store <32 x i16> %v3, <32 x i16>* %dst, align 16
  ret void
}
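
Note: this hunk only adds the test body. The RUN lines that drive the SSE2, SSSE3, SSE41, AVX1, AVX2-SLOW and AVX2-FAST check prefixes sit at the top of vector-shuffle-combining.ll and are not part of this diff. As a rough sketch of how such a prefix is typically exercised (the triple and -mattr string below are assumptions for illustration, not copied from the file):

; NOTE: hypothetical RUN line -- see the header of vector-shuffle-combining.ll for the real ones.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2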
