@@ -3026,3 +3026,187 @@ define void @PR43024() {
store float %8, float* undef, align 8
ret void
}
+
+ ; TODO - we're ignoring the i32->i16->i32 'ZERO_EXTEND_INREG' pattern, resulting in a bad movss.
+ define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
+ ; SSE2-LABEL: PR45604:
+ ; SSE2: # %bb.0:
+ ; SSE2-NEXT: movdqa (%rsi), %xmm1
+ ; SSE2-NEXT: pextrw $2, %xmm1, %eax
+ ; SSE2-NEXT: movd %eax, %xmm0
+ ; SSE2-NEXT: movl $11, %eax
+ ; SSE2-NEXT: pinsrw $2, %eax, %xmm0
+ ; SSE2-NEXT: pextrw $3, %xmm1, %ecx
+ ; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
+ ; SSE2-NEXT: pinsrw $6, %eax, %xmm0
+ ; SSE2-NEXT: pextrw $4, %xmm1, %ecx
+ ; SSE2-NEXT: movd %ecx, %xmm2
+ ; SSE2-NEXT: pinsrw $2, %eax, %xmm2
+ ; SSE2-NEXT: pextrw $5, %xmm1, %ecx
+ ; SSE2-NEXT: pinsrw $4, %ecx, %xmm2
+ ; SSE2-NEXT: pinsrw $6, %eax, %xmm2
+ ; SSE2-NEXT: pextrw $6, %xmm1, %ecx
+ ; SSE2-NEXT: movd %ecx, %xmm3
+ ; SSE2-NEXT: pinsrw $2, %eax, %xmm3
+ ; SSE2-NEXT: pextrw $7, %xmm1, %ecx
+ ; SSE2-NEXT: pinsrw $4, %ecx, %xmm3
+ ; SSE2-NEXT: pinsrw $6, %eax, %xmm3
+ ; SSE2-NEXT: xorps %xmm4, %xmm4
+ ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
+ ; SSE2-NEXT: pinsrw $2, %eax, %xmm4
+ ; SSE2-NEXT: pextrw $1, %xmm1, %ecx
+ ; SSE2-NEXT: pinsrw $4, %ecx, %xmm4
+ ; SSE2-NEXT: pinsrw $6, %eax, %xmm4
+ ; SSE2-NEXT: movdqa %xmm4, (%rdi)
+ ; SSE2-NEXT: movdqa %xmm3, 48(%rdi)
+ ; SSE2-NEXT: movdqa %xmm2, 32(%rdi)
+ ; SSE2-NEXT: movdqa %xmm0, 16(%rdi)
+ ; SSE2-NEXT: retq
+ ;
+ ; SSSE3-LABEL: PR45604:
+ ; SSSE3: # %bb.0:
+ ; SSSE3-NEXT: movdqa (%rsi), %xmm1
+ ; SSSE3-NEXT: pextrw $2, %xmm1, %eax
+ ; SSSE3-NEXT: movd %eax, %xmm0
+ ; SSSE3-NEXT: movl $11, %eax
+ ; SSSE3-NEXT: pinsrw $2, %eax, %xmm0
+ ; SSSE3-NEXT: pextrw $3, %xmm1, %ecx
+ ; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
+ ; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
+ ; SSSE3-NEXT: pextrw $4, %xmm1, %ecx
+ ; SSSE3-NEXT: movd %ecx, %xmm2
+ ; SSSE3-NEXT: pinsrw $2, %eax, %xmm2
+ ; SSSE3-NEXT: pextrw $5, %xmm1, %ecx
+ ; SSSE3-NEXT: pinsrw $4, %ecx, %xmm2
+ ; SSSE3-NEXT: pinsrw $6, %eax, %xmm2
+ ; SSSE3-NEXT: pextrw $6, %xmm1, %ecx
+ ; SSSE3-NEXT: movd %ecx, %xmm3
+ ; SSSE3-NEXT: pinsrw $2, %eax, %xmm3
+ ; SSSE3-NEXT: pextrw $7, %xmm1, %ecx
+ ; SSSE3-NEXT: pinsrw $4, %ecx, %xmm3
+ ; SSSE3-NEXT: pinsrw $6, %eax, %xmm3
+ ; SSSE3-NEXT: xorps %xmm4, %xmm4
+ ; SSSE3-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
+ ; SSSE3-NEXT: pinsrw $2, %eax, %xmm4
+ ; SSSE3-NEXT: pextrw $1, %xmm1, %ecx
+ ; SSSE3-NEXT: pinsrw $4, %ecx, %xmm4
+ ; SSSE3-NEXT: pinsrw $6, %eax, %xmm4
+ ; SSSE3-NEXT: movdqa %xmm4, (%rdi)
+ ; SSSE3-NEXT: movdqa %xmm3, 48(%rdi)
+ ; SSSE3-NEXT: movdqa %xmm2, 32(%rdi)
+ ; SSSE3-NEXT: movdqa %xmm0, 16(%rdi)
+ ; SSSE3-NEXT: retq
+ ;
+ ; SSE41-LABEL: PR45604:
+ ; SSE41: # %bb.0:
+ ; SSE41-NEXT: movdqa (%rsi), %xmm1
+ ; SSE41-NEXT: pextrw $2, %xmm1, %eax
+ ; SSE41-NEXT: movd %eax, %xmm0
+ ; SSE41-NEXT: movl $11, %eax
+ ; SSE41-NEXT: pinsrw $2, %eax, %xmm0
+ ; SSE41-NEXT: pextrw $3, %xmm1, %ecx
+ ; SSE41-NEXT: pinsrw $4, %ecx, %xmm0
+ ; SSE41-NEXT: pinsrw $6, %eax, %xmm0
+ ; SSE41-NEXT: pextrw $4, %xmm1, %ecx
+ ; SSE41-NEXT: movd %ecx, %xmm2
+ ; SSE41-NEXT: pinsrw $2, %eax, %xmm2
+ ; SSE41-NEXT: pextrw $5, %xmm1, %ecx
+ ; SSE41-NEXT: pinsrw $4, %ecx, %xmm2
+ ; SSE41-NEXT: pinsrw $6, %eax, %xmm2
+ ; SSE41-NEXT: pextrw $6, %xmm1, %ecx
+ ; SSE41-NEXT: movd %ecx, %xmm3
+ ; SSE41-NEXT: pinsrw $2, %eax, %xmm3
+ ; SSE41-NEXT: pextrw $7, %xmm1, %ecx
+ ; SSE41-NEXT: pinsrw $4, %ecx, %xmm3
+ ; SSE41-NEXT: pinsrw $6, %eax, %xmm3
+ ; SSE41-NEXT: pxor %xmm4, %xmm4
+ ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3,4,5,6,7]
+ ; SSE41-NEXT: pinsrw $2, %eax, %xmm4
+ ; SSE41-NEXT: pextrw $1, %xmm1, %ecx
+ ; SSE41-NEXT: pinsrw $4, %ecx, %xmm4
+ ; SSE41-NEXT: pinsrw $6, %eax, %xmm4
+ ; SSE41-NEXT: movdqa %xmm4, (%rdi)
+ ; SSE41-NEXT: movdqa %xmm3, 48(%rdi)
+ ; SSE41-NEXT: movdqa %xmm2, 32(%rdi)
+ ; SSE41-NEXT: movdqa %xmm0, 16(%rdi)
+ ; SSE41-NEXT: retq
+ ;
+ ; AVX1-LABEL: PR45604:
+ ; AVX1: # %bb.0:
+ ; AVX1-NEXT: vmovdqa (%rsi), %xmm0
+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+ ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
+ ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+ ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+ ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+ ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+ ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+ ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+ ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+ ; AVX1-NEXT: vmovups %ymm0, (%rdi)
+ ; AVX1-NEXT: vmovups %ymm1, 32(%rdi)
+ ; AVX1-NEXT: vzeroupper
+ ; AVX1-NEXT: retq
+ ;
+ ; AVX2-SLOW-LABEL: PR45604:
+ ; AVX2-SLOW: # %bb.0:
+ ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0
+ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,0,0,0,0,0,0,0,0]
+ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
+ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
+ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255>
+ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,0,0,11,11,11,11,11,11,11,11]
+ ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2
+ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
+ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7]
+ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1]
+ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15]
+ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
+ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,u,u>
+ ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
+ ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+ ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1
+ ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+ ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13]
+ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
+ ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0
+ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7]
+ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, 32(%rdi)
+ ; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rdi)
+ ; AVX2-SLOW-NEXT: vzeroupper
+ ; AVX2-SLOW-NEXT: retq
+ ;
+ ; AVX2-FAST-LABEL: PR45604:
+ ; AVX2-FAST: # %bb.0:
+ ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0
+ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,2,3,6,7,2,3,6,7,12,13,14,15,16,17,20,21,18,19,22,23,18,19,22,23,28,29,30,31]
+ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[4,5,0,1,6,7,2,3,6,7,2,3,12,13,14,15,20,21,16,17,22,23,18,19,22,23,18,19,28,29,30,31]
+ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255>
+ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,0,0,0,0,0,0,11,11,11,11,11,11,11,11]
+ ; AVX2-FAST-NEXT: vpblendvb %ymm4, {{.*}}(%rip), %ymm5, %ymm4
+ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,u,255,255,0,0,u,u,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,u,u>
+ ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
+ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,4,5,6,7,10,11,14,15,10,11,14,15,24,25,28,29,20,21,22,23,26,27,30,31,26,27,30,31]
+ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[12,13,8,9,4,5,6,7,14,15,10,11,14,15,10,11,28,29,24,25,20,21,22,23,30,31,26,27,30,31,26,27]
+ ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
+ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7]
+ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7]
+ ; AVX2-FAST-NEXT: vmovdqu %ymm0, 32(%rdi)
+ ; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rdi)
+ ; AVX2-FAST-NEXT: vzeroupper
+ ; AVX2-FAST-NEXT: retq
+ %v1 = load <8 x i16>, <8 x i16>* %src, align 16
+ %v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+ store <32 x i16> %v3, <32 x i16>* %dst, align 16
+ ret void
+ }
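
For reference (not part of the diff itself): a minimal scalar sketch of the i32->i16->i32 'ZERO_EXTEND_INREG' pattern that the TODO above mentions, read as the usual trunc-then-zext idiom. The function name below is made up purely for illustration; the point is that zero-extending a 16-bit value back to 32 bits should select a single movzwl, rather than the movss-style zeroing blend seen in the SSE check lines.

; Illustrative sketch only - hypothetical example, not taken from the test file.
define i32 @zext_inreg_i16_example(i32 %x) {
  %lo = trunc i32 %x to i16    ; keep only the low 16 bits
  %ext = zext i16 %lo to i32   ; zero-extend back to i32; ideally a single movzwl
  ret i32 %ext
}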