@@ -1132,28 +1132,50 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
; AVX1-NEXT: vmovdqu %xmm2, 16(%rdi)
; AVX1-NEXT: retq
;
- ; AVX2-LABEL: interleave_24i16_in:
- ; AVX2: # %bb.0:
- ; AVX2-NEXT: vmovdqu (%rsi), %xmm0
- ; AVX2-NEXT: vmovdqu (%rdx), %xmm1
- ; AVX2-NEXT: vmovdqu (%rcx), %xmm2
- ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
- ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
- ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
- ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
- ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
- ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
- ; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm4
- ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
- ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
- ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
- ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
- ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
- ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
- ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdi)
- ; AVX2-NEXT: vmovdqu %ymm3, (%rdi)
- ; AVX2-NEXT: vzeroupper
- ; AVX2-NEXT: retq
+ ; AVX2-SLOW-LABEL: interleave_24i16_in:
+ ; AVX2-SLOW: # %bb.0:
+ ; AVX2-SLOW-NEXT: vmovdqu (%rsi), %xmm0
+ ; AVX2-SLOW-NEXT: vmovdqu (%rdx), %xmm1
+ ; AVX2-SLOW-NEXT: vmovdqu (%rcx), %xmm2
+ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
+ ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,18,19,u,u,24,25,20,21,u,u]
+ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
+ ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
+ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
+ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
+ ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4
+ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
+ ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+ ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+ ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
+ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+ ; AVX2-SLOW-NEXT: vmovdqu %xmm0, 32(%rdi)
+ ; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rdi)
+ ; AVX2-SLOW-NEXT: vzeroupper
+ ; AVX2-SLOW-NEXT: retq
+ ;
+ ; AVX2-FAST-LABEL: interleave_24i16_in:
+ ; AVX2-FAST: # %bb.0:
+ ; AVX2-FAST-NEXT: vmovdqu (%rsi), %xmm0
+ ; AVX2-FAST-NEXT: vmovdqu (%rdx), %xmm1
+ ; AVX2-FAST-NEXT: vmovdqu (%rcx), %xmm2
+ ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
+ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
+ ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm4
+ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6]
+ ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3
+ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27]
+ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
+ ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+ ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
+ ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+ ; AVX2-FAST-NEXT: vmovdqu %xmm0, 32(%rdi)
+ ; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rdi)
+ ; AVX2-FAST-NEXT: vzeroupper
+ ; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: interleave_24i16_in:
; XOP: # %bb.0: