@@ -159,26 +159,44 @@ define void @store_i64_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
159
159
; AVX2-ONLY-NEXT: vzeroupper
160
160
; AVX2-ONLY-NEXT: retq
161
161
;
162
- ; AVX512-LABEL: store_i64_stride4_vf4:
163
- ; AVX512: # %bb.0:
164
- ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
165
- ; AVX512-NEXT: vmovdqa (%rsi), %ymm1
166
- ; AVX512-NEXT: vmovdqa (%rdx), %ymm2
167
- ; AVX512-NEXT: vmovdqa (%rcx), %ymm3
168
- ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
169
- ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
170
- ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
171
- ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
172
- ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
173
- ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
174
- ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
175
- ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
176
- ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm1
177
- ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
178
- ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r8)
179
- ; AVX512-NEXT: vmovdqa64 %zmm1, (%r8)
180
- ; AVX512-NEXT: vzeroupper
181
- ; AVX512-NEXT: retq
162
+ ; AVX512-SLOW-LABEL: store_i64_stride4_vf4:
163
+ ; AVX512-SLOW: # %bb.0:
164
+ ; AVX512-SLOW-NEXT: vmovdqa (%rdi), %ymm0
165
+ ; AVX512-SLOW-NEXT: vmovdqa (%rsi), %ymm1
166
+ ; AVX512-SLOW-NEXT: vmovdqa (%rdx), %ymm2
167
+ ; AVX512-SLOW-NEXT: vmovdqa (%rcx), %ymm3
168
+ ; AVX512-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
169
+ ; AVX512-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
170
+ ; AVX512-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
171
+ ; AVX512-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
172
+ ; AVX512-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
173
+ ; AVX512-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
174
+ ; AVX512-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
175
+ ; AVX512-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
176
+ ; AVX512-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm1
177
+ ; AVX512-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
178
+ ; AVX512-SLOW-NEXT: vmovdqa64 %zmm0, 64(%r8)
179
+ ; AVX512-SLOW-NEXT: vmovdqa64 %zmm1, (%r8)
180
+ ; AVX512-SLOW-NEXT: vzeroupper
181
+ ; AVX512-SLOW-NEXT: retq
182
+ ;
183
+ ; AVX512-FAST-LABEL: store_i64_stride4_vf4:
184
+ ; AVX512-FAST: # %bb.0:
185
+ ; AVX512-FAST-NEXT: vmovdqa (%rdi), %ymm0
186
+ ; AVX512-FAST-NEXT: vmovdqa (%rsi), %ymm1
187
+ ; AVX512-FAST-NEXT: vmovdqa (%rdx), %ymm2
188
+ ; AVX512-FAST-NEXT: vmovdqa (%rcx), %ymm3
189
+ ; AVX512-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
190
+ ; AVX512-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
191
+ ; AVX512-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
192
+ ; AVX512-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
193
+ ; AVX512-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,10,1,9,3,11]
194
+ ; AVX512-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
195
+ ; AVX512-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
196
+ ; AVX512-FAST-NEXT: vmovdqa64 %zmm0, 64(%r8)
197
+ ; AVX512-FAST-NEXT: vmovdqa64 %zmm4, (%r8)
198
+ ; AVX512-FAST-NEXT: vzeroupper
199
+ ; AVX512-FAST-NEXT: retq
182
200
%in.vec0 = load <4 x i64 >, ptr %in.vecptr0 , align 64
183
201
%in.vec1 = load <4 x i64 >, ptr %in.vecptr1 , align 64
184
202
%in.vec2 = load <4 x i64 >, ptr %in.vecptr2 , align 64
0 commit comments