 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
 
 ; These patterns are produced by LoopVectorizer for interleaved loads.
 
@@ -46,59 +46,32 @@ define void @load_i16_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX1-NEXT: vmovd %xmm0, (%r8)
 ; AVX1-NEXT: retq
 ;
-; AVX512F-SLOW-LABEL: load_i16_stride4_vf2:
-; AVX512F-SLOW: # %bb.0:
-; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
-; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; AVX512F-SLOW-NEXT: vpmovqw %xmm0, (%rsi)
-; AVX512F-SLOW-NEXT: vmovd %xmm1, (%rdx)
-; AVX512F-SLOW-NEXT: vmovd %xmm3, (%rcx)
-; AVX512F-SLOW-NEXT: vmovd %xmm2, (%r8)
-; AVX512F-SLOW-NEXT: retq
-;
-; AVX512F-FAST-LABEL: load_i16_stride4_vf2:
-; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; AVX512F-FAST-NEXT: vpmovqw %xmm0, (%rsi)
-; AVX512F-FAST-NEXT: vmovd %xmm1, (%rdx)
-; AVX512F-FAST-NEXT: vmovd %xmm3, (%rcx)
-; AVX512F-FAST-NEXT: vmovd %xmm2, (%r8)
-; AVX512F-FAST-NEXT: retq
-;
-; AVX512BW-SLOW-LABEL: load_i16_stride4_vf2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
-; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; AVX512BW-SLOW-NEXT: vpmovqw %xmm0, (%rsi)
-; AVX512BW-SLOW-NEXT: vmovd %xmm1, (%rdx)
-; AVX512BW-SLOW-NEXT: vmovd %xmm3, (%rcx)
-; AVX512BW-SLOW-NEXT: vmovd %xmm2, (%r8)
-; AVX512BW-SLOW-NEXT: retq
+; AVX512-SLOW-LABEL: load_i16_stride4_vf2:
+; AVX512-SLOW: # %bb.0:
+; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512-SLOW-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512-SLOW-NEXT: vmovd %xmm1, (%rdx)
+; AVX512-SLOW-NEXT: vmovd %xmm3, (%rcx)
+; AVX512-SLOW-NEXT: vmovd %xmm2, (%r8)
+; AVX512-SLOW-NEXT: retq
 ;
-; AVX512BW-FAST-LABEL: load_i16_stride4_vf2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX512BW-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
-; AVX512BW-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; AVX512BW-FAST-NEXT: vpmovqw %xmm0, (%rsi)
-; AVX512BW-FAST-NEXT: vmovd %xmm1, (%rdx)
-; AVX512BW-FAST-NEXT: vmovd %xmm3, (%rcx)
-; AVX512BW-FAST-NEXT: vmovd %xmm2, (%r8)
-; AVX512BW-FAST-NEXT: retq
+; AVX512-FAST-LABEL: load_i16_stride4_vf2:
+; AVX512-FAST: # %bb.0:
+; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512-FAST-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512-FAST-NEXT: vmovd %xmm1, (%rdx)
+; AVX512-FAST-NEXT: vmovd %xmm3, (%rcx)
+; AVX512-FAST-NEXT: vmovd %xmm2, (%r8)
+; AVX512-FAST-NEXT: retq
 %wide.vec = load <8 x i16>, ptr %in.vec, align 64
 %strided.vec0 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <2 x i32> <i32 0, i32 4>
 %strided.vec1 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <2 x i32> <i32 1, i32 5>
@@ -5451,8 +5424,10 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX: {{.*}}
 ; AVX2: {{.*}}
 ; AVX2-ONLY: {{.*}}
+; AVX512BW-FAST: {{.*}}
 ; AVX512BW-ONLY-FAST: {{.*}}
 ; AVX512BW-ONLY-SLOW: {{.*}}
+; AVX512BW-SLOW: {{.*}}
 ; AVX512DQ-FAST: {{.*}}
 ; AVX512DQ-SLOW: {{.*}}
 ; AVX512DQBW-FAST: {{.*}}
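
For context, and not part of the diff itself: the wide load <8 x i16> followed by strided shufflevectors in the function body above is the pattern the LoopVectorizer emits for a stride-4 de-interleaving loop over i16 data. A minimal C sketch of such a source loop follows; the function and parameter names are hypothetical and chosen only to illustrate the access pattern this test exercises, not taken from the PR or from LLVM.

/* Hypothetical scalar source: each output stream reads every 4th i16
   element. When vectorized, the four strided accesses become one wide
   interleaved load per iteration plus strided shuffles, which is what
   the check prefixes above verify the x86 backend lowers efficiently. */
void deinterleave_i16_stride4(const short *in, short *out0, short *out1,
                              short *out2, short *out3, int n) {
  for (int i = 0; i < n; ++i) {
    out0[i] = in[4 * i + 0];
    out1[i] = in[4 * i + 1];
    out2[i] = in[4 * i + 2];
    out3[i] = in[4 * i + 3];
  }
}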