 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE,SSE2
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE,SSE42
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512,AVX512-V4
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512-VBMI

 define i4 @reverse_cmp_v4i1(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE2-LABEL: reverse_cmp_v4i1:
@@ -221,6 +221,28 @@ define i32 @reverse_cmp_v32i1(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX2-NEXT: vpmovmskb %ymm0, %eax
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
+;
+; AVX512-V4-LABEL: reverse_cmp_v32i1:
+; AVX512-V4: # %bb.0:
+; AVX512-V4-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
+; AVX512-V4-NEXT: vpmovm2b %k0, %ymm0
+; AVX512-V4-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
+; AVX512-V4-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-V4-NEXT: vpmovb2m %ymm0, %k0
+; AVX512-V4-NEXT: kmovd %k0, %eax
+; AVX512-V4-NEXT: vzeroupper
+; AVX512-V4-NEXT: retq
+;
+; AVX512-VBMI-LABEL: reverse_cmp_v32i1:
+; AVX512-VBMI: # %bb.0:
+; AVX512-VBMI-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
+; AVX512-VBMI-NEXT: vpmovm2b %k0, %ymm0
+; AVX512-VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512-VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512-VBMI-NEXT: vpmovb2m %ymm0, %k0
+; AVX512-VBMI-NEXT: kmovd %k0, %eax
+; AVX512-VBMI-NEXT: vzeroupper
+; AVX512-VBMI-NEXT: retq
 %cmp = icmp eq <32 x i8> %a0, %a1
 %mask = bitcast <32 x i1> %cmp to i32
 %rev = tail call i32 @llvm.bitreverse.i32(i32 %mask)
@@ -306,6 +328,28 @@ define i64 @reverse_cmp_v64i1(<64 x i8> %a0, <64 x i8> %a1) {
 ; AVX2-NEXT: orq %rcx, %rax
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
+;
+; AVX512-V4-LABEL: reverse_cmp_v64i1:
+; AVX512-V4: # %bb.0:
+; AVX512-V4-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
+; AVX512-V4-NEXT: vpmovm2b %k0, %zmm0
+; AVX512-V4-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48]
+; AVX512-V4-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
+; AVX512-V4-NEXT: vpmovb2m %zmm0, %k0
+; AVX512-V4-NEXT: kmovq %k0, %rax
+; AVX512-V4-NEXT: vzeroupper
+; AVX512-V4-NEXT: retq
+;
+; AVX512-VBMI-LABEL: reverse_cmp_v64i1:
+; AVX512-VBMI: # %bb.0:
+; AVX512-VBMI-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
+; AVX512-VBMI-NEXT: vpmovm2b %k0, %zmm0
+; AVX512-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512-VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512-VBMI-NEXT: vpmovb2m %zmm0, %k0
+; AVX512-VBMI-NEXT: kmovq %k0, %rax
+; AVX512-VBMI-NEXT: vzeroupper
+; AVX512-VBMI-NEXT: retq
 %cmp = icmp eq <64 x i8> %a0, %a1
 %mask = bitcast <64 x i1> %cmp to i64
 %rev = tail call i64 @llvm.bitreverse.i64(i64 %mask)
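For context, a minimal sketch of the test function the new v32i1 check lines exercise, assuming the trailing ret and the intrinsic declaration that the hunk cuts off before (only the icmp/bitcast/bitreverse lines are visible in the diff; reverse_cmp_v64i1 is the <64 x i8>/i64 analogue):

declare i32 @llvm.bitreverse.i32(i32)

define i32 @reverse_cmp_v32i1(<32 x i8> %a0, <32 x i8> %a1) {
  ; Compare the byte lanes, pack the <32 x i1> result into an i32 mask,
  ; then reverse the bit order of that mask.
  %cmp = icmp eq <32 x i8> %a0, %a1
  %mask = bitcast <32 x i1> %cmp to i32
  %rev = tail call i32 @llvm.bitreverse.i32(i32 %mask)
  ret i32 %rev            ; assumed: not shown in the hunk
}

As the new checks show, the plain x86-64-v4 lowering needs a vpshufb plus a lane permute (vpermq / vshufi64x2) to reverse the mask bytes, while the AVX512VBMI run collapses that into a single vpermb with a reversed index vector.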