6
6
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=X64,AVX512
7
7
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86
8
8
9
+ declare i1 @llvm.vector.reduce.and.v2i1 (<2 x i1 >)
9
10
declare i1 @llvm.vector.reduce.and.v4i1 (<4 x i1 >)
10
11
declare i1 @llvm.vector.reduce.and.v8i1 (<8 x i1 >)
11
12
12
13
; FIXME: All four versions are semantically equivalent and should produce the same asm as the scalar version.
13
14
15
; intrinsic_v2i8 - "all lanes equal" test written with the and-reduction intrinsic.
; Loads a <2 x i8> from each pointer argument (both align 1), compares the
; vectors lane by lane for equality, and AND-reduces the <2 x i1> mask with
; @llvm.vector.reduce.and.v2i1. Returns i1 true only when both lanes match.
; The comment lines directly below pin the expected llc output for the check
; prefixes SSE2, SSE42, AVX, AVX512 and X86.
; NOTE(review): these assertions look script-generated; presumably they should
; be regenerated rather than edited by hand - confirm against the file header.
+ define i1 @intrinsic_v2i8 (ptr align 1 %arg , ptr align 1 %arg1 ) {
16
+ ; SSE2-LABEL: intrinsic_v2i8:
17
+ ; SSE2: # %bb.0: # %bb
18
+ ; SSE2-NEXT: movzwl (%rsi), %eax
19
+ ; SSE2-NEXT: movd %eax, %xmm0
20
+ ; SSE2-NEXT: movzwl (%rdi), %eax
21
+ ; SSE2-NEXT: movd %eax, %xmm1
22
+ ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
23
+ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
24
+ ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
25
+ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
26
+ ; SSE2-NEXT: movmskpd %xmm0, %eax
27
+ ; SSE2-NEXT: cmpb $3, %al
28
+ ; SSE2-NEXT: sete %al
29
+ ; SSE2-NEXT: retq
30
+ ;
31
+ ; SSE42-LABEL: intrinsic_v2i8:
32
+ ; SSE42: # %bb.0: # %bb
33
+ ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
34
+ ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
35
+ ; SSE42-NEXT: psubq %xmm1, %xmm0
36
+ ; SSE42-NEXT: ptest %xmm0, %xmm0
37
+ ; SSE42-NEXT: sete %al
38
+ ; SSE42-NEXT: retq
39
+ ;
40
+ ; AVX-LABEL: intrinsic_v2i8:
41
+ ; AVX: # %bb.0: # %bb
42
+ ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
43
+ ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
44
+ ; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0
45
+ ; AVX-NEXT: vptest %xmm0, %xmm0
46
+ ; AVX-NEXT: sete %al
47
+ ; AVX-NEXT: retq
48
+ ;
49
+ ; AVX512-LABEL: intrinsic_v2i8:
50
+ ; AVX512: # %bb.0: # %bb
51
+ ; AVX512-NEXT: movzwl (%rsi), %eax
52
+ ; AVX512-NEXT: vmovd %eax, %xmm0
53
+ ; AVX512-NEXT: movzwl (%rdi), %eax
54
+ ; AVX512-NEXT: vmovd %eax, %xmm1
55
+ ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
56
+ ; AVX512-NEXT: knotw %k0, %k0
57
+ ; AVX512-NEXT: kmovd %k0, %eax
58
+ ; AVX512-NEXT: testb $3, %al
59
+ ; AVX512-NEXT: sete %al
60
+ ; AVX512-NEXT: retq
61
+ ;
62
+ ; X86-LABEL: intrinsic_v2i8:
63
+ ; X86: # %bb.0: # %bb
64
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
65
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
66
+ ; X86-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
67
+ ; X86-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
68
+ ; X86-NEXT: vpsubq %xmm1, %xmm0, %xmm0
69
+ ; X86-NEXT: vptest %xmm0, %xmm0
70
+ ; X86-NEXT: sete %al
71
+ ; X86-NEXT: retl
72
+ bb:
; Note the naming: %lhs is read through %arg1 and %rhs through %arg.
73
+ %lhs = load <2 x i8 >, ptr %arg1 , align 1
74
+ %rhs = load <2 x i8 >, ptr %arg , align 1
; Lane-wise equality yields a <2 x i1> mask.
75
+ %cmp = icmp eq <2 x i8 > %lhs , %rhs
; AND-reduce the mask so the result is true only if every lane matched.
76
+ %all_eq = call i1 @llvm.vector.reduce.and.v2i1 (<2 x i1 > %cmp )
77
+ ret i1 %all_eq
78
+ }
79
+
14
80
define i1 @intrinsic_v4i8 (ptr align 1 %arg , ptr align 1 %arg1 ) {
15
81
; SSE2-LABEL: intrinsic_v4i8:
16
82
; SSE2: # %bb.0: # %bb
120
186
ret i1 %all_eq
121
187
}
122
188
123
- define i1 @vector_version (ptr align 1 %arg , ptr align 1 %arg1 ) {
124
- ; SSE2-LABEL: vector_version:
189
; vector_version_v2i8 - "all lanes equal" test written without the reduction
; intrinsic. Loads a <2 x i8> from each pointer argument (both align 1), builds
; a lane-wise "not equal" mask, bitcasts that <2 x i1> mask to an i2 scalar and
; returns true when the scalar is zero, i.e. when no lane differed.
; The comment lines directly below pin the expected llc output for the check
; prefixes SSE2, SSE42, AVX, AVX512 and X86.
; NOTE(review): these assertions look script-generated; presumably they should
; be regenerated rather than edited by hand - confirm against the file header.
+ define i1 @vector_version_v2i8 (ptr align 1 %arg , ptr align 1 %arg1 ) {
190
+ ; SSE2-LABEL: vector_version_v2i8:
191
+ ; SSE2: # %bb.0: # %bb
192
+ ; SSE2-NEXT: movzwl (%rsi), %eax
193
+ ; SSE2-NEXT: movd %eax, %xmm0
194
+ ; SSE2-NEXT: movzwl (%rdi), %eax
195
+ ; SSE2-NEXT: movd %eax, %xmm1
196
+ ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
197
+ ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
198
+ ; SSE2-NEXT: pxor %xmm1, %xmm0
199
+ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
200
+ ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
201
+ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
202
+ ; SSE2-NEXT: movmskpd %xmm0, %eax
203
+ ; SSE2-NEXT: testl %eax, %eax
204
+ ; SSE2-NEXT: sete %al
205
+ ; SSE2-NEXT: retq
206
+ ;
207
+ ; SSE42-LABEL: vector_version_v2i8:
208
+ ; SSE42: # %bb.0: # %bb
209
+ ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
210
+ ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
211
+ ; SSE42-NEXT: psubq %xmm1, %xmm0
212
+ ; SSE42-NEXT: ptest %xmm0, %xmm0
213
+ ; SSE42-NEXT: sete %al
214
+ ; SSE42-NEXT: retq
215
+ ;
216
+ ; AVX-LABEL: vector_version_v2i8:
217
+ ; AVX: # %bb.0: # %bb
218
+ ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
219
+ ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
220
+ ; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0
221
+ ; AVX-NEXT: vptest %xmm0, %xmm0
222
+ ; AVX-NEXT: sete %al
223
+ ; AVX-NEXT: retq
224
+ ;
225
+ ; AVX512-LABEL: vector_version_v2i8:
226
+ ; AVX512: # %bb.0: # %bb
227
+ ; AVX512-NEXT: movzwl (%rsi), %eax
228
+ ; AVX512-NEXT: vmovd %eax, %xmm0
229
+ ; AVX512-NEXT: movzwl (%rdi), %eax
230
+ ; AVX512-NEXT: vmovd %eax, %xmm1
231
+ ; AVX512-NEXT: vpcmpneqb %xmm1, %xmm0, %k0
232
+ ; AVX512-NEXT: kmovd %k0, %eax
233
+ ; AVX512-NEXT: testb $3, %al
234
+ ; AVX512-NEXT: sete %al
235
+ ; AVX512-NEXT: retq
236
+ ;
237
+ ; X86-LABEL: vector_version_v2i8:
238
+ ; X86: # %bb.0: # %bb
239
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
240
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
241
+ ; X86-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
242
+ ; X86-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
243
+ ; X86-NEXT: vpsubq %xmm1, %xmm0, %xmm0
244
+ ; X86-NEXT: vptest %xmm0, %xmm0
245
+ ; X86-NEXT: sete %al
246
+ ; X86-NEXT: retl
247
+ bb:
; Note the naming: %lhs is read through %arg1 and %rhs through %arg.
248
+ %lhs = load <2 x i8 >, ptr %arg1 , align 1
249
+ %rhs = load <2 x i8 >, ptr %arg , align 1
; Per-lane mismatch mask: a set bit marks a lane that differs.
250
+ %any_ne = icmp ne <2 x i8 > %lhs , %rhs
; Pack the two mask bits into an i2 so they can be tested with one scalar compare.
251
+ %any_ne_scalar = bitcast <2 x i1 > %any_ne to i2
; Zero means no mismatch bit is set, i.e. all lanes are equal.
252
+ %all_eq = icmp eq i2 %any_ne_scalar , 0
253
+ ret i1 %all_eq
254
+ }
255
+
256
+ define i1 @vector_version_v4i8 (ptr align 1 %arg , ptr align 1 %arg1 ) {
257
+ ; SSE2-LABEL: vector_version_v4i8:
125
258
; SSE2: # %bb.0: # %bb
126
259
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
127
260
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -135,7 +268,7 @@ define i1 @vector_version(ptr align 1 %arg, ptr align 1 %arg1) {
135
268
; SSE2-NEXT: sete %al
136
269
; SSE2-NEXT: retq
137
270
;
138
- ; SSE42-LABEL: vector_version :
271
+ ; SSE42-LABEL: vector_version_v4i8 :
139
272
; SSE42: # %bb.0: # %bb
140
273
; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
141
274
; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -144,7 +277,7 @@ define i1 @vector_version(ptr align 1 %arg, ptr align 1 %arg1) {
144
277
; SSE42-NEXT: sete %al
145
278
; SSE42-NEXT: retq
146
279
;
147
- ; AVX-LABEL: vector_version :
280
+ ; AVX-LABEL: vector_version_v4i8 :
148
281
; AVX: # %bb.0: # %bb
149
282
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
150
283
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -153,7 +286,7 @@ define i1 @vector_version(ptr align 1 %arg, ptr align 1 %arg1) {
153
286
; AVX-NEXT: sete %al
154
287
; AVX-NEXT: retq
155
288
;
156
- ; AVX512-LABEL: vector_version :
289
+ ; AVX512-LABEL: vector_version_v4i8 :
157
290
; AVX512: # %bb.0: # %bb
158
291
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
159
292
; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -163,7 +296,7 @@ define i1 @vector_version(ptr align 1 %arg, ptr align 1 %arg1) {
163
296
; AVX512-NEXT: sete %al
164
297
; AVX512-NEXT: retq
165
298
;
166
- ; X86-LABEL: vector_version :
299
+ ; X86-LABEL: vector_version_v4i8 :
167
300
; X86: # %bb.0: # %bb
168
301
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
169
302
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
182
315
ret i1 %all_eq
183
316
}
184
317
318
; vector_version_v8i8 - eight-lane variant of the mask-based "all lanes equal"
; test. Loads a <8 x i8> from each pointer argument (both align 1), builds a
; lane-wise "not equal" mask, bitcasts that <8 x i1> mask to an i8 scalar and
; returns true when the scalar is zero, i.e. when no lane differed.
; The comment lines directly below pin the expected llc output for the check
; prefixes SSE, AVX, AVX512 and X86.
; NOTE(review): these assertions look script-generated; presumably they should
; be regenerated rather than edited by hand - confirm against the file header.
+ define i1 @vector_version_v8i8 (ptr align 1 %arg , ptr align 1 %arg1 ) {
319
+ ; SSE-LABEL: vector_version_v8i8:
320
+ ; SSE: # %bb.0: # %bb
321
+ ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
322
+ ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
323
+ ; SSE-NEXT: pcmpeqb %xmm0, %xmm1
324
+ ; SSE-NEXT: pmovmskb %xmm1, %eax
325
+ ; SSE-NEXT: xorb $-1, %al
326
+ ; SSE-NEXT: sete %al
327
+ ; SSE-NEXT: retq
328
+ ;
329
+ ; AVX-LABEL: vector_version_v8i8:
330
+ ; AVX: # %bb.0: # %bb
331
+ ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
332
+ ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
333
+ ; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
334
+ ; AVX-NEXT: vpmovmskb %xmm0, %eax
335
+ ; AVX-NEXT: xorb $-1, %al
336
+ ; AVX-NEXT: sete %al
337
+ ; AVX-NEXT: retq
338
+ ;
339
+ ; AVX512-LABEL: vector_version_v8i8:
340
+ ; AVX512: # %bb.0: # %bb
341
+ ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
342
+ ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
343
+ ; AVX512-NEXT: vpcmpneqb %xmm1, %xmm0, %k0
344
+ ; AVX512-NEXT: kortestb %k0, %k0
345
+ ; AVX512-NEXT: sete %al
346
+ ; AVX512-NEXT: retq
347
+ ;
348
+ ; X86-LABEL: vector_version_v8i8:
349
+ ; X86: # %bb.0: # %bb
350
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
351
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
352
+ ; X86-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
353
+ ; X86-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
354
+ ; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
355
+ ; X86-NEXT: vpmovmskb %xmm0, %eax
356
+ ; X86-NEXT: xorb $-1, %al
357
+ ; X86-NEXT: sete %al
358
+ ; X86-NEXT: retl
359
+ bb:
; Note the naming: %lhs is read through %arg1 and %rhs through %arg.
360
+ %lhs = load <8 x i8 >, ptr %arg1 , align 1
361
+ %rhs = load <8 x i8 >, ptr %arg , align 1
; Per-lane mismatch mask: a set bit marks a lane that differs.
362
+ %any_ne = icmp ne <8 x i8 > %lhs , %rhs
; Pack the eight mask bits into an i8 so they can be tested with one scalar compare.
363
+ %any_ne_scalar = bitcast <8 x i1 > %any_ne to i8
; Zero means no mismatch bit is set, i.e. all lanes are equal.
364
+ %all_eq = icmp eq i8 %any_ne_scalar , 0
365
+ ret i1 %all_eq
366
+ }
367
+
368
; mixed_version_v2i8 - vector loads, scalar compare. Loads a <2 x i8> from each
; pointer argument (both align 1), bitcasts each vector to an i16 and returns
; the result of a single i16 equality compare.
; The comment lines directly below pin the expected llc output for the check
; prefixes X64 and X86; both expect a 16-bit load followed by one cmpw against
; memory.
; NOTE(review): these assertions look script-generated; presumably they should
; be regenerated rather than edited by hand - confirm against the file header.
+ define i1 @mixed_version_v2i8 (ptr align 1 %arg , ptr align 1 %arg1 ) {
369
+ ; X64-LABEL: mixed_version_v2i8:
370
+ ; X64: # %bb.0: # %bb
371
+ ; X64-NEXT: movzwl (%rsi), %eax
372
+ ; X64-NEXT: cmpw (%rdi), %ax
373
+ ; X64-NEXT: sete %al
374
+ ; X64-NEXT: retq
375
+ ;
376
+ ; X86-LABEL: mixed_version_v2i8:
377
+ ; X86: # %bb.0: # %bb
378
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
379
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
380
+ ; X86-NEXT: movzwl (%ecx), %ecx
381
+ ; X86-NEXT: cmpw (%eax), %cx
382
+ ; X86-NEXT: sete %al
383
+ ; X86-NEXT: retl
384
+ bb:
; Note the naming: %lhs is read through %arg1 and %rhs through %arg.
385
+ %lhs = load <2 x i8 >, ptr %arg1 , align 1
386
+ %rhs = load <2 x i8 >, ptr %arg , align 1
; Reinterpret each two-byte vector as one i16 so equality needs a single compare.
387
+ %lhs_s = bitcast <2 x i8 > %lhs to i16
388
+ %rhs_s = bitcast <2 x i8 > %rhs to i16
389
+ %all_eq = icmp eq i16 %lhs_s , %rhs_s
390
+ ret i1 %all_eq
391
+ }
392
+
185
393
define i1 @mixed_version_v4i8 (ptr align 1 %arg , ptr align 1 %arg1 ) {
186
394
; X64-LABEL: mixed_version_v4i8:
187
395
; X64: # %bb.0: # %bb
@@ -235,15 +443,38 @@ bb:
235
443
ret i1 %all_eq
236
444
}
237
445
238
- define i1 @scalar_version (ptr align 1 %arg , ptr align 1 %arg1 ) {
239
- ; X64-LABEL: scalar_version:
446
; scalar_version_i16 - plain scalar form of the equality test. Loads an i16
; from each pointer argument (both align 1) and returns the result of an i16
; equality compare.
; The comment lines directly below pin the expected llc output for the check
; prefixes X64 and X86; both expect a 16-bit load followed by one cmpw against
; memory.
; NOTE(review): these assertions look script-generated; presumably they should
; be regenerated rather than edited by hand - confirm against the file header.
+ define i1 @scalar_version_i16 (ptr align 1 %arg , ptr align 1 %arg1 ) {
447
+ ; X64-LABEL: scalar_version_i16:
448
+ ; X64: # %bb.0: # %bb
449
+ ; X64-NEXT: movzwl (%rsi), %eax
450
+ ; X64-NEXT: cmpw (%rdi), %ax
451
+ ; X64-NEXT: sete %al
452
+ ; X64-NEXT: retq
453
+ ;
454
+ ; X86-LABEL: scalar_version_i16:
455
+ ; X86: # %bb.0: # %bb
456
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
457
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
458
+ ; X86-NEXT: movzwl (%ecx), %ecx
459
+ ; X86-NEXT: cmpw (%eax), %cx
460
+ ; X86-NEXT: sete %al
461
+ ; X86-NEXT: retl
462
+ bb:
; Note the naming: %lhs is read through %arg1 and %rhs through %arg.
463
+ %lhs = load i16 , ptr %arg1 , align 1
464
+ %rhs = load i16 , ptr %arg , align 1
465
+ %all_eq = icmp eq i16 %lhs , %rhs
466
+ ret i1 %all_eq
467
+ }
468
+
469
+ define i1 @scalar_version_i32 (ptr align 1 %arg , ptr align 1 %arg1 ) {
470
+ ; X64-LABEL: scalar_version_i32:
240
471
; X64: # %bb.0: # %bb
241
472
; X64-NEXT: movl (%rsi), %eax
242
473
; X64-NEXT: cmpl (%rdi), %eax
243
474
; X64-NEXT: sete %al
244
475
; X64-NEXT: retq
245
476
;
246
- ; X86-LABEL: scalar_version :
477
+ ; X86-LABEL: scalar_version_i32 :
247
478
; X86: # %bb.0: # %bb
248
479
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
249
480
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
257
488
%all_eq = icmp eq i32 %lhs , %rhs
258
489
ret i1 %all_eq
259
490
}
491
+
492
; scalar_version_i64 - plain scalar form of the equality test at 64 bits. Loads
; an i64 from each pointer argument (both align 1) and returns the result of an
; i64 equality compare.
; The comment lines directly below pin the expected llc output for the check
; prefixes X64 and X86. X64 expects one 64-bit load and a cmpq against memory;
; X86 expects the value handled as two 32-bit halves that are each xor'ed with
; the other operand and then or'ed together before the sete.
; NOTE(review): these assertions look script-generated; presumably they should
; be regenerated rather than edited by hand - confirm against the file header.
+ define i1 @scalar_version_i64 (ptr align 1 %arg , ptr align 1 %arg1 ) {
493
+ ; X64-LABEL: scalar_version_i64:
494
+ ; X64: # %bb.0: # %bb
495
+ ; X64-NEXT: movq (%rsi), %rax
496
+ ; X64-NEXT: cmpq (%rdi), %rax
497
+ ; X64-NEXT: sete %al
498
+ ; X64-NEXT: retq
499
+ ;
500
+ ; X86-LABEL: scalar_version_i64:
501
+ ; X86: # %bb.0: # %bb
502
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
503
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
504
+ ; X86-NEXT: movl (%ecx), %edx
505
+ ; X86-NEXT: movl 4(%ecx), %ecx
506
+ ; X86-NEXT: xorl 4(%eax), %ecx
507
+ ; X86-NEXT: xorl (%eax), %edx
508
+ ; X86-NEXT: orl %ecx, %edx
509
+ ; X86-NEXT: sete %al
510
+ ; X86-NEXT: retl
511
+ bb:
; Note the naming: %lhs is read through %arg1 and %rhs through %arg.
512
+ %lhs = load i64 , ptr %arg1 , align 1
513
+ %rhs = load i64 , ptr %arg , align 1
514
+ %all_eq = icmp eq i64 %lhs , %rhs
515
+ ret i1 %all_eq
516
+ }
0 commit comments