@@ -269,6 +269,164 @@ define <4 x i64> @bitselect_v4i64_mm(<4 x i64>* nocapture readonly, <4 x i64>* n
269
269
ret <4 x i64 > %7
270
270
}
271
271
272
; Bitwise select between two <4 x i64> vectors (%a0, %a1) using a scalar i64
; mask (%a2, passed by value) that is broadcast to every lane -
;   result = (%a0 & splat(%a2)) | (%a1 & splat(~%a2))
; The complement is computed on the un-broadcast vector (lane 0 only, the other
; xor lanes are undef) and is then broadcast on its own.
; The ';'-prefixed lines inside the body are per-target FileCheck expectations
; for the generated x86 assembly. They look auto-generated (presumably by
; update_llc_test_checks.py - confirm against the file header), so regenerate
; them instead of editing them by hand.
+ define <4 x i64 > @bitselect_v4i64_broadcast_rrr (<4 x i64 > %a0 , <4 x i64 > %a1 , i64 %a2 ) {
273
+ ; SSE-LABEL: bitselect_v4i64_broadcast_rrr:
274
+ ; SSE: # %bb.0:
275
+ ; SSE-NEXT: movq %rdi, %xmm4
276
+ ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
277
+ ; SSE-NEXT: pcmpeqd %xmm6, %xmm6
278
+ ; SSE-NEXT: pxor %xmm4, %xmm6
279
+ ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,1]
280
+ ; SSE-NEXT: pand %xmm5, %xmm1
281
+ ; SSE-NEXT: pand %xmm5, %xmm0
282
+ ; SSE-NEXT: pand %xmm4, %xmm3
283
+ ; SSE-NEXT: por %xmm3, %xmm1
284
+ ; SSE-NEXT: pand %xmm4, %xmm2
285
+ ; SSE-NEXT: por %xmm2, %xmm0
286
+ ; SSE-NEXT: retq
287
+ ;
288
+ ; XOP-LABEL: bitselect_v4i64_broadcast_rrr:
289
+ ; XOP: # %bb.0:
290
+ ; XOP-NEXT: vmovq %rdi, %xmm2
291
+ ; XOP-NEXT: vmovq %rdi, %xmm3
292
+ ; XOP-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0]
293
+ ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
294
+ ; XOP-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
295
+ ; XOP-NEXT: vpxor %xmm4, %xmm3, %xmm3
296
+ ; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
297
+ ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
298
+ ; XOP-NEXT: vandps %ymm2, %ymm0, %ymm0
299
+ ; XOP-NEXT: vandps %ymm3, %ymm1, %ymm1
300
+ ; XOP-NEXT: vorps %ymm1, %ymm0, %ymm0
301
+ ; XOP-NEXT: retq
302
+ ;
303
+ ; AVX1-LABEL: bitselect_v4i64_broadcast_rrr:
304
+ ; AVX1: # %bb.0:
305
+ ; AVX1-NEXT: vmovq %rdi, %xmm2
306
+ ; AVX1-NEXT: vmovq %rdi, %xmm3
307
+ ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0]
308
+ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
309
+ ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
310
+ ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
311
+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
312
+ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
313
+ ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
314
+ ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
315
+ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
316
+ ; AVX1-NEXT: retq
317
+ ;
318
+ ; AVX2-LABEL: bitselect_v4i64_broadcast_rrr:
319
+ ; AVX2: # %bb.0:
320
+ ; AVX2-NEXT: vmovq %rdi, %xmm2
321
+ ; AVX2-NEXT: vpbroadcastq %xmm2, %ymm3
322
+ ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
323
+ ; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2
324
+ ; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2
325
+ ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
326
+ ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
327
+ ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
328
+ ; AVX2-NEXT: retq
329
+ ;
330
+ ; AVX512F-LABEL: bitselect_v4i64_broadcast_rrr:
331
+ ; AVX512F: # %bb.0:
332
+ ; AVX512F-NEXT: vmovq %rdi, %xmm2
333
+ ; AVX512F-NEXT: vmovq %rdi, %xmm3
334
+ ; AVX512F-NEXT: vpbroadcastq %xmm3, %ymm3
335
+ ; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
336
+ ; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
337
+ ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
338
+ ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
339
+ ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
340
+ ; AVX512F-NEXT: retq
; IR under test, value by value -
;   %1 = %a2 placed in lane 0 of an otherwise undef vector
;   %2 = lane 0 of %1 copied to all four lanes (all-zero shuffle mask)
;   %3 = %1 with lane 0 bitwise-inverted (xor with -1), lanes 1-3 stay undef
;   %4 = lane 0 of %3 copied to all four lanes
;   %5 = %a0 & %2, %6 = %a1 & %4, %7 = %5 | %6 (the returned select)
341
+ %1 = insertelement <4 x i64 > undef , i64 %a2 , i32 0
342
+ %2 = shufflevector <4 x i64 > %1 , <4 x i64 > undef , <4 x i32 > zeroinitializer
343
+ %3 = xor <4 x i64 > %1 , <i64 -1 , i64 undef , i64 undef , i64 undef >
344
+ %4 = shufflevector <4 x i64 > %3 , <4 x i64 > undef , <4 x i32 > zeroinitializer
345
+ %5 = and <4 x i64 > %a0 , %2
346
+ %6 = and <4 x i64 > %a1 , %4
347
+ %7 = or <4 x i64 > %5 , %6
348
+ ret <4 x i64 > %7
349
+ }
350
+
351
; Memory-operand variant of the broadcast bitwise select - the i64 mask is
; loaded from the pointer %p2 instead of being passed by value -
;   result = (%a0 & splat(mask)) | (%a1 & splat(~mask)), mask = load of %p2
; The complement is computed on the un-broadcast vector (lane 0 only, the other
; xor lanes are undef) and is then broadcast on its own.
; The ';'-prefixed lines inside the body are per-target FileCheck expectations
; for the generated x86 assembly. They look auto-generated (presumably by
; update_llc_test_checks.py - confirm against the file header), so regenerate
; them instead of editing them by hand.
+ define <4 x i64 > @bitselect_v4i64_broadcast_rrm (<4 x i64 > %a0 , <4 x i64 > %a1 , i64* %p2 ) {
352
+ ; SSE-LABEL: bitselect_v4i64_broadcast_rrm:
353
+ ; SSE: # %bb.0:
354
+ ; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
355
+ ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
356
+ ; SSE-NEXT: pcmpeqd %xmm6, %xmm6
357
+ ; SSE-NEXT: pxor %xmm4, %xmm6
358
+ ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,1]
359
+ ; SSE-NEXT: pand %xmm5, %xmm1
360
+ ; SSE-NEXT: pand %xmm5, %xmm0
361
+ ; SSE-NEXT: pand %xmm4, %xmm3
362
+ ; SSE-NEXT: por %xmm3, %xmm1
363
+ ; SSE-NEXT: pand %xmm4, %xmm2
364
+ ; SSE-NEXT: por %xmm2, %xmm0
365
+ ; SSE-NEXT: retq
366
+ ;
367
+ ; XOP-LABEL: bitselect_v4i64_broadcast_rrm:
368
+ ; XOP: # %bb.0:
369
+ ; XOP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
370
+ ; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
371
+ ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
372
+ ; XOP-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
373
+ ; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2
374
+ ; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
375
+ ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
376
+ ; XOP-NEXT: vandps %ymm3, %ymm0, %ymm0
377
+ ; XOP-NEXT: vandps %ymm2, %ymm1, %ymm1
378
+ ; XOP-NEXT: vorps %ymm1, %ymm0, %ymm0
379
+ ; XOP-NEXT: retq
380
+ ;
381
+ ; AVX1-LABEL: bitselect_v4i64_broadcast_rrm:
382
+ ; AVX1: # %bb.0:
383
+ ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
384
+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
385
+ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
386
+ ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
387
+ ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
388
+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
389
+ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
390
+ ; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
391
+ ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
392
+ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
393
+ ; AVX1-NEXT: retq
394
+ ;
395
+ ; AVX2-LABEL: bitselect_v4i64_broadcast_rrm:
396
+ ; AVX2: # %bb.0:
397
+ ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
398
+ ; AVX2-NEXT: vpbroadcastq %xmm2, %ymm3
399
+ ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
400
+ ; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2
401
+ ; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2
402
+ ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
403
+ ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
404
+ ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
405
+ ; AVX2-NEXT: retq
406
+ ;
407
+ ; AVX512F-LABEL: bitselect_v4i64_broadcast_rrm:
408
+ ; AVX512F: # %bb.0:
409
+ ; AVX512F-NEXT: movq (%rdi), %rax
410
+ ; AVX512F-NEXT: vmovq %rax, %xmm2
411
+ ; AVX512F-NEXT: vmovq %rax, %xmm3
412
+ ; AVX512F-NEXT: vpbroadcastq %xmm3, %ymm3
413
+ ; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
414
+ ; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
415
+ ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
416
+ ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
417
+ ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
418
+ ; AVX512F-NEXT: retq
; IR under test, value by value -
;   %a2 = the i64 mask loaded from %p2
;   %1 = %a2 placed in lane 0 of an otherwise undef vector
;   %2 = lane 0 of %1 copied to all four lanes (all-zero shuffle mask)
;   %3 = %1 with lane 0 bitwise-inverted (xor with -1), lanes 1-3 stay undef
;   %4 = lane 0 of %3 copied to all four lanes
;   %5 = %a0 & %2, %6 = %a1 & %4, %7 = %5 | %6 (the returned select)
419
+ %a2 = load i64 , i64* %p2
420
+ %1 = insertelement <4 x i64 > undef , i64 %a2 , i32 0
421
+ %2 = shufflevector <4 x i64 > %1 , <4 x i64 > undef , <4 x i32 > zeroinitializer
422
+ %3 = xor <4 x i64 > %1 , <i64 -1 , i64 undef , i64 undef , i64 undef >
423
+ %4 = shufflevector <4 x i64 > %3 , <4 x i64 > undef , <4 x i32 > zeroinitializer
424
+ %5 = and <4 x i64 > %a0 , %2
425
+ %6 = and <4 x i64 > %a1 , %4
426
+ %7 = or <4 x i64 > %5 , %6
427
+ ret <4 x i64 > %7
428
+ }
429
+
272
430
;
273
431
; 512-bit vectors
274
432
;
0 commit comments