Commit 307890f
[X86] Freeze vXi8 shl(x,1) -> add(x,x) vector fold (PR50468)
We don't have any vXi8 shift instructions (other than on XOP which is handled separately), so replace the shl(x,1) -> add(x,x) fold with shl(x,1) -> add(freeze(x),freeze(x)) to avoid the undef issues identified in PR50468.

Split off from D106675 as I'm still looking at whether we can fix the vXi16/i32/i64 issues with the D106679 alternative.

Differential Revision: https://reviews.llvm.org/D108139
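As a rough illustration (not part of the commit; the function name and the short <2 x i8> type are hypothetical), the new fold behaves as if the IR were rewritten like this:

    define <2 x i8> @shl_by_one(<2 x i8> %x) {
      ; Old fold: lower the shift directly as %r = add <2 x i8> %x, %x.
      ; New fold: freeze the input first, so both add operands are
      ; guaranteed to observe the same value even when %x is undef.
      %f = freeze <2 x i8> %x
      %r = add <2 x i8> %f, %f
      ret <2 x i8> %r
    }

If %x is undef, (shl %x, 1) must still have a zero LSB in every lane; freezing pins each lane to one arbitrary but fixed value, and doubling a fixed value is always even.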
1 parent a643bd3 commit 307890f

14 files changed: +476 -469 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 1 deletion
@@ -28731,8 +28731,15 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
       MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
 
       // Simple i8 add case
-      if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
+      if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
+        // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
+        // must be 0). (add undef, undef) however can be any value. To make this
+        // safe, we must freeze R to ensure that register allocation uses the same
+        // register for an undefined value. This ensures that the result will
+        // still be even and preserves the original semantics.
+        R = DAG.getNode(ISD::FREEZE, dl, VT, R);
         return DAG.getNode(ISD::ADD, dl, VT, R, R);
+      }
 
       // ashr(R, 7) === cmp_slt(R, 0)
       if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
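To spell out the comment above, here is a hand-written scalar sketch (not from the commit) of the two behaviours being distinguished:

    ; Unsafe: each use of undef is independent, so %a may be any value,
    ; including an odd one, while (shl undef, 1) must always be even.
    %a = add i8 undef, undef

    ; Safe: freeze yields one arbitrary-but-fixed value, so %b is that
    ; value doubled and therefore always even.
    %f = freeze i8 undef
    %b = add i8 %f, %f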

llvm/test/CodeGen/X86/bitreverse.ll

Lines changed: 3 additions & 3 deletions
@@ -69,11 +69,11 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-NEXT: psrlw $2, %xmm0
 ; X64-NEXT: por %xmm1, %xmm0
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
 ; X64-NEXT: pand %xmm0, %xmm1
-; X64-NEXT: paddb %xmm1, %xmm1
+; X64-NEXT: psrlw $1, %xmm1
 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT: psrlw $1, %xmm0
+; X64-NEXT: paddb %xmm0, %xmm0
 ; X64-NEXT: por %xmm1, %xmm0
 ; X64-NEXT: retq
 ;

llvm/test/CodeGen/X86/combine-bitreverse.ll

Lines changed: 3 additions & 3 deletions
@@ -61,11 +61,11 @@ define <4 x i32> @test_demandedbits_bitreverse(<4 x i32> %a0) nounwind {
 ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT: psrlw $2, %xmm0
 ; X86-NEXT: por %xmm1, %xmm0
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
 ; X86-NEXT: pand %xmm0, %xmm1
-; X86-NEXT: paddb %xmm1, %xmm1
+; X86-NEXT: psrlw $1, %xmm1
 ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: psrlw $1, %xmm0
+; X86-NEXT: paddb %xmm0, %xmm0
 ; X86-NEXT: por %xmm1, %xmm0
 ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT: retl
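The churn in these bitreverse tests is confined to the adjacent-bit-swap step, which exchanges each pair of neighbouring bits. A hand-written scalar sketch of that step (for illustration only):

    define i8 @swap_adjacent_bits(i8 %x) {
      %even = and i8 %x, 85       ; 0x55: bits in even positions
      %evenhi = shl i8 %even, 1   ; x86 has no vXi8 shl, so this uses paddb
      %odd = and i8 %x, 170       ; 0xAA: bits in odd positions
      %oddlo = lshr i8 %odd, 1    ; uses psrlw $1; masking first keeps
                                  ; bits from crossing byte lanes
      %r = or i8 %evenhi, %oddlo
      ret i8 %r
    }

The updated checks compute the same swap; the freeze merely changes which half lands in which temporary register, so the 0x55 constant becomes 0xAA and the paddb/psrlw pair swaps places.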

llvm/test/CodeGen/X86/vector-bitreverse.ll

Lines changed: 76 additions & 76 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/vector-fshl-rot-128.ll

Lines changed: 13 additions & 13 deletions
@@ -517,10 +517,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
 ; SSE2-NEXT: pandn %xmm3, %xmm2
 ; SSE2-NEXT: por %xmm4, %xmm2
 ; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: paddb %xmm2, %xmm3
+; SSE2-NEXT: psrlw $7, %xmm3
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlw $7, %xmm4
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT: paddb %xmm2, %xmm4
 ; SSE2-NEXT: por %xmm3, %xmm4
 ; SSE2-NEXT: paddb %xmm1, %xmm1
 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
@@ -553,10 +553,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: psrlw $7, %xmm0
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $7, %xmm3
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT: paddb %xmm1, %xmm3
 ; SSE41-NEXT: por %xmm0, %xmm3
 ; SSE41-NEXT: paddb %xmm2, %xmm2
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -580,10 +580,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpsrlw $7, %xmm0, %xmm2
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: retq
@@ -728,10 +728,10 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2
 ; X86-SSE2-NEXT: por %xmm4, %xmm2
 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT: paddb %xmm2, %xmm3
+; X86-SSE2-NEXT: psrlw $7, %xmm3
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT: psrlw $7, %xmm4
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
+; X86-SSE2-NEXT: paddb %xmm2, %xmm4
 ; X86-SSE2-NEXT: por %xmm3, %xmm4
 ; X86-SSE2-NEXT: paddb %xmm1, %xmm1
 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0
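The rotate tests above follow the same theme: the step being reordered is the conditional rotate-by-one, conceptually per byte element (hand-written sketch, not from the commit):

    define i8 @rotl1(i8 %x) {
      %hi = shl i8 %x, 1     ; lowered as paddb (no vXi8 shifts on x86)
      %lo = lshr i8 %x, 7    ; lowered as psrlw $7 plus a byte mask
      %r = or i8 %hi, %lo
      ret i8 %r
    }

As in the bitreverse tests, freezing the add operand only changes which half is computed into which temporary register; the instruction mix stays the same.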

llvm/test/CodeGen/X86/vector-fshl-rot-256.ll

Lines changed: 8 additions & 8 deletions
@@ -443,10 +443,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: retq
@@ -467,10 +467,10 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT: retq

llvm/test/CodeGen/X86/vector-fshl-rot-512.ll

Lines changed: 96 additions & 96 deletions
@@ -272,154 +272,154 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
 ;
 ; AVX512BW-LABEL: var_funnnel_v64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm4
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vpmovb2m %zmm2, %k2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
-; AVX512BW-NEXT: vpsrlw $2, %zmm2, %zmm5
-; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm5
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k2
+; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
+; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm5
 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
 ; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
-; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT: vpsubb %zmm1, %zmm4, %zmm1
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
+; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
-; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm1
+; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: var_funnnel_v64i8:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k2
-; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
-; AVX512VLBW-NEXT: vpsrlw $2, %zmm2, %zmm5
-; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
-; AVX512VLBW-NEXT: vpsrlw $1, %zmm2, %zmm5
+; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k2
+; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
+; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm5
 ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
+; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
 ; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm4, %zmm1
+; AVX512VLBW-NEXT: vpandq %zmm2, %zmm1, %zmm1
 ; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm3
-; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k1
+; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
+; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
-; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1
 ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
-; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm1
+; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1
 ; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm1
-; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1
+; AVX512VLBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
 ; AVX512VLBW-NEXT: retq
 ;
 ; AVX512VBMI2-LABEL: var_funnnel_v64i8:
 ; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2
-; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
-; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm4
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k2
-; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
-; AVX512VBMI2-NEXT: vpsrlw $2, %zmm2, %zmm5
-; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
-; AVX512VBMI2-NEXT: vpsrlw $1, %zmm2, %zmm5
+; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k2
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
 ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
 ; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm4, %zmm1
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
 ; AVX512VBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm3
-; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
+; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
 ; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k2
-; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
 ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
-; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw $2, %zmm0, %zmm1
 ; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm1
-; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
-; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
 ; AVX512VBMI2-NEXT: retq
 ;
 ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
 ; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm4
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k2
-; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
-; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm2, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
-; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm2, %zmm5
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k2
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
 ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
 ; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm4, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
 ; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm3
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k2
-; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
 ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
-; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm0, %zmm1
 ; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
-; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm1
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
-; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VLVBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
 ; AVX512VLVBMI2-NEXT: retq
   %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
   ret <64 x i8> %res
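For context, the intrinsic exercised here is a funnel shift with both value operands equal, which is exactly a rotate; the lowering builds a variable left-shift half and a variable right-shift half and ORs them together. A hand-written sketch of the scalar equivalence (not from the commit):

    declare i8 @llvm.fshl.i8(i8, i8, i8)

    define i8 @rotl(i8 %x, i8 %amt) {
      ; fshl(x, x, amt) == rotl(x, amt % 8)
      %r = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 %amt)
      ret i8 %r
    }

The AVX512 diffs above reorder those two halves (the left-shift half is now computed first, into zmm3), which is why nearly every register number changes while the instruction count stays the same.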
