
Commit d2057a8

[X86][AVX] Lower v16i8/v8i16 binary shuffles using VTRUNC/TRUNCATE
This patch adds lowerShuffleWithVTRUNC to handle basic binary shuffles that can be lowered either as a pure ISD::TRUNCATE or a X86ISD::VTRUNC (with undef/zero values in the remaining upper elements). We concat the binary sources together into a single 256-bit source vector. To avoid regressions we perform this after we've tried to lower with PACKS/PACKUS which typically does a cleaner job than a concat. For non-AVX512VL cases we have to canonicalize VTRUNC cases to use a 512-bit source vectors (inserting undefs/zeros in the upper elements as necessary), truncate and then (possibly) extract the 128-bit result. This should address the last regressions in D66004 Differential Revision: https://reviews.llvm.org/D86093
1 parent eaff200 commit d2057a8
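For context, the shuffle shape targeted by the new lowering is a strided pick of every Scale'th element from the two concatenated sources, with any leftover upper elements undef or zero. Below is a minimal standalone sketch of that mask check; isSequentialOrUndef is a simplified re-implementation of the idea behind LLVM's isSequentialOrUndefInRange helper, not code from this patch.

#include <cstdio>
#include <vector>

// Sentinel for an undef shuffle mask element (mirrors LLVM's SM_SentinelUndef).
constexpr int Undef = -1;

// Simplified re-implementation of the isSequentialOrUndefInRange() idea used
// by the patch: every mask element in [Pos, Pos+Size) must be undef or equal
// to Low + i * Step.
static bool isSequentialOrUndef(const std::vector<int> &Mask, unsigned Pos,
                                unsigned Size, int Low, int Step) {
  for (unsigned I = 0; I != Size; ++I) {
    int M = Mask[Pos + I];
    if (M != Undef && M != Low + int(I) * Step)
      return false;
  }
  return true;
}

int main() {
  // v16i8 result shuffled from two v16i8 sources (indices 16..31 refer to the
  // second source). Taking bytes 0,2,4,...,30 of the 32-byte concat is the
  // Scale=2 pattern the patch lowers to a v16i16 -> v16i8 truncate (VPMOVWB).
  std::vector<int> Mask = {0,  2,  4,  6,  8,  10, 12, 14,
                           16, 18, 20, 22, 24, 26, 28, 30};
  unsigned NumElts = 16, Scale = 2;
  unsigned NumSrcElts = 2 * (NumElts / Scale); // All 16 lanes are used here.
  bool Matches = isSequentialOrUndef(Mask, 0, NumSrcElts, 0, Scale);
  std::printf("matches truncate pattern: %s\n", Matches ? "yes" : "no");
  return 0;
}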

File tree: 5 files changed (+385, -309 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 84 additions & 10 deletions
@@ -11325,17 +11325,15 @@ static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
 //
 // But when avx512vl is available, one can just use a single vpmovdw
 // instruction.
-static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
-                                     MVT VT, SDValue V1, SDValue V2,
-                                     SelectionDAG &DAG,
-                                     const X86Subtarget &Subtarget) {
+// TODO: Merge with lowerShuffleAsVTRUNC.
+static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
+                                     SDValue V2, ArrayRef<int> Mask,
+                                     const X86Subtarget &Subtarget,
+                                     SelectionDAG &DAG) {
   assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
-
-  if (Mask.size() != VT.getVectorNumElements())
-    return SDValue();
-
   bool SwappedOps = false;
 
+  // TODO: Convert to use Zeroable bitmask.
   if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
     if (!ISD::isBuildVectorAllZeros(V1.getNode()))
       return SDValue();
@@ -11378,6 +11376,73 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
 }
 
+// Attempt to match binary shuffle patterns as a truncate.
+static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
+                                    SDValue V2, ArrayRef<int> Mask,
+                                    const APInt &Zeroable,
+                                    const X86Subtarget &Subtarget,
+                                    SelectionDAG &DAG) {
+  assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
+  if (!Subtarget.hasAVX512())
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  unsigned MaxScale = 64 / VT.getScalarSizeInBits();
+  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+    // TODO: Support non-BWI VPMOVWB truncations?
+    unsigned SrcEltBits = EltSizeInBits * Scale;
+    if (SrcEltBits < 32 && !Subtarget.hasBWI())
+      continue;
+
+    // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
+    // Bail if the V2 elements are undef.
+    unsigned NumHalfSrcElts = NumElts / Scale;
+    unsigned NumSrcElts = 2 * NumHalfSrcElts;
+    if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
+        isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
+      continue;
+
+    // The elements beyond the truncation must be undef/zero.
+    unsigned UpperElts = NumElts - NumSrcElts;
+    if (UpperElts > 0 &&
+        !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+      continue;
+
+    // As we're using both sources then we need to concat them together
+    // and truncate from the 256-bit src.
+    MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
+    SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
+
+    MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+    MVT SrcVT = MVT::getVectorVT(SrcSVT, 256 / SrcEltBits);
+    Src = DAG.getBitcast(SrcVT, Src);
+
+    if (SrcVT.getVectorNumElements() == NumElts)
+      return DAG.getNode(ISD::TRUNCATE, DL, VT, Src);
+
+    if (!Subtarget.hasVLX()) {
+      // Non-VLX targets must truncate from a 512-bit type, so we need to
+      // widen, truncate and then possibly extract the original 128-bit
+      // vector.
+      bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
+      Src = widenSubVector(Src, !UndefUppers, Subtarget, DAG, DL, 512);
+      unsigned NumWideSrcElts = Src.getValueType().getVectorNumElements();
+      if (NumWideSrcElts >= NumElts) {
+        // Widening means we can now use a regular TRUNCATE.
+        MVT WideVT = MVT::getVectorVT(VT.getScalarType(), NumWideSrcElts);
+        SDValue WideRes = DAG.getNode(ISD::TRUNCATE, DL, WideVT, Src);
+        if (!WideVT.is128BitVector())
+          WideRes = extract128BitVector(WideRes, 0, DAG, DL);
+        return WideRes;
+      }
+    }
+    return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
+  }
+
+  return SDValue();
+}
+
 /// Check whether a compaction lowering can be done by dropping even
 /// elements and compute how many times even elements must be dropped.
 ///
@@ -14733,7 +14798,7 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // Try to use lower using a truncation.
   if (SDValue V =
-          lowerShuffleWithVPMOV(DL, Mask, MVT::v8i16, V1, V2, DAG, Subtarget))
+          lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
     return V;
 
   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
@@ -14816,6 +14881,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                            Subtarget))
     return V;
 
+  // Try to use lower using a truncation.
+  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
+                                       Subtarget, DAG))
+    return V;
+
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
                                                 Subtarget, DAG))
@@ -14922,7 +14992,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // Try to use lower using a truncation.
   if (SDValue V =
-          lowerShuffleWithVPMOV(DL, Mask, MVT::v16i8, V1, V2, DAG, Subtarget))
+          lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
+  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
+                                       Subtarget, DAG))
     return V;
 
   // See if we can use SSE4A Extraction / Insertion.
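As a sanity check on the concat + bitcast + truncate sequence used by lowerShuffleAsVTRUNC above, here is a small hedged scalar simulation (my own illustration, not LLVM code; assumes a little-endian host): viewing two concatenated v8i16 sources as eight i32 lanes and truncating each lane back to i16 keeps exactly the even-indexed i16 lanes, i.e. mask <0,2,4,...,14>, which is the job a single VPMOVDW does.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // Two v8i16 sources, "concatenated" into one 16 x i16 buffer (256 bits).
  uint16_t V1[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  uint16_t V2[8] = {8, 9, 10, 11, 12, 13, 14, 15};
  uint16_t Concat[16];
  std::memcpy(Concat, V1, sizeof(V1));
  std::memcpy(Concat + 8, V2, sizeof(V2));

  // Bitcast the 256-bit buffer to 8 x i32 (little-endian host assumed).
  uint32_t Src[8];
  std::memcpy(Src, Concat, sizeof(Concat));

  // Truncating each i32 lane to i16 keeps its low half, i.e. the
  // even-numbered i16 lanes of the concat: mask <0,2,4,...,14>.
  uint16_t Result[8];
  for (int I = 0; I != 8; ++I)
    Result[I] = static_cast<uint16_t>(Src[I]);

  for (int I = 0; I != 8; ++I)
    std::printf("%u ", Result[I]); // prints: 0 2 4 6 8 10 12 14
  std::printf("\n");
  return 0;
}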

llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll

Lines changed: 56 additions & 90 deletions
@@ -42,11 +42,10 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand 16(%rdi), %xmm0, %xmm1
-; AVX512BW-NEXT: vpand (%rdi), %xmm0, %xmm0
-; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
@@ -143,11 +142,10 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 ;
 ; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
-; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
@@ -159,11 +157,10 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 ;
 ; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
-; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
@@ -377,54 +374,42 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vpmovdb %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vpmovdb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovdb %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VBMIVL-NEXT: vpmovdb %xmm1, %xmm1
-; AVX512VBMIVL-NEXT: vpmovdb %xmm0, %xmm0
-; AVX512VBMIVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
 ; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT: vzeroupper
 ; AVX512VBMIVL-NEXT: retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -1081,49 +1066,42 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 ;
 ; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
 ; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
 ; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u>
-; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
 ; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u>
-; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT: vzeroupper
 ; AVX512VBMIVL-NEXT: retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -1199,54 +1177,42 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
 ; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vpmovqb %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovqb %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
 ; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vpmovqb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovqb %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 ;
 ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
 ; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VBMIVL-NEXT: vpmovqb %xmm1, %xmm1
-; AVX512VBMIVL-NEXT: vpmovqb %xmm0, %xmm0
-; AVX512VBMIVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
 ; AVX512VBMIVL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT: vzeroupper
 ; AVX512VBMIVL-NEXT: retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
