
[X86] Attempt to lower 512-bit shuffle(x,y) as concat(truncate(x),truncate(y)) #117579

Closed · wants to merge 1 commit
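In the pattern this patch targets, a 512-bit shuffle that takes every second element of both operands is equivalent to truncating each operand and concatenating the two results: on a little-endian target, the low half of each double-width source element is exactly an even destination element (and the high half, after a shift, an odd one). A minimal IR sketch of the even-element case for v32i16 — the function and its name are illustrative, not taken from the PR or its tests:

```llvm
; Even elements of two v32i16 operands: mask <0,2,4,...,62>.
; Reinterpreting each operand as <16 x i32> and truncating to <16 x i16>
; keeps the low i16 of every i32, i.e. exactly the even elements, so this
; shuffle can lower as two truncations plus a subvector concatenation.
define <32 x i16> @even_elts(<32 x i16> %x, <32 x i16> %y) {
  %r = shufflevector <32 x i16> %x, <32 x i16> %y,
       <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12,
                   i32 14, i32 16, i32 18, i32 20, i32 22, i32 24,
                   i32 26, i32 28, i32 30, i32 32, i32 34, i32 36,
                   i32 38, i32 40, i32 42, i32 44, i32 46, i32 48,
                   i32 50, i32 52, i32 54, i32 56, i32 58, i32 60,
                   i32 62>
  ret <32 x i16> %r
}
```

With AVX512 this can lower as two vpmovdw zmm-to-ymm truncations plus a vinserti64x4 concatenation, rather than falling through to a variable cross-lane permute (lowerShuffleWithPERMV). The odd-element case first shifts each wide lane right by the narrow element width (the X86ISD::VSRLI in the code below) so the same truncation applies.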
llvm/lib/Target/X86/X86ISelLowering.cpp: 67 additions, 0 deletions
@@ -15952,6 +15952,54 @@ static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
                      DAG.getIntPtrConstant(0, DL));
 }
 
+// Match truncation of both 512-bit operands and concat results together.
+// TODO: Similar to lowerShuffleAsVTRUNC - merge or share matching code?
+static SDValue lowerShuffleAsVTRUNCAndConcat(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  assert(VT.is512BitVector() && VT.getScalarSizeInBits() < 64 &&
+         "Unexpected type!");
+  if (!Subtarget.hasAVX512())
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned DstSizeInBits = VT.getScalarSizeInBits();
+  unsigned SrcSizeInBits = DstSizeInBits * 2;
+
+  // TODO: Support non-BWI VPMOVWB truncations?
+  if (SrcSizeInBits < 32 && !Subtarget.hasBWI())
+    return SDValue();
+
+  // Match shuffle <Ofs,Ofs+2,Ofs+4,..>
+  // TODO: Handle general Scale factors with undef/zero upper elements.
+  for (unsigned Offset = 0; Offset != 2; ++Offset) {
+    if (!isSequentialOrUndefInRange(Mask, 0, NumElts, Offset, 2))
+      continue;
+
+    MVT DstVT = MVT::getVectorVT(VT.getScalarType(), NumElts / 2);
+    MVT SrcSVT = MVT::getIntegerVT(SrcSizeInBits);
+    MVT SrcVT = MVT::getVectorVT(SrcSVT, NumElts / 2);
+
+    V1 = DAG.getBitcast(SrcVT, V1);
+    V2 = DAG.getBitcast(SrcVT, V2);
+
+    if (Offset) {
+      V1 = DAG.getNode(
+          X86ISD::VSRLI, DL, SrcVT, V1,
+          DAG.getTargetConstant(Offset * DstSizeInBits, DL, MVT::i8));
+      V2 = DAG.getNode(
+          X86ISD::VSRLI, DL, SrcVT, V2,
+          DAG.getTargetConstant(Offset * DstSizeInBits, DL, MVT::i8));
+    }
+
+    V1 = DAG.getNode(ISD::TRUNCATE, DL, DstVT, V1);
+    V2 = DAG.getNode(ISD::TRUNCATE, DL, DstVT, V2);
+    return concatSubVectors(V1, V2, DAG, DL);
+  }
+
+  return SDValue();
+}
+
 // a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
 // b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
 // =>
@@ -17312,6 +17360,10 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                               Zeroable, Subtarget, DAG))
     return PSHUFB;
 
+  if (SDValue Trunc = lowerShuffleAsVTRUNCAndConcat(
+          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return Trunc;
+
   return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
 }

@@ -17367,6 +17419,10 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                               Zeroable, Subtarget, DAG))
     return PSHUFB;
 
+  if (SDValue Trunc = lowerShuffleAsVTRUNCAndConcat(
+          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return Trunc;
+
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -52615,6 +52671,17 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
     return splitVectorStore(St, DAG);
   }
 
+  // Split a concatenation of truncations to fold to truncating stores.
+  if (VT.is512BitVector() && Subtarget.hasAVX512() && StVT == VT &&
+      StoredVal.hasOneUse()) {
+    SmallVector<SDValue> Ops;
+    if (collectConcatOps(StoredVal.getNode(), Ops, DAG) &&
+        all_of(Ops, [&](SDValue Op) {
+          return Op.getOpcode() == ISD::TRUNCATE && Op.hasOneUse();
+        }))
+      return splitVectorStore(St, DAG);
+  }
+
   // Split under-aligned vector non-temporal stores.
   if (St->isNonTemporal() && StVT == VT &&
       St->getAlign().value() < VT.getStoreSize()) {
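The combineStore change above splits a 512-bit store whose stored value is a concatenation of single-use truncations, so that each half can become a truncating store on its own. A hypothetical IR shape it is aimed at — the function, its name, and the element types are illustrative, not from the PR:

```llvm
; A 512-bit store of concat(trunc(x), trunc(y)). After splitting, each
; <16 x i16> half is a truncate feeding its own 256-bit store, which X86
; can emit as a truncating store (e.g. vpmovdw with a memory destination).
define void @store_trunc_concat(<16 x i32> %x, <16 x i32> %y, ptr %p) {
  %tx = trunc <16 x i32> %x to <16 x i16>
  %ty = trunc <16 x i32> %y to <16 x i16>
  %cat = shufflevector <16 x i16> %tx, <16 x i16> %ty,
         <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
                     i32 7, i32 8, i32 9, i32 10, i32 11, i32 12,
                     i32 13, i32 14, i32 15, i32 16, i32 17, i32 18,
                     i32 19, i32 20, i32 21, i32 22, i32 23, i32 24,
                     i32 25, i32 26, i32 27, i32 28, i32 29, i32 30,
                     i32 31>
  store <32 x i16> %cat, ptr %p
  ret void
}
```

The rescheduled vpmovdb truncating stores in the vec_smulo.ll and vec_umulo.ll diffs below show the net effect of the patch on existing codegen.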
llvm/test/CodeGen/X86/vec_smulo.ll: 4 additions, 4 deletions
@@ -2638,14 +2638,14 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k3} {z} = -1
 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
-; AVX512F-NEXT: vpmovdb %zmm6, 16(%rdi)
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero
-; AVX512F-NEXT: vpmovdb %zmm6, (%rdi)
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512F-NEXT: vpmovdb %zmm4, 48(%rdi)
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
 ; AVX512F-NEXT: vpmovdb %zmm4, 32(%rdi)
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
+; AVX512F-NEXT: vpmovdb %zmm4, 16(%rdi)
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero
+; AVX512F-NEXT: vpmovdb %zmm4, (%rdi)
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: smulo_v64i8:
llvm/test/CodeGen/X86/vec_umulo.ll: 4 additions, 4 deletions
@@ -2301,14 +2301,14 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k3} {z} = -1
 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
-; AVX512F-NEXT: vpmovdb %zmm6, 16(%rdi)
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero
-; AVX512F-NEXT: vpmovdb %zmm6, (%rdi)
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512F-NEXT: vpmovdb %zmm4, 48(%rdi)
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
 ; AVX512F-NEXT: vpmovdb %zmm4, 32(%rdi)
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
+; AVX512F-NEXT: vpmovdb %zmm4, 16(%rdi)
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero
+; AVX512F-NEXT: vpmovdb %zmm4, (%rdi)
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: umulo_v64i8: