
Commit 13d3fa8

[X86] Attempt to lower 512-bit shuffle(x,y) as concat(truncate(x),truncate(y))
Avoid lowering to costly VPERMV3 v32i16/v64i8 shuffles when we can instead perform the shuffle as a pair of (possibly offset) truncations. This exposed an existing issue: we weren't splitting AVX512 stores of concat(truncate(x),truncate(y)), which would let us use truncating stores (truncstores) instead.
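
For intuition, here is a minimal scalar model of the v32i16 case (plain C++ for illustration only; the function names are hypothetical and this is not LLVM code). Selecting every other i16 lane of concat(x,y), starting at offset 0 or 1, produces exactly the same elements as truncating each operand's i32 lanes (logically shifted right by 16 bits when the offset is 1) and concatenating the two half-width results:

#include <array>
#include <cstdint>

using V32I16 = std::array<uint16_t, 32>;

// Reference semantics of the matched mask <Ofs, Ofs+2, Ofs+4, ...> over
// the 64-element concatenation of X and Y.
V32I16 shuffleEveryOther(const V32I16 &X, const V32I16 &Y, unsigned Ofs) {
  V32I16 R{};
  for (unsigned I = 0; I != 16; ++I) {
    R[I] = X[Ofs + 2 * I];      // lower half of the result draws from X
    R[16 + I] = Y[Ofs + 2 * I]; // upper half draws from Y
  }
  return R;
}

// The same result via truncation: on little-endian x86, i32 lane J of X
// packs elements 2J (low half) and 2J+1 (high half), so a plain TRUNCATE
// keeps element 2J, while a 16-bit logical shift right (VSRLI) followed
// by TRUNCATE keeps element 2J+1.
V32I16 truncateAndConcat(const V32I16 &X, const V32I16 &Y, unsigned Ofs) {
  V32I16 R{};
  for (unsigned J = 0; J != 16; ++J) {
    uint32_t LX = uint32_t(X[2 * J]) | (uint32_t(X[2 * J + 1]) << 16);
    uint32_t LY = uint32_t(Y[2 * J]) | (uint32_t(Y[2 * J + 1]) << 16);
    R[J] = uint16_t(LX >> (16 * Ofs));      // truncate(X >> 16*Ofs)
    R[16 + J] = uint16_t(LY >> (16 * Ofs)); // truncate(Y >> 16*Ofs)
  }
  return R;
}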
Parent: 1b18ce5

6 files changed: +148 additions, -327 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 67 additions & 0 deletions
@@ -15952,6 +15952,54 @@ static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
                        DAG.getIntPtrConstant(0, DL));
 }
 
+// Match truncation of both 512-bit operands and concat results together.
+// TODO: Similar to lowerShuffleAsVTRUNC - merge or share matching code?
+static SDValue lowerShuffleAsVTRUNCAndConcat(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  assert(VT.is512BitVector() && VT.getScalarSizeInBits() < 64 &&
+         "Unexpected type!");
+  if (!Subtarget.hasAVX512())
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned DstSizeInBits = VT.getScalarSizeInBits();
+  unsigned SrcSizeInBits = DstSizeInBits * 2;
+
+  // TODO: Support non-BWI VPMOVWB truncations?
+  if (SrcSizeInBits < 32 && !Subtarget.hasBWI())
+    return SDValue();
+
+  // Match shuffle <Ofs,Ofs+2,Ofs+4,..>
+  // TODO: Handle general Scale factors with undef/zero upper elements.
+  for (unsigned Offset = 0; Offset != 2; ++Offset) {
+    if (!isSequentialOrUndefInRange(Mask, 0, NumElts, Offset, 2))
+      continue;
+
+    MVT DstVT = MVT::getVectorVT(VT.getScalarType(), NumElts / 2);
+    MVT SrcSVT = MVT::getIntegerVT(SrcSizeInBits);
+    MVT SrcVT = MVT::getVectorVT(SrcSVT, NumElts / 2);
+
+    V1 = DAG.getBitcast(SrcVT, V1);
+    V2 = DAG.getBitcast(SrcVT, V2);
+
+    if (Offset) {
+      V1 = DAG.getNode(
+          X86ISD::VSRLI, DL, SrcVT, V1,
+          DAG.getTargetConstant(Offset * DstSizeInBits, DL, MVT::i8));
+      V2 = DAG.getNode(
+          X86ISD::VSRLI, DL, SrcVT, V2,
+          DAG.getTargetConstant(Offset * DstSizeInBits, DL, MVT::i8));
+    }
+
+    V1 = DAG.getNode(ISD::TRUNCATE, DL, DstVT, V1);
+    V2 = DAG.getNode(ISD::TRUNCATE, DL, DstVT, V2);
+    return concatSubVectors(V1, V2, DAG, DL);
+  }
+
+  return SDValue();
+}
+
 // a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
 // b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
 // =>
@@ -17312,6 +17360,10 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                   Zeroable, Subtarget, DAG))
     return PSHUFB;
 
+  if (SDValue Trunc = lowerShuffleAsVTRUNCAndConcat(
+          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return Trunc;
+
   return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
 }
 
@@ -17367,6 +17419,10 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  Zeroable, Subtarget, DAG))
     return PSHUFB;
 
+  if (SDValue Trunc = lowerShuffleAsVTRUNCAndConcat(
+          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return Trunc;
+
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // results into the target lanes.
   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -52615,6 +52671,17 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
       return splitVectorStore(St, DAG);
   }
 
+  // Split a concatenation of truncations to fold to truncating stores.
+  if (VT.is512BitVector() && Subtarget.hasAVX512() && StVT == VT &&
+      StoredVal.hasOneUse()) {
+    SmallVector<SDValue> Ops;
+    if (collectConcatOps(StoredVal.getNode(), Ops, DAG) &&
+        all_of(Ops, [&](SDValue Op) {
+          return Op.getOpcode() == ISD::TRUNCATE && Op.hasOneUse();
+        }))
+      return splitVectorStore(St, DAG);
+  }
+
   // Split under-aligned vector non-temporal stores.
   if (St->isNonTemporal() && StVT == VT &&
       St->getAlign().value() < VT.getStoreSize()) {
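
The combineStore hunk above matters because each split half is then a plain store of a TRUNCATE node, which existing patterns fold to a truncating store (the vpmovdb-to-memory instructions visible in the test diffs that follow). As a hedged sketch of why the split preserves the stored bytes (again plain C++ with made-up names, not LLVM code):

#include <cstdint>

// Scalar analogue of a vpmovdw-style truncating store: 16 x i32 -> 16 x i16.
void truncStoreDW(const uint32_t (&Src)[16], uint16_t *Dst) {
  for (unsigned I = 0; I != 16; ++I)
    Dst[I] = uint16_t(Src[I]); // keep only the low 16 bits of each lane
}

// Storing concat(truncate(X), truncate(Y)) as one 64-byte block writes the
// same bytes as two truncating stores at byte offsets 0 and 32, which is
// the form the split store exposes.
void storeConcatOfTruncs(const uint32_t (&X)[16], const uint32_t (&Y)[16],
                         uint16_t *P) {
  truncStoreDW(X, P);      // bytes [0, 32): truncate(X)
  truncStoreDW(Y, P + 16); // bytes [32, 64): truncate(Y)
}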

llvm/test/CodeGen/X86/vec_smulo.ll

Lines changed: 4 additions & 4 deletions
@@ -2638,14 +2638,14 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k3} {z} = -1
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm6, 16(%rdi)
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm6 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm6, (%rdi)
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm4, 48(%rdi)
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm4, 32(%rdi)
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
-; AVX512F-NEXT:    vpmovdb %zmm4, 16(%rdi)
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero
-; AVX512F-NEXT:    vpmovdb %zmm4, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: smulo_v64i8:

llvm/test/CodeGen/X86/vec_umulo.ll

Lines changed: 4 additions & 4 deletions
@@ -2301,14 +2301,14 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k3} {z} = -1
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm6, 16(%rdi)
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm6 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero
+; AVX512F-NEXT:    vpmovdb %zmm6, (%rdi)
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm4, 48(%rdi)
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm4, 32(%rdi)
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
-; AVX512F-NEXT:    vpmovdb %zmm4, 16(%rdi)
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero
-; AVX512F-NEXT:    vpmovdb %zmm4, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: umulo_v64i8:
