
Commit 3daf2fb

[X86] combineX86ShuffleChainWithExtract - refactor to remove need to widen all vectors
combineX86ShuffleChain no longer needs the candidate shuffle inputs to all have been widened to the root size, but combineX86ShuffleChainWithExtract still assumes that they have.

This refactor peeks through every input's extract_subvector nodes to find the widest legal vector type and widens the shuffle mask accordingly; it no longer bails if any of the inputs can't be widened to the new width.

combineX86ShuffleChainWithExtract now follows a similar process to combineX86ShufflesRecursively, so I think we should eventually be able to get rid of it and just let combineX86ShufflesRecursively handle widening the shuffle width as well.

Another step toward removing subvector widening for Issue llvm#45319.
1 parent 9bd58f6 commit 3daf2fb

8 files changed: +1968, -1986 lines changed
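As a rough illustration of the mask widening the commit message describes, here is a small standalone C++ sketch (made-up sizes, not LLVM code): it rebases a 2-element root mask onto a source that is Scale = 4 times wider and pads the tail with undef sentinels, mirroring the new remap M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts).

// Standalone illustration (not LLVM code) of the widened-mask remap.
// Hypothetical sizes: 128-bit root, 512-bit widest source => Scale = 4,
// with a 2-element root mask drawing from two inputs.
#include <cstdio>
#include <vector>

int main() {
  const unsigned NumMaskElts = 2, Scale = 4;
  // Root mask: element 0 of input 0, element 1 of input 1 (encoded as 3).
  std::vector<int> WideMask = {0, 3};

  // Rebase so input I's elements begin at I * Scale * NumMaskElts, matching
  // M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts).
  for (int &M : WideMask)
    if (M >= 0)
      M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);

  // Pad with undef sentinels so the mask spans the widened vector width.
  WideMask.insert(WideMask.end(), (Scale - 1) * NumMaskElts, /*undef*/ -1);

  for (int M : WideMask)
    printf("%d ", M); // prints: 0 9 -1 -1 -1 -1 -1 -1
  printf("\n");
}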

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 68 additions & 49 deletions
@@ -39733,84 +39733,102 @@ static SDValue combineX86ShuffleChainWithExtract(
 
   EVT RootVT = Root.getValueType();
   unsigned RootSizeInBits = RootVT.getSizeInBits();
+  unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
   assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
 
-  // Bail if we have any smaller inputs.
-  if (llvm::any_of(Inputs, [RootSizeInBits](SDValue Input) {
-        return Input.getValueSizeInBits() < RootSizeInBits;
-      }))
-    return SDValue();
-
-  SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
-  SmallVector<unsigned, 4> Offsets(NumInputs, 0);
-
-  // Peek through subvectors.
-  // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
+  // Peek through extract_subvector to find widest legal vector.
+  // TODO: Handle ISD::TRUNCATE
   unsigned WideSizeInBits = RootSizeInBits;
-  for (unsigned i = 0; i != NumInputs; ++i) {
-    SDValue &Src = WideInputs[i];
-    unsigned &Offset = Offsets[i];
-    Src = peekThroughBitcasts(Src);
-    EVT BaseVT = Src.getValueType();
-    while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-      Offset += Src.getConstantOperandVal(1);
-      Src = Src.getOperand(0);
-    }
-    WideSizeInBits = std::max(WideSizeInBits,
-                              (unsigned)Src.getValueSizeInBits());
-    assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
-           "Unexpected subvector extraction");
-    Offset /= BaseVT.getVectorNumElements();
-    Offset *= NumMaskElts;
+  for (unsigned I = 0; I != NumInputs; ++I) {
+    SDValue Input = peekThroughBitcasts(Inputs[I]);
+    while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+      Input = peekThroughBitcasts(Input.getOperand(0));
+    if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
+        WideSizeInBits < Input.getValueSizeInBits())
+      WideSizeInBits = Input.getValueSizeInBits();
   }
 
-  // Bail if we're always extracting from the lowest subvectors,
-  // combineX86ShuffleChain should match this for the current width.
-  if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
-    return SDValue();
-
+  // Bail if we fail to find a source larger than the existing root.
   unsigned Scale = WideSizeInBits / RootSizeInBits;
-  assert((WideSizeInBits % RootSizeInBits) == 0 &&
-         "Unexpected subvector extraction");
-
-  // If the src vector types aren't the same, see if we can extend
-  // them to match each other.
-  // TODO: Support different scalar types?
-  EVT WideSVT = WideInputs[0].getValueType().getScalarType();
-  if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
-        return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
-               Op.getValueType().getScalarType() != WideSVT;
-      }))
+  if (WideSizeInBits <= RootSizeInBits ||
+      (WideSizeInBits % RootSizeInBits) != 0)
     return SDValue();
 
   // Create new mask for larger type.
-  for (unsigned i = 1; i != NumInputs; ++i)
-    Offsets[i] += i * Scale * NumMaskElts;
-
   SmallVector<int, 64> WideMask(BaseMask);
   for (int &M : WideMask) {
     if (M < 0)
       continue;
-    M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
+    M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
   }
   WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
 
+  // Attempt to peek through inputs and adjust mask when we extract from an
+  // upper subvector.
+  int AdjustedMasks = 0;
+  SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
+  for (unsigned I = 0; I != NumInputs; ++I) {
+    SDValue &Input = WideInputs[I];
+    Input = peekThroughBitcasts(Input);
+    while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+           Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
+      uint64_t Idx = Input.getConstantOperandVal(1);
+      if (Idx != 0) {
+        ++AdjustedMasks;
+        unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
+        Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
+
+        int lo = I * WideMask.size();
+        int hi = (I + 1) * WideMask.size();
+        for (int &M : WideMask)
+          if (lo <= M && M < hi)
+            M += Idx;
+      }
+      Input = peekThroughBitcasts(Input.getOperand(0));
+    }
+  }
+
   // Remove unused/repeated shuffle source ops.
   resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
   assert(!WideInputs.empty() && "Shuffle with no inputs detected");
 
-  if (WideInputs.size() > 2)
-    return SDValue();
+  // Bail if we're always extracting from the lowest subvectors,
+  // combineX86ShuffleChain should match this for the current width, or the
+  // shuffle still references too many inputs.
+  if (AdjustedMasks == 0 || WideInputs.size() > 2)
+    return SDValue();
+
+  // Minor canonicalization of the accumulated shuffle mask to make it easier
+  // to match below. All this does is detect masks with sequential pairs of
+  // elements, and shrink them to the half-width mask. It does this in a loop
+  // so it will reduce the size of the mask to the minimal width mask which
+  // performs an equivalent shuffle.
+  while (WideMask.size() > 1) {
+    SmallVector<int, 64> WidenedMask;
+    if (!canWidenShuffleElements(WideMask, WidenedMask))
+      break;
+    WideMask = std::move(WidenedMask);
+  }
+
+  // Canonicalization of binary shuffle masks to improve pattern matching by
+  // commuting the inputs.
+  if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
+    ShuffleVectorSDNode::commuteMask(WideMask);
+    std::swap(WideInputs[0], WideInputs[1]);
+  }
 
   // Increase depth for every upper subvector we've peeked through.
-  Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
+  Depth += AdjustedMasks;
 
   // Attempt to combine wider chain.
   // TODO: Can we use a better Root?
   SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
                              WideInputs.back().getValueSizeInBits()
                          ? WideInputs.front()
                          : WideInputs.back();
+  assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
+         "WideRootSize mismatch");
+
   if (SDValue WideShuffle =
           combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
                                  HasVariableMask, AllowVariableCrossLaneMask,
@@ -39819,6 +39837,7 @@ static SDValue combineX86ShuffleChainWithExtract(
         extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
     return DAG.getBitcast(RootVT, WideShuffle);
   }
+
   return SDValue();
 }
 
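To make the new upper-subvector adjustment concrete, the following standalone sketch (hypothetical values, not LLVM code) walks through the Idx rescaling above: an extract_subvector index counted in input elements is converted into root-element units and then folded into every widened-mask entry owned by that input.

// Standalone illustration (not LLVM code) of the upper-subvector adjustment.
// Hypothetical scenario: the root shuffles i16 elements (RootEltSizeInBits =
// 16) and input 0 is extract_subvector(src, 4) of i32 elements, i.e. a
// subvector taken from partway up the wide source.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const unsigned RootEltSizeInBits = 16;  // root mask is in i16 units
  const unsigned InputEltSizeInBits = 32; // peeked-through input is i32
  uint64_t Idx = 4; // extract_subvector start index, in i32 units

  // Rescale the subvector index into root-element units, as in:
  //   Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
  Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits; // 4 * 32 / 16 = 8

  // Shift every mask element that refers to input 0 up by Idx, so the mask
  // indexes directly into the wide source instead of the extracted subvector.
  std::vector<int> WideMask = {0, 3, -1, -1};
  const int lo = 0, hi = static_cast<int>(WideMask.size()); // input 0: [lo, hi)
  for (int &M : WideMask)
    if (lo <= M && M < hi)
      M += static_cast<int>(Idx);

  for (int M : WideMask)
    printf("%d ", M); // prints: 8 11 -1 -1
  printf("\n");
}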

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 11 additions & 21 deletions
@@ -750,15 +750,9 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,9,0,11,4,5,6,7]
-; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX512BW-SLOW-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX512BW-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
-; AVX512BW-SLOW-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
+; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT:    vzeroupper
@@ -767,16 +761,13 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,9,0,3,4,5,6,7]
 ; AVX512BW-FAST-NEXT:    vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7]
 ; AVX512BW-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX512BW-FAST-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm0
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7]
-; AVX512BW-FAST-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX512BW-FAST-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT:    vzeroupper
@@ -877,8 +868,8 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
+; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
 ; AVX512BW-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
@@ -890,8 +881,8 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
+; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT:    vpermw %zmm0, %zmm1, %zmm1
 ; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
@@ -2040,11 +2031,10 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.
 ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,9,0,11,0,13,0,15]
-; AVX512BW-FAST-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-FAST-NEXT:    vpermt2d %zmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT:    vzeroupper
 ; AVX512BW-FAST-NEXT:    retq

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 4 additions & 15 deletions
@@ -643,30 +643,19 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ;
 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,9,0,11,4,5,6,7]
+; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
 ; AVX512BW-SLOW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    movl (%rdi), %eax
-; AVX512BW-SLOW-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX512BW-SLOW-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-SLOW-NEXT:    vzeroupper
 ; AVX512BW-SLOW-NEXT:    retq
 ;
 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,9,0,3,4,5,6,7]
+; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
 ; AVX512BW-FAST-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512BW-FAST-NEXT:    movl (%rdi), %eax
-; AVX512BW-FAST-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX512BW-FAST-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX512BW-FAST-NEXT:    vpinsrw $6, (%rdi), %xmm0, %xmm0
+; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-FAST-NEXT:    vzeroupper

llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll

Lines changed: 45 additions & 23 deletions
@@ -2691,26 +2691,40 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64>
 }
 
 define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
-; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [4,1]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovaps {{.*#+}} xmm0 = [4,1]
+; CHECK-FAST-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovaps 32(%rdi), %xmm0
+; CHECK-FAST-PERLANE-NEXT:    vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
   ret <2 x i64> %res
 }
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,1]
-; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm2
-; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,1]
+; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm2
+; CHECK-FAST-NEXT:    vptestnmq %xmm1, %xmm1, %k1
+; CHECK-FAST-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT:    vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
+; CHECK-FAST-PERLANE-NEXT:    vptestnmq %xmm1, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
   %cmp = icmp eq <2 x i64> %mask, zeroinitializer
@@ -2719,14 +2733,22 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %
 }
 
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,1]
-; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,1]
+; CHECK-FAST-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT:    vzeroupper
+; CHECK-FAST-NEXT:    retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK-FAST-PERLANE:       # %bb.0:
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT:    vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
+; CHECK-FAST-PERLANE-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; CHECK-FAST-PERLANE-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
   %cmp = icmp eq <2 x i64> %mask, zeroinitializer