Skip to content

Commit ae1bb7c

Browse files
committed
[X86] Fold VPERMV(MASK,CONCAT(LO,HI)) -> VPERMV3(WIDEN(LO),MASK',WIDEN(HI))
If the VPERMV node is shuffling a source that is concatenated from separate subvectors, attempt to shuffle from the separate subvectors directly using an equivalent VPERMV3 node.
1 parent 2c7e7b5 commit ae1bb7c

13 files changed

+7308
-7591
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42607,6 +42607,43 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
4260742607

4260842608
return SDValue();
4260942609
}
42610+
case X86ISD::VPERMV: {
42611+
// Combine VPERMV to VPERMV3 if the source operand can be freely split.
42612+
SmallVector<int, 32> Mask;
42613+
SmallVector<SDValue, 2> SrcOps, SubOps;
42614+
SDValue Src = peekThroughBitcasts(N.getOperand(1));
42615+
if ((Subtarget.hasVLX() ||
42616+
(VT.is512BitVector() && Subtarget.hasAVX512())) &&
42617+
getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
42618+
collectConcatOps(Src.getNode(), SubOps, DAG)) {
42619+
assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
42620+
assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
42621+
assert((SubOps.size() == 2 || SubOps.size() == 4) &&
42622+
"Unexpected split ops");
42623+
// Bail if we were permuting a widened vector.
42624+
if (SubOps[SubOps.size() - 1].isUndef())
42625+
return SDValue();
42626+
// Bail if any subops would have folded into the concat.
42627+
if (any_of(SubOps, [](SDValue Op) { return isShuffleFoldableLoad(Op); }))
42628+
return SDValue();
42629+
// Concat 4x128 back to 2x256.
42630+
if (SubOps.size() == 4) {
42631+
SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
42632+
SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
42633+
}
42634+
// Convert mask to 2 operand shuffle.
42635+
int HalfElts = NumElts / 2;
42636+
for (int &M : Mask)
42637+
M += M >= HalfElts ? HalfElts : 0;
42638+
SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
42639+
VT.getSizeInBits());
42640+
SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
42641+
VT.getSizeInBits());
42642+
return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
42643+
DAG.getBitcast(VT, Hi), Subtarget, DAG);
42644+
}
42645+
return SDValue();
42646+
}
4261042647
case X86ISD::VPERMV3: {
4261142648
// Combine VPERMV3 to widened VPERMV if the two source operands can be
4261242649
// freely concatenated.

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 80 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -4242,33 +4242,31 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
42424242
; AVX512F: # %bb.0:
42434243
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
42444244
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4245-
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
42464245
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4247-
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4248-
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
4249-
; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
4250-
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4251-
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4252-
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4253-
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4254-
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
4246+
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4247+
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,0,23]
4248+
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
4249+
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4250+
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4251+
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4252+
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
4253+
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
42554254
; AVX512F-NEXT: vzeroupper
42564255
; AVX512F-NEXT: retq
42574256
;
42584257
; AVX512DQ-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6:
42594258
; AVX512DQ: # %bb.0:
42604259
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
42614260
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4262-
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
42634261
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4264-
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4265-
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
4266-
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
4267-
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4268-
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4269-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4270-
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
4271-
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
4262+
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4263+
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,0,23]
4264+
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
4265+
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4266+
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4267+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4268+
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
4269+
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
42724270
; AVX512DQ-NEXT: vzeroupper
42734271
; AVX512DQ-NEXT: retq
42744272
;
@@ -4371,33 +4369,31 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
43714369
; AVX512F: # %bb.0:
43724370
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
43734371
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4374-
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
43754372
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4376-
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4377-
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
4378-
; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
4379-
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4380-
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4381-
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4382-
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4383-
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
4373+
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4374+
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,0]
4375+
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
4376+
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4377+
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4378+
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4379+
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
4380+
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
43844381
; AVX512F-NEXT: vzeroupper
43854382
; AVX512F-NEXT: retq
43864383
;
43874384
; AVX512DQ-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4:
43884385
; AVX512DQ: # %bb.0:
43894386
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
43904387
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4391-
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
43924388
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4393-
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4394-
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
4395-
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
4396-
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4397-
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4398-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4399-
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
4400-
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
4389+
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4390+
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,0]
4391+
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
4392+
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4393+
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4394+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4395+
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
4396+
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
44014397
; AVX512DQ-NEXT: vzeroupper
44024398
; AVX512DQ-NEXT: retq
44034399
;
@@ -4508,33 +4504,31 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
45084504
; AVX512F: # %bb.0:
45094505
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
45104506
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4511-
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
45124507
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4513-
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4514-
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4515-
; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
4516-
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4517-
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4518-
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4519-
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4520-
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
4508+
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4509+
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
4510+
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
4511+
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4512+
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4513+
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4514+
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
4515+
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
45214516
; AVX512F-NEXT: vzeroupper
45224517
; AVX512F-NEXT: retq
45234518
;
45244519
; AVX512DQ-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
45254520
; AVX512DQ: # %bb.0:
45264521
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
45274522
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4528-
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
45294523
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4530-
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4531-
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4532-
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
4533-
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4534-
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4535-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4536-
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
4537-
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
4524+
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4525+
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
4526+
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
4527+
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4528+
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4529+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4530+
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
4531+
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
45384532
; AVX512DQ-NEXT: vzeroupper
45394533
; AVX512DQ-NEXT: retq
45404534
;
@@ -4636,12 +4630,11 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
46364630
; AVX512F: # %bb.0:
46374631
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
46384632
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4639-
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
46404633
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4641-
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4642-
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4643-
; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0
4644-
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4634+
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4635+
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
4636+
; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
4637+
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
46454638
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
46464639
; AVX512F-NEXT: vzeroupper
46474640
; AVX512F-NEXT: retq
@@ -4650,12 +4643,11 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
46504643
; AVX512DQ: # %bb.0:
46514644
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
46524645
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4653-
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
46544646
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4655-
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4656-
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
4657-
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
4658-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4647+
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4648+
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
4649+
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
4650+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
46594651
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
46604652
; AVX512DQ-NEXT: vzeroupper
46614653
; AVX512DQ-NEXT: retq
@@ -4769,33 +4761,31 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
47694761
; AVX512F: # %bb.0:
47704762
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
47714763
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4772-
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
47734764
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4774-
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4775-
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
4776-
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
4777-
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4778-
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4779-
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4780-
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
4781-
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
4765+
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4766+
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
4767+
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
4768+
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4769+
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4770+
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4771+
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
4772+
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
47824773
; AVX512F-NEXT: vzeroupper
47834774
; AVX512F-NEXT: retq
47844775
;
47854776
; AVX512DQ-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
47864777
; AVX512DQ: # %bb.0:
47874778
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
47884779
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4789-
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
47904780
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4791-
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4792-
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
4793-
; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
4794-
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
4795-
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
4796-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4797-
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
4798-
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
4781+
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4782+
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
4783+
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
4784+
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
4785+
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
4786+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
4787+
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
4788+
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
47994789
; AVX512DQ-NEXT: vzeroupper
48004790
; AVX512DQ-NEXT: retq
48014791
;
@@ -4898,12 +4888,11 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
48984888
; AVX512F: # %bb.0:
48994889
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
49004890
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
4901-
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
49024891
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4903-
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4904-
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
4905-
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
4906-
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4892+
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4893+
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
4894+
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
4895+
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
49074896
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
49084897
; AVX512F-NEXT: vzeroupper
49094898
; AVX512F-NEXT: retq
@@ -4912,12 +4901,11 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
49124901
; AVX512DQ: # %bb.0:
49134902
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
49144903
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
4915-
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
49164904
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
4917-
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4918-
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
4919-
; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
4920-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
4905+
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
4906+
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
4907+
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
4908+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
49214909
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
49224910
; AVX512DQ-NEXT: vzeroupper
49234911
; AVX512DQ-NEXT: retq

0 commit comments

Comments
 (0)