Skip to content

[X86] lowerShuffleAsBroadcast - use isShuffleEquivalent to search for a hidden broadcast pattern #126517

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9800,6 +9800,24 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
MaskSize == (int)ExpectedOp.getNumOperands())
return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
break;
case ISD::BITCAST:
if (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have more than one bitcast adjacent? Can we use getOperand(0)?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it happens as DAG combines still don't occur in topological order - and its common in shuffle lowering where we are bitcasting between shuffle widths so much.

SDValue Src = peekThroughBitcasts(Op);
EVT SrcVT = Src.getValueType();
if (SrcVT.isVector() &&
(SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
return (Idx % Scale) == (ExpectedIdx % Scale) &&
IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
Idx / Scale, ExpectedIdx / Scale);
}
}
break;
case ISD::VECTOR_SHUFFLE: {
auto *SVN = cast<ShuffleVectorSDNode>(Op);
return Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize &&
SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
}
case X86ISD::VBROADCAST:
case X86ISD::VBROADCAST_LOAD:
// TODO: Handle MaskSize != VT.getVectorNumElements()?
Expand Down Expand Up @@ -12780,8 +12798,13 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,

// Check that the mask is a broadcast.
int BroadcastIdx = getSplatIndex(Mask);
if (BroadcastIdx < 0)
return SDValue();
if (BroadcastIdx < 0) {
// Check for hidden broadcast.
SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
return SDValue();
BroadcastIdx = 0;
}
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
Expand Down
91 changes: 39 additions & 52 deletions llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1220,7 +1220,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
Expand All @@ -1234,7 +1234,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
Expand All @@ -1247,7 +1247,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
Expand All @@ -1259,10 +1259,9 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
Expand Down Expand Up @@ -1345,7 +1344,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
Expand All @@ -1359,7 +1358,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
Expand All @@ -1372,7 +1371,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
Expand All @@ -1384,10 +1383,9 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
Expand Down Expand Up @@ -1719,7 +1717,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
Expand All @@ -1732,7 +1730,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
Expand All @@ -1745,7 +1743,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
Expand Down Expand Up @@ -2691,14 +2689,13 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
Expand All @@ -2708,10 +2705,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX512F-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
Expand All @@ -2724,10 +2720,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
Expand All @@ -2739,11 +2734,10 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
Expand Down Expand Up @@ -2959,14 +2953,13 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
Expand All @@ -2976,10 +2969,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
Expand All @@ -2992,10 +2984,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
Expand All @@ -3007,11 +2998,10 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
Expand Down Expand Up @@ -3742,14 +3732,13 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
Expand All @@ -3759,9 +3748,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
Expand All @@ -3775,9 +3763,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
Expand Down
Loading