Skip to content

Commit e09c01f

Browse files
committed
[X86] lowerShuffleAsBroadcast - use isShuffleEquivalent to search for a hidden broadcast pattern
lowerShuffleAsBroadcast only matches a known-splat shuffle mask, but we can use the isShuffleEquivalent/IsElementEquivalent helpers to try to find a hidden broadcastable shuffle pattern. This requires extending IsElementEquivalent to peek through bitcasts so that it can match against wider shuffles - these typically appear during shuffle lowering where we've widened a preceding shuffle, often into a vector concatenation etc. Amazingly, I hit this while yak shaving llvm#126033.
1 parent c6b13a2 commit e09c01f

13 files changed

+332
-371
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 25 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9799,6 +9799,24 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
97999799
MaskSize == (int)ExpectedOp.getNumOperands())
98009800
return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
98019801
break;
9802+
case ISD::BITCAST:
9803+
if (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize) {
9804+
SDValue Src = peekThroughBitcasts(Op);
9805+
EVT SrcVT = Src.getValueType();
9806+
if (SrcVT.isVector() &&
9807+
(SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9808+
unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9809+
return (Idx % Scale) == (ExpectedIdx % Scale) &&
9810+
IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9811+
Idx / Scale, ExpectedIdx / Scale);
9812+
}
9813+
}
9814+
break;
9815+
case ISD::VECTOR_SHUFFLE: {
9816+
auto *SVN = cast<ShuffleVectorSDNode>(Op);
9817+
return Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize &&
9818+
SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9819+
}
98029820
case X86ISD::VBROADCAST:
98039821
case X86ISD::VBROADCAST_LOAD:
98049822
// TODO: Handle MaskSize != VT.getVectorNumElements()?
@@ -12779,8 +12797,13 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
1277912797

1278012798
// Check that the mask is a broadcast.
1278112799
int BroadcastIdx = getSplatIndex(Mask);
12782-
if (BroadcastIdx < 0)
12783-
return SDValue();
12800+
if (BroadcastIdx < 0) {
12801+
// Check for hidden broadcast.
12802+
SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
12803+
if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
12804+
return SDValue();
12805+
BroadcastIdx = 0;
12806+
}
1278412807
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
1278512808
"a sorted mask where the broadcast "
1278612809
"comes from V1.");

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 39 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -1220,7 +1220,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
12201220
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
12211221
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
12221222
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1223-
; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
1223+
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
12241224
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
12251225
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12261226
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -1234,7 +1234,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
12341234
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
12351235
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
12361236
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1237-
; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
1237+
; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
12381238
; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
12391239
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
12401240
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1247,7 +1247,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
12471247
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
12481248
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
12491249
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1250-
; AVX512DQ-NEXT: vpbroadcastd %xmm1, %ymm1
1250+
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
12511251
; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
12521252
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
12531253
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1259,10 +1259,9 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
12591259
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
12601260
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
12611261
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1262-
; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0
12631262
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
12641263
; AVX512BW-NEXT: kmovd %eax, %k1
1265-
; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
1264+
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1}
12661265
; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
12671266
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
12681267
; AVX512BW-NEXT: vzeroupper
@@ -1345,7 +1344,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
13451344
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
13461345
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
13471346
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1348-
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
1347+
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
13491348
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
13501349
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
13511350
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -1359,7 +1358,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
13591358
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
13601359
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
13611360
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1362-
; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
1361+
; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
13631362
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
13641363
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm0
13651364
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1372,7 +1371,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
13721371
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
13731372
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
13741373
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1375-
; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
1374+
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
13761375
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
13771376
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm0
13781377
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1384,10 +1383,9 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
13841383
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
13851384
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
13861385
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1387-
; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm0
13881386
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
13891387
; AVX512BW-NEXT: kmovd %eax, %k1
1390-
; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
1388+
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1}
13911389
; AVX512BW-NEXT: vpaddb (%rdx), %zmm1, %zmm0
13921390
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
13931391
; AVX512BW-NEXT: vzeroupper
@@ -1719,7 +1717,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
17191717
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
17201718
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
17211719
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1722-
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
1720+
; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
17231721
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
17241722
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
17251723
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1732,7 +1730,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
17321730
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
17331731
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
17341732
; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1735-
; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
1733+
; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1
17361734
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
17371735
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
17381736
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1745,7 +1743,7 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
17451743
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
17461744
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
17471745
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
1748-
; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1
1746+
; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1
17491747
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
17501748
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
17511749
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -2691,14 +2689,13 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
26912689
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
26922690
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
26932691
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2694-
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2
2695-
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2696-
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2697-
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
2692+
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
2693+
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
2694+
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
26982695
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
26992696
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2700-
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
27012697
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2698+
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
27022699
; AVX2-NEXT: vzeroupper
27032700
; AVX2-NEXT: retq
27042701
;
@@ -2708,10 +2705,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
27082705
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
27092706
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
27102707
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2711-
; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm2
2712-
; AVX512F-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
27132708
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
2714-
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2709+
; AVX512F-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
2710+
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
27152711
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
27162712
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
27172713
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2724,10 +2720,9 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
27242720
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
27252721
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
27262722
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2727-
; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm2
2728-
; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
27292723
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
2730-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2724+
; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
2725+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
27312726
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
27322727
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
27332728
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2739,11 +2734,10 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
27392734
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
27402735
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
27412736
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
2742-
; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm2
2737+
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
27432738
; AVX512BW-NEXT: movl $286331153, %eax # imm = 0x11111111
27442739
; AVX512BW-NEXT: kmovd %eax, %k1
2745-
; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
2746-
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
2740+
; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
27472741
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
27482742
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
27492743
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -2959,14 +2953,13 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
29592953
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
29602954
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
29612955
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2962-
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2
2963-
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2964-
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2965-
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
2956+
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
2957+
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2958+
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
29662959
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
29672960
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
2968-
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
29692961
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
2962+
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
29702963
; AVX2-NEXT: vzeroupper
29712964
; AVX2-NEXT: retq
29722965
;
@@ -2976,10 +2969,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
29762969
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
29772970
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
29782971
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2979-
; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
2980-
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
29812972
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
2982-
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2973+
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
2974+
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
29832975
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
29842976
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
29852977
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2992,10 +2984,9 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
29922984
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
29932985
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
29942986
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
2995-
; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
2996-
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
29972987
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
2998-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
2988+
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
2989+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
29992990
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
30002991
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
30012992
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3007,11 +2998,10 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
30072998
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
30082999
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
30093000
; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
3010-
; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2
3001+
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
30113002
; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101
30123003
; AVX512BW-NEXT: kmovd %eax, %k1
3013-
; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1}
3014-
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
3004+
; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
30153005
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
30163006
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
30173007
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3742,14 +3732,13 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
37423732
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
37433733
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
37443734
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3745-
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2
37463735
; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
3747-
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
3748-
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
3749-
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
3750-
; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
3751-
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
3752-
; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
3736+
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
3737+
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
3738+
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
3739+
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
3740+
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
3741+
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
37533742
; AVX2-NEXT: vzeroupper
37543743
; AVX2-NEXT: retq
37553744
;
@@ -3759,9 +3748,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
37593748
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
37603749
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
37613750
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3762-
; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
3763-
; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
37643751
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
3752+
; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
37653753
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
37663754
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
37673755
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3775,9 +3763,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
37753763
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
37763764
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
37773765
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
3778-
; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
3779-
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
37803766
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
3767+
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
37813768
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
37823769
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
37833770
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)

0 commit comments

Comments
 (0)