Skip to content

Commit 3ce544e

Browse files
committed
[X86] lowerShuffleAsBroadcast - improve handling of non-zero element index broadcasts
On AVX2+, support broadcasting of any element if it occurs in the bottom 128-bit subvector by shuffling the element down to element 0 and then broadcasting. Fixes #113396
1 parent 6c3d374 commit 3ce544e

File tree

4 files changed

+66
-57
lines changed

4 files changed

+66
-57
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12637,6 +12637,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
1263712637
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
1263812638
"a sorted mask where the broadcast "
1263912639
"comes from V1.");
12640+
int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
1264012641

1264112642
// Go up the chain of (vector) values to find a scalar load that we can
1264212643
// combine with the broadcast.
@@ -12756,16 +12757,28 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
1275612757
if (VT == MVT::v4f64 || VT == MVT::v4i64)
1275712758
return SDValue();
1275812759

12759-
// Only broadcast the zero-element of a 128-bit subvector.
12760-
if ((BitOffset % 128) != 0)
12761-
return SDValue();
12760+
// If we are broadcasting an element from the lowest 128-bit subvector, try
12761+
// to move the element in position.
12762+
if (BitOffset < 128 && NumActiveElts > 1 &&
12763+
V.getScalarValueSizeInBits() == NumEltBits) {
12764+
assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12765+
"Unexpected bit-offset");
12766+
SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
12767+
ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
12768+
V = extractSubVector(V, 0, DAG, DL, 128);
12769+
V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
12770+
} else {
12771+
// Only broadcast the zero-element of a 128-bit subvector.
12772+
if ((BitOffset % 128) != 0)
12773+
return SDValue();
1276212774

12763-
assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12764-
"Unexpected bit-offset");
12765-
assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12766-
"Unexpected vector size");
12767-
unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12768-
V = extract128BitVector(V, ExtractIdx, DAG, DL);
12775+
assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12776+
"Unexpected bit-offset");
12777+
assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12778+
"Unexpected vector size");
12779+
unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12780+
V = extract128BitVector(V, ExtractIdx, DAG, DL);
12781+
}
1276912782
}
1277012783

1277112784
// On AVX we can use VBROADCAST directly for scalar sources.

llvm/test/CodeGen/X86/matrix-multiply.ll

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2563,15 +2563,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
25632563
; AVX512F-NEXT: vbroadcastss %xmm4, %ymm12
25642564
; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12
25652565
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm13 = xmm4[1,1,3,3]
2566-
; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13
2566+
; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13
25672567
; AVX512F-NEXT: vmulps %ymm13, %ymm11, %ymm13
25682568
; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12
2569-
; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm4[2,2,2,2]
2570-
; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13
2569+
; AVX512F-NEXT: vshufpd {{.*#+}} xmm13 = xmm4[1,0]
2570+
; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13
25712571
; AVX512F-NEXT: vmulps %ymm1, %ymm13, %ymm13
25722572
; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12
25732573
; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm4[3,3,3,3]
2574-
; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13
2574+
; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13
25752575
; AVX512F-NEXT: vmulps %ymm13, %ymm10, %ymm13
25762576
; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12
25772577
; AVX512F-NEXT: vextractf128 $1, %ymm4, %xmm13
@@ -2627,15 +2627,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
26272627
; AVX512F-NEXT: vbroadcastss %xmm5, %ymm13
26282628
; AVX512F-NEXT: vmulps %ymm0, %ymm13, %ymm13
26292629
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3]
2630-
; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
2630+
; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
26312631
; AVX512F-NEXT: vmulps %ymm14, %ymm11, %ymm14
26322632
; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13
2633-
; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm5[2,2,2,2]
2634-
; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
2633+
; AVX512F-NEXT: vshufpd {{.*#+}} xmm14 = xmm5[1,0]
2634+
; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
26352635
; AVX512F-NEXT: vmulps %ymm1, %ymm14, %ymm14
26362636
; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13
26372637
; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm5[3,3,3,3]
2638-
; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
2638+
; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
26392639
; AVX512F-NEXT: vmulps %ymm14, %ymm10, %ymm14
26402640
; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13
26412641
; AVX512F-NEXT: vextractf128 $1, %ymm5, %xmm14
@@ -2689,15 +2689,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
26892689
; AVX512F-NEXT: vbroadcastss %xmm6, %ymm12
26902690
; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12
26912691
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm14 = xmm6[1,1,3,3]
2692-
; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
2692+
; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
26932693
; AVX512F-NEXT: vmulps %ymm14, %ymm11, %ymm14
26942694
; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12
2695-
; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2]
2696-
; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
2695+
; AVX512F-NEXT: vshufpd {{.*#+}} xmm14 = xmm6[1,0]
2696+
; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
26972697
; AVX512F-NEXT: vmulps %ymm1, %ymm14, %ymm14
26982698
; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12
26992699
; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm6[3,3,3,3]
2700-
; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
2700+
; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
27012701
; AVX512F-NEXT: vmulps %ymm14, %ymm10, %ymm14
27022702
; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12
27032703
; AVX512F-NEXT: vextractf128 $1, %ymm6, %xmm14
@@ -2753,15 +2753,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
27532753
; AVX512F-NEXT: vbroadcastss %xmm7, %ymm12
27542754
; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12
27552755
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm15 = xmm7[1,1,3,3]
2756-
; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15
2756+
; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15
27572757
; AVX512F-NEXT: vmulps %ymm15, %ymm11, %ymm15
27582758
; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12
2759-
; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm7[2,2,2,2]
2760-
; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15
2759+
; AVX512F-NEXT: vshufpd {{.*#+}} xmm15 = xmm7[1,0]
2760+
; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15
27612761
; AVX512F-NEXT: vmulps %ymm1, %ymm15, %ymm15
27622762
; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12
27632763
; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm7[3,3,3,3]
2764-
; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15
2764+
; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15
27652765
; AVX512F-NEXT: vmulps %ymm15, %ymm10, %ymm15
27662766
; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12
27672767
; AVX512F-NEXT: vextractf128 $1, %ymm7, %xmm15
@@ -2828,15 +2828,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
28282828
; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm12
28292829
; AVX512VL-NEXT: vmulps %ymm0, %ymm12, %ymm12
28302830
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm4[1,1,3,3]
2831-
; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm13
2831+
; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13
28322832
; AVX512VL-NEXT: vmulps %ymm13, %ymm11, %ymm13
28332833
; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12
2834-
; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm4[2,2,2,2]
2835-
; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm13
2834+
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm13 = xmm4[1,0]
2835+
; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13
28362836
; AVX512VL-NEXT: vmulps %ymm1, %ymm13, %ymm13
28372837
; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12
28382838
; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm4[3,3,3,3]
2839-
; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm13
2839+
; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13
28402840
; AVX512VL-NEXT: vmulps %ymm13, %ymm10, %ymm13
28412841
; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12
28422842
; AVX512VL-NEXT: vextractf128 $1, %ymm4, %xmm13
@@ -2890,15 +2890,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
28902890
; AVX512VL-NEXT: vbroadcastss %xmm5, %ymm13
28912891
; AVX512VL-NEXT: vmulps %ymm0, %ymm13, %ymm13
28922892
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3]
2893-
; AVX512VL-NEXT: vbroadcastsd %xmm14, %ymm14
2893+
; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14
28942894
; AVX512VL-NEXT: vmulps %ymm14, %ymm11, %ymm14
28952895
; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13
2896-
; AVX512VL-NEXT: vshufps {{.*#+}} xmm14 = xmm5[2,2,2,2]
2897-
; AVX512VL-NEXT: vbroadcastsd %xmm14, %ymm14
2896+
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm14 = xmm5[1,0]
2897+
; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14
28982898
; AVX512VL-NEXT: vmulps %ymm1, %ymm14, %ymm14
28992899
; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13
29002900
; AVX512VL-NEXT: vshufps {{.*#+}} xmm14 = xmm5[3,3,3,3]
2901-
; AVX512VL-NEXT: vbroadcastsd %xmm14, %ymm14
2901+
; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14
29022902
; AVX512VL-NEXT: vmulps %ymm14, %ymm10, %ymm14
29032903
; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13
29042904
; AVX512VL-NEXT: vextractf128 $1, %ymm5, %xmm14
@@ -2952,15 +2952,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
29522952
; AVX512VL-NEXT: vbroadcastss %xmm6, %ymm14
29532953
; AVX512VL-NEXT: vmulps %ymm0, %ymm14, %ymm14
29542954
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm6[1,1,3,3]
2955-
; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15
2955+
; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15
29562956
; AVX512VL-NEXT: vmulps %ymm15, %ymm11, %ymm15
29572957
; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14
2958-
; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm6[2,2,2,2]
2959-
; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15
2958+
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm15 = xmm6[1,0]
2959+
; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15
29602960
; AVX512VL-NEXT: vmulps %ymm1, %ymm15, %ymm15
29612961
; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14
29622962
; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm6[3,3,3,3]
2963-
; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15
2963+
; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15
29642964
; AVX512VL-NEXT: vmulps %ymm15, %ymm10, %ymm15
29652965
; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14
29662966
; AVX512VL-NEXT: vextractf128 $1, %ymm6, %xmm15
@@ -3014,15 +3014,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
30143014
; AVX512VL-NEXT: vbroadcastss %xmm7, %ymm15
30153015
; AVX512VL-NEXT: vmulps %ymm0, %ymm15, %ymm15
30163016
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm7[1,1,3,3]
3017-
; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16
3017+
; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16
30183018
; AVX512VL-NEXT: vmulps %ymm16, %ymm11, %ymm16
30193019
; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15
3020-
; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm7[2,2,2,2]
3021-
; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16
3020+
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm16 = xmm7[1,0]
3021+
; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16
30223022
; AVX512VL-NEXT: vmulps %ymm16, %ymm1, %ymm16
30233023
; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15
30243024
; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm7[3,3,3,3]
3025-
; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16
3025+
; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16
30263026
; AVX512VL-NEXT: vmulps %ymm16, %ymm10, %ymm16
30273027
; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15
30283028
; AVX512VL-NEXT: vextractf32x4 $1, %ymm7, %xmm16

llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1453,32 +1453,28 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_
14531453
define <64 x i8> @shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01(<8 x i8> %0) {
14541454
; AVX512F-LABEL: shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01:
14551455
; AVX512F: # %bb.0:
1456-
; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1457-
; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm0
1458-
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
1456+
; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0
1457+
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
14591458
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
14601459
; AVX512F-NEXT: retq
14611460
;
14621461
; AVX512BW-LABEL: shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01:
14631462
; AVX512BW: # %bb.0:
1464-
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1465-
; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm0
1466-
; AVX512BW-NEXT: vpbroadcastw %xmm0, %zmm0
1463+
; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0
1464+
; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
14671465
; AVX512BW-NEXT: retq
14681466
;
14691467
; AVX512DQ-LABEL: shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01:
14701468
; AVX512DQ: # %bb.0:
1471-
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1472-
; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm0
1473-
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
1469+
; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm0
1470+
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
14741471
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
14751472
; AVX512DQ-NEXT: retq
14761473
;
14771474
; AVX512VBMI-LABEL: shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01:
14781475
; AVX512VBMI: # %bb.0:
1479-
; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1480-
; AVX512VBMI-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1481-
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
1476+
; AVX512VBMI-NEXT: vpsrlw $8, %xmm0, %xmm0
1477+
; AVX512VBMI-NEXT: vpbroadcastb %xmm0, %zmm0
14821478
; AVX512VBMI-NEXT: retq
14831479
%s = shufflevector <8 x i8> %0, <8 x i8> poison, <64 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
14841480
ret <64 x i8> %s

llvm/test/CodeGen/X86/vector-shuffle-v1.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -501,8 +501,8 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
501501
; AVX512VL-FAST-PERLANE-NEXT: kmovw %edi, %k1
502502
; AVX512VL-FAST-PERLANE-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
503503
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
504-
; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
505-
; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1
504+
; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
505+
; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastd %xmm1, %ymm1
506506
; AVX512VL-FAST-PERLANE-NEXT: vpslld $31, %ymm1, %ymm1
507507
; AVX512VL-FAST-PERLANE-NEXT: vptestmd %ymm1, %ymm1, %k1
508508
; AVX512VL-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -525,8 +525,8 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
525525
; VL_BW_DQ-FAST-PERLANE: # %bb.0:
526526
; VL_BW_DQ-FAST-PERLANE-NEXT: kmovd %edi, %k0
527527
; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm0
528-
; VL_BW_DQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
529-
; VL_BW_DQ-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm0
528+
; VL_BW_DQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
529+
; VL_BW_DQ-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm0
530530
; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovd2m %ymm0, %k0
531531
; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovm2w %k0, %xmm0
532532
; VL_BW_DQ-FAST-PERLANE-NEXT: vzeroupper

0 commit comments

Comments (0)