
Commit cb58910

[x86] Unify the horizontal adding used for popcount lowering, taking the best approach of each.

For vNi16, we use the SHL + ADD + SRL pattern, which seems easily the best. For vNi32, we use the PUNPCK + PSADBW + PACKUSWB pattern. In some cases there is a huge improvement with this in IACA's estimated throughput -- over 2x higher throughput!!!! -- but the measurements are too good to be true. In one narrow case, the SHL + ADD + SHL + ADD + SRL pattern looks slightly faster, but I'm not sure I believe any of the measurements at this point. Both are the exact same uops, though, so it is hard to be confident of anything past that.

If anyone wants to collect very detailed (Agner-level) timings with the result of this patch, or with the i32 case replaced with SHL + ADD + SHL + ADD + SRL, I'd be very interested. Note that you'll need to test on both Ivybridge and Haswell, with each of SSE3, SSSE3, and AVX selected, as I saw unique behavior in each of these buckets with IACA, all of which should be checked against measured performance.

This patch is still a useful improvement on its own: it drops duplicate work and gets the much nicer PSADBW lowering for v2i64. I'd still like to rephrase this in terms of a generic horizontal sum; it's a bit lame to have a special case of that just for popcount.

llvm-svn: 238652
1 parent 230d298 commit cb58910
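
To make the vNi16 trick concrete: each byte of the intermediate vector holds a pop count of at most 8, so a byte-wise add cannot carry across byte lanes. Below is a minimal scalar sketch of why SHL + ADD + SRL produces the i16 pop count; this is illustrative only (the function name is made up, and __builtin_popcount assumes GCC/Clang), not code from the patch.

#include <cassert>
#include <cstdint>

// Scalar model of the vNi16 lowering: B packs the two byte-wise pop counts
// of X into one 16-bit lane.
static uint16_t popcnt16_via_byte_sums(uint16_t X) {
  uint16_t B = (uint16_t)((__builtin_popcount(X >> 8) << 8) |
                          __builtin_popcount(X & 0xFF));
  // SHL by 8 moves the low byte's count on top of the high byte's count; the
  // add cannot carry between bytes (each count <= 8), and SRL by 8 extracts
  // the 16-bit total. This matches the psllw/paddb/psrlw sequence in the
  // test diffs below.
  return (uint16_t)((uint16_t)(B << 8) + B) >> 8;
}

int main() {
  assert(popcnt16_via_byte_sums(0xFFFF) == 16);
  assert(popcnt16_via_byte_sums(0x8001) == 2);
}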

File tree

3 files changed (+67, -150 lines)


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 16 additions & 76 deletions
@@ -17344,63 +17344,19 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
     return DAG.getBitcast(VT, V);
   }

-  // To obtain pop count for each i16 element, shuffle the byte pop count to get
-  // even and odd elements into distinct vectors, add them and zero-extend each
-  // i8 elemento into i16, i.e.:
-  //
-  // B -> pop count per i8
-  // W -> pop count per i16
-  //
-  // Y = shuffle B, undef <0, 2, ...>
-  // Z = shuffle B, undef <1, 3, ...>
-  // W = zext <... x i8> to <... x i16> (Y + Z)
-  //
-  // Use a byte shuffle mask that matches PSHUFB.
-  //
+  // The only element type left is i16.
   assert(EltVT == MVT::i16 && "Unknown how to handle type");
-  SDValue Undef = DAG.getUNDEF(ByteVecVT);
-  SmallVector<int, 32> MaskA, MaskB;
-
-  // We can't use PSHUFB across lanes, so do the shuffle and sum inside each
-  // 128-bit lane, and then collapse the result.
-  int NumLanes = VecSize / 128;
-  assert(VecSize % 128 == 0 && "Must have 16-byte multiple vectors!");
-  for (int i = 0; i < NumLanes; ++i) {
-    for (int j = 0; j < 8; ++j) {
-      MaskA.push_back(i * 16 + j * 2);
-      MaskB.push_back(i * 16 + (j * 2) + 1);
-    }
-    MaskA.append((size_t)8, -1);
-    MaskB.append((size_t)8, -1);
-  }
-
-  SDValue ShuffA = DAG.getVectorShuffle(ByteVecVT, DL, V, Undef, MaskA);
-  SDValue ShuffB = DAG.getVectorShuffle(ByteVecVT, DL, V, Undef, MaskB);
-  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, ShuffA, ShuffB);
-
-  SmallVector<int, 4> Mask;
-  for (int i = 0; i < NumLanes; ++i)
-    Mask.push_back(2 * i);
-  Mask.append((size_t)NumLanes, -1);
-
-  int NumI64Elts = VecSize / 64;
-  MVT VecI64VT = MVT::getVectorVT(MVT::i64, NumI64Elts);
-
-  V = DAG.getBitcast(VecI64VT, V);
-  V = DAG.getVectorShuffle(VecI64VT, DL, V, DAG.getUNDEF(VecI64VT), Mask);
-  V = DAG.getBitcast(ByteVecVT, V);
-
-  // Zero extend i8s into i16 elts
-  SmallVector<int, 16> ZExtInRegMask;
-  for (int i = 0; i < NumElts; ++i) {
-    ZExtInRegMask.push_back(i);
-    ZExtInRegMask.push_back(2 * NumElts);
-  }

-  return DAG.getBitcast(
-      VT, DAG.getVectorShuffle(ByteVecVT, DL, V,
-                               getZeroVector(ByteVecVT, Subtarget, DAG, DL),
-                               ZExtInRegMask));
+  // To obtain pop count for each i16 element starting from the pop count for
+  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
+  // right by 8. It is important to shift as i16s as i8 vector shift isn't
+  // directly supported.
+  SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT));
+  SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters);
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
+                  DAG.getBitcast(ByteVecVT, V));
+  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter);
 }

@@ -17526,28 +17482,12 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
   // At this point, V contains the byte-wise population count, and we are
   // merely doing a horizontal sum if necessary to get the wider element
   // counts.
-  //
-  // FIXME: There is a different lowering strategy above for the horizontal sum
-  // of byte-wise population counts. This one and that one should be merged,
-  // using the fastest of the two for each size.
-  MVT ByteVT = MVT::getVectorVT(MVT::i8, VecSize / 8);
-  MVT ShiftVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
-  V = DAG.getBitcast(ByteVT, V);
-  assert(Len <= 64 && "We don't support element sizes of more than 64 bits!");
-  assert(isPowerOf2_32(Len) && "Only power of two element sizes supported!");
-  for (int i = Len; i > 8; i /= 2) {
-    SDValue Shl = DAG.getBitcast(
-        ByteVT, GetShift(ISD::SHL, DAG.getBitcast(ShiftVT, V), i / 2));
-    V = DAG.getNode(ISD::ADD, DL, ByteVT, V, Shl);
-  }
-
-  // The high byte now contains the sum of the element bytes. Shift it right
-  // (if needed) to make it the low byte.
-  V = DAG.getBitcast(VT, V);
-  if (Len > 8)
-    V = GetShift(ISD::SRL, V, Len - 8);
+  if (EltVT == MVT::i8)
+    return V;

-  return V;
+  return LowerHorizontalByteSum(
+      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
+      DAG);
 }

 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
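
For reference, the SHL + ADD + SRL node sequence built above maps onto SSE2 as psllw/paddb/psrlw. Here is a hedged intrinsics sketch of the same computation (not code from the patch; the function name is illustrative):

#include <emmintrin.h> // SSE2

// Horizontal byte sum into i16 lanes, given V = byte-wise pop counts.
// Shifting as i16 matters because x86 has no i8 vector shift.
static __m128i byteSumToI16(__m128i V) {
  __m128i Shl = _mm_slli_epi16(V, 8); // psllw $8: low count over high count
  __m128i Sum = _mm_add_epi8(Shl, V); // paddb: no cross-byte carry possible
  return _mm_srli_epi16(Sum, 8);      // psrlw $8: total into the low byte
}

This is exactly the pattern that shows up in the updated test expectations below.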

llvm/test/CodeGen/X86/vector-popcnt-128.ll

Lines changed: 39 additions & 56 deletions
@@ -24,16 +24,9 @@ define <2 x i64> @testv2i64(<2 x i64> %in) {
 ; SSE2-NEXT: psrlq $4, %xmm1
 ; SSE2-NEXT: paddq %xmm0, %xmm1
 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: psadbw %xmm0, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllq $32, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $16, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllq $8, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm0
-; SSE2-NEXT: psrlq $56, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE3-LABEL: testv2i64:
@@ -52,16 +45,9 @@ define <2 x i64> @testv2i64(<2 x i64> %in) {
 ; SSE3-NEXT: psrlq $4, %xmm1
 ; SSE3-NEXT: paddq %xmm0, %xmm1
 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pxor %xmm0, %xmm0
+; SSE3-NEXT: psadbw %xmm0, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllq $32, %xmm0
-; SSE3-NEXT: paddb %xmm1, %xmm0
-; SSE3-NEXT: movdqa %xmm0, %xmm1
-; SSE3-NEXT: psllq $16, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllq $8, %xmm0
-; SSE3-NEXT: paddb %xmm1, %xmm0
-; SSE3-NEXT: psrlq $56, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; SSSE3-LABEL: testv2i64:
@@ -130,13 +116,14 @@ define <4 x i32> @testv4i32(<4 x i32> %in) {
 ; SSE2-NEXT: psrld $4, %xmm1
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psllq $16, %xmm2
-; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psllq $8, %xmm0
-; SSE2-NEXT: paddb %xmm2, %xmm0
-; SSE2-NEXT: psrld $24, %xmm0
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: psadbw %xmm0, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: psadbw %xmm0, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE3-LABEL: testv4i32:
@@ -155,13 +142,14 @@ define <4 x i32> @testv4i32(<4 x i32> %in) {
 ; SSE3-NEXT: psrld $4, %xmm1
 ; SSE3-NEXT: paddd %xmm0, %xmm1
 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pxor %xmm0, %xmm0
 ; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: psllq $16, %xmm2
-; SSE3-NEXT: paddb %xmm1, %xmm2
-; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psllq $8, %xmm0
-; SSE3-NEXT: paddb %xmm2, %xmm0
-; SSE3-NEXT: psrld $24, %xmm0
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE3-NEXT: psadbw %xmm0, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT: psadbw %xmm0, %xmm1
+; SSE3-NEXT: packuswb %xmm2, %xmm1
+; SSE3-NEXT: movdqa %xmm1, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; SSSE3-LABEL: testv4i32:
@@ -247,7 +235,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) {
 ; SSE2-NEXT: paddw %xmm0, %xmm1
 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllq $8, %xmm0
+; SSE2-NEXT: psllw $8, %xmm0
 ; SSE2-NEXT: paddb %xmm1, %xmm0
 ; SSE2-NEXT: psrlw $8, %xmm0
 ; SSE2-NEXT: retq
@@ -269,30 +257,27 @@ define <8 x i16> @testv8i16(<8 x i16> %in) {
 ; SSE3-NEXT: paddw %xmm0, %xmm1
 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllq $8, %xmm0
+; SSE3-NEXT: psllw $8, %xmm0
 ; SSE3-NEXT: paddb %xmm1, %xmm0
 ; SSE3-NEXT: psrlw $8, %xmm0
 ; SSE3-NEXT: retq
 ;
 ; SSSE3-LABEL: testv8i16:
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pshufb %xmm2, %xmm4
 ; SSSE3-NEXT: psrlw $4, %xmm0
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufb %xmm0, %xmm1
-; SSSE3-NEXT: paddb %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: paddb %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: pshufb %xmm0, %xmm3
+; SSSE3-NEXT: paddb %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: psllw $8, %xmm0
+; SSSE3-NEXT: paddb %xmm3, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: testv8i16:
@@ -308,10 +293,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) {
 ; SSE41-NEXT: pshufb %xmm0, %xmm3
 ; SSE41-NEXT: paddb %xmm4, %xmm3
 ; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: paddb %xmm0, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT: psllw $8, %xmm0
+; SSE41-NEXT: paddb %xmm3, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: testv8i16:
@@ -324,10 +308,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) {
 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
   ret <8 x i16> %out
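
The v4i32 expectations above are the PUNPCK + PSADBW + PACKUSWB pattern from the commit message: each pair of i32 elements is widened into i64 lanes by interleaving with zero, psadbw against zero sums the bytes of each i64 lane, and packuswb packs the four small sums back into the i32 lanes. A hedged intrinsics sketch of the same dance (illustrative name, SSE2 assumed; not code from the patch):

#include <emmintrin.h> // SSE2

// Horizontal byte sum into i32 lanes, given V = byte-wise pop counts.
static __m128i byteSumToI32(__m128i V) {
  __m128i Zero = _mm_setzero_si128();
  __m128i Hi = _mm_unpackhi_epi32(V, Zero); // punpckhdq: high i32 pair -> i64 lanes
  __m128i Lo = _mm_unpacklo_epi32(V, Zero); // punpckldq: low i32 pair -> i64 lanes
  Hi = _mm_sad_epu8(Hi, Zero); // psadbw: sum 8 bytes into each i64's low word
  Lo = _mm_sad_epu8(Lo, Zero);
  return _mm_packus_epi16(Lo, Hi); // packuswb: sums back into the i32 lanes
}

The v2i64 case is simpler still: a single psadbw against zero, as in the testv2i64 expectations above.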

llvm/test/CodeGen/X86/vector-popcnt-256.ll

Lines changed: 12 additions & 18 deletions
@@ -108,23 +108,19 @@ define <16 x i16> @testv16i16(<16 x i16> %in) {
 ; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
 ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
 ; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4
+; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm1
-; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT: retq
 ;
@@ -138,11 +134,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) {
 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in)
   ret <16 x i16> %out
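
The AVX2 expectations are the same i16 horizontal sum, just widened to 256-bit registers. An equivalent intrinsics sketch (illustrative name, AVX2 assumed):

#include <immintrin.h> // AVX2

// 256-bit i16 horizontal byte sum: vpsllw + vpaddb + vpsrlw.
static __m256i byteSumToI16_256(__m256i V) {
  __m256i Shl = _mm256_slli_epi16(V, 8);
  __m256i Sum = _mm256_add_epi8(Shl, V);
  return _mm256_srli_epi16(Sum, 8);
}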
