Commit 3268bde

fixed corner cases with shift amt > 8 or undef
Updated affected tests
1 parent 0c7f8f2 commit 3268bde

9 files changed: +155 -176 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 17 additions & 9 deletions
@@ -29781,7 +29781,7 @@ template <typename InputTy, typename PermutationTy,
                                        8>>
 static bool PermuteAndPairVector(
     const InputTy &Inputs, PermutationTy &Permutation,
-    MapTy UnpairedInputs = MapTy()) {static_assert(std::is_same<typename InputTy::value_type, uint8_t>::value);
+    MapTy UnpairedInputs = MapTy()) {
   const typename InputTy::value_type Wildcard = ~0;
   SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
 
@@ -30160,10 +30160,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   // widened, and P2^-1 is the inverse shuffle of P2.
   // This is not profitable on XOP or AVX512 because they have 8/16-bit vector
   // variable shift instructions.
+  // GFNI is singled out because it normally implies AVX512, and there is no
+  // latency data for CPUs with GFNI and only SSE or AVX, but there are tests
+  // for such combinations anyway.
   if (ConstantAmt &&
       (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
       R.hasOneUse() && Subtarget.hasSSSE3() && !Subtarget.hasAVX512() &&
-      !Subtarget.hasXOP()) {
+      !Subtarget.hasXOP() && !Subtarget.hasGFNI()) {
     constexpr size_t LaneBytes = 16;
     const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
 
@@ -30172,8 +30175,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
       if (Amt.getOperand(I).isUndef())
        ShiftAmt.push_back(~0);
-      else
-        ShiftAmt.push_back(Amt.getConstantOperandVal(I));
+      else {
+        auto A = Amt.getConstantOperandVal(I);
+        ShiftAmt.push_back(A > 8 ? 8 : A);
+      }
     }
 
     // Check if we can find an in-lane shuffle to rearrange the shift amounts,
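Editor's note: the clamp above is the first corner case named in the commit message. Shifting an i8 lane by more than 8 is undefined in IR, so the lowering is free to normalize any such constant to 8, the largest amount the multiply-based sequence can encode. A minimal standalone sketch of that idea follows; the function and container names are illustrative and not taken from the patch.

#include <cstdint>
#include <vector>

// Illustrative only: mirror the clamping done in the hunk above. Any constant
// amount greater than 8 may be replaced by 8, and 8 itself is still encodable
// by the PMULHUW/PMULLW lowering (for logical shifts it simply clears the lane).
std::vector<uint8_t> clampByteShiftAmounts(const std::vector<unsigned> &Raw) {
  std::vector<uint8_t> Out;
  Out.reserve(Raw.size());
  for (unsigned A : Raw)
    Out.push_back(A > 8 ? 8 : static_cast<uint8_t>(A));
  return Out;
}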
@@ -30221,7 +30226,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       // For right shifts, (V)PMULHUW needs 2 extra instructions to handle an
       // amount of 0, making it unprofitable.
       if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
-          any_of(ShiftAmt, [](auto x) { return x == 0; }))
+          any_of(ShiftAmt, [](uint8_t x) { return x == 0; }))
        Profitable = false;
 
       bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
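Editor's note: the amount-of-0 restriction is easy to see with a little arithmetic. The (V)PMULHUW path implements a logical right shift by S on a 16-bit lane as the high half of an unsigned multiply by 2^(16-S), and that multiplier only fits in 16 bits for S >= 1. A brute-force check of the identity, purely illustrative and not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  // (X >> S) equals the high 16 bits of X * 2^(16-S) for every 16-bit X and
  // S in 1..8. S == 0 would need the multiplier 65536, which no longer fits in
  // a 16-bit lane, hence the two extra instructions the comment above mentions.
  for (uint32_t X = 0; X <= 0xFFFF; ++X)
    for (uint32_t S = 1; S <= 8; ++S) {
      uint16_t ViaMulhuw = static_cast<uint16_t>((X * (1u << (16 - S))) >> 16);
      assert(ViaMulhuw == static_cast<uint16_t>(X >> S));
    }
  return 0;
}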
@@ -30252,14 +30257,17 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       for (int Index : Permutation) {
         NewShiftAmt.push_back(Amt.getOperand(Index));
       }
-#ifndef NDEBUG
+      // If using (V)PMULHUW, any undef pair is resolved to shift by 8 so that
+      // it does not create extra instructions in case it is resolved to 0.
       for (size_t I = 0; I < NewShiftAmt.size(); I += 2) {
-        SDValue Even = NewShiftAmt[I];
-        SDValue Odd = NewShiftAmt[I + 1];
+        SDValue &Even = NewShiftAmt[I];
+        SDValue &Odd = NewShiftAmt[I + 1];
         assert(Even.isUndef() || Odd.isUndef() ||
                Even->getAsZExtVal() == Odd->getAsZExtVal());
+        if (!IsAdjacentQuads && Even.isUndef() && Odd.isUndef())
+          Even = DAG.getConstant(8, dl, VT.getScalarType());
       }
-#endif
+
       SDValue NewShiftVector = DAG.getBuildVector(VT, dl, NewShiftAmt);
       SDValue NewShift = DAG.getNode(Opc, dl, VT, InnerShuffle, NewShiftVector);
       SmallVector<int, 64> InversePermutation(Permutation.size());
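Editor's note: the second corner case is a pair of amounts that are both undef. Left alone, such a pair could later be materialized as 0, which is exactly the amount the profitability check above penalizes, so the even element is pinned to 8. A simplified sketch of the same decision, using optionals instead of SDValues; the helper name and types are illustrative only.

#include <cstdint>
#include <optional>
#include <vector>

// Illustrative stand-in for the SDValue loop above: amounts are processed in
// pairs that share one 16-bit lane. If neither half of a pair is defined, pin
// the even half to 8 so the pair can never default to 0, the one amount that
// makes the (V)PMULHUW sequence more expensive.
void resolveUndefPairs(std::vector<std::optional<uint8_t>> &Amts,
                       bool IsAdjacentQuads) {
  for (size_t I = 0; I + 1 < Amts.size(); I += 2)
    if (!IsAdjacentQuads && !Amts[I] && !Amts[I + 1])
      Amts[I] = 8; // the odd half stays undef and simply pairs with it
}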

llvm/test/CodeGen/X86/combine-sdiv.ll

Lines changed: 43 additions & 73 deletions
@@ -351,32 +351,20 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2]
-; SSE41-NEXT: pmullw %xmm0, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmullw %xmm0, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[9,1,2,7,4,12,11,3,8,0,14,6,5,13,10,15]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1024,512,2048,4096,256,16384,8192,512]
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE41-NEXT: paddb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
-; SSE41-NEXT: pmullw %xmm3, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: psraw $8, %xmm2
-; SSE41-NEXT: pmullw %xmm3, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm0, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [16384,32768,8192,4096,256,1024,2048,32768]
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [32,32,64,64,16,16,8,8,u,u,2,2,4,4,64,64]
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: psubb %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[1,2,7,4,12,11,3],zero,xmm2[0,14,6,5,13,10,15]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: por %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
@@ -2184,39 +2172,23 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; SSE41-LABEL: non_splat_minus_one_divisor_1:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: psllw $1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7]
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128]
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,6,4,5,3,7,12,9,10,11,15,13,14,8]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,512,256,256,512,512,32768,512]
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE41-NEXT: paddb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $1, %xmm3
-; SSE41-NEXT: psllw $7, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: psraw $8, %xmm2
-; SSE41-NEXT: psllw $7, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm0, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: psubb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,32768,256,256,32768,32768,512,32768]
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,64,64,u,u,u,u,64,64,64,64,1,1,64,u]
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: psubb %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,xmm2[9,10,11,8,13,14,12]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,4,5],zero,xmm0[7,8],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: psubb %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: non_splat_minus_one_divisor_1:
@@ -2253,25 +2225,23 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ;
 ; AVX2-LABEL: non_splat_minus_one_divisor_1:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,2,256,256,256,2,256,256,2,2,2,2,128,2,128]
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,128,256,256,256,128,256,256,128,128,128,128,2,128,2]
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,8,2,6,4,5,3,7,12,9,10,11,15,13,0,1]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64,64,64,1,1,0,0,64,64,64,64,1,1,0,0]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,xmm1[9,10,11,8,13,0,12]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,4,5],zero,xmm0[7,8],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: non_splat_minus_one_divisor_1:

llvm/test/CodeGen/X86/vector-fshr-128.ll

Lines changed: 4 additions & 6 deletions
@@ -2010,13 +2010,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; SSE41-NEXT: psrlw $8, %xmm3
 ; SSE41-NEXT: packuswb %xmm1, %xmm3
 ; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
-; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; SSE41-NEXT: pshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,8,13,3,11,5,9,7,10,6,12,4,14,2,1,15]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,16,4,1,2,8,32,64]
+; SSE41-NEXT: pshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,14,13,3,11,5,9,7,1,6,8,4,10,2,12,15]
 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: constant_funnnel_v16i8:

llvm/test/CodeGen/X86/vector-mul.ll

Lines changed: 13 additions & 16 deletions
@@ -262,22 +262,20 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
 ;
 ; X86-SSE4-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
 ; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
-; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
+; X86-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [0,4,2,6,1,5,3,7,8,12,10,14,9,13,11,15]
+; X86-SSE4-NEXT: pshufb %xmm1, %xmm0
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,4,2,8,1,4,2,8]
+; X86-SSE4-NEXT: pshufb %xmm1, %xmm0
 ; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE4-NEXT: por %xmm1, %xmm0
 ; X86-SSE4-NEXT: retl
 ;
 ; X64-SSE4-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
 ; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
-; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
+; X64-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [0,4,2,6,1,5,3,7,8,12,10,14,9,13,11,15]
+; X64-SSE4-NEXT: pshufb %xmm1, %xmm0
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,2,8,1,4,2,8]
+; X64-SSE4-NEXT: pshufb %xmm1, %xmm0
 ; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-NEXT: por %xmm1, %xmm0
 ; X64-SSE4-NEXT: retq
 ;
 ; X64-XOP-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
@@ -287,12 +285,11 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
 ;
 ; X64-AVX2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
 ; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
-; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
+; X64-AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64-AVX2-NEXT: retq
 ;
 ; X64-AVX512DQ-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:

llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll

Lines changed: 38 additions & 23 deletions
@@ -2042,12 +2042,13 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; AVX2-LABEL: constant_shift_v4i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,8,9,1,15,6,7,2,12,10,11,3,13,4,5]
+; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,14,15,6,7,2,3,10,11,9,13,1,5]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [128,64,32,16,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; XOP-LABEL: constant_shift_v4i8:
@@ -2105,17 +2106,29 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 }
 
 define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
-; SSE-LABEL: constant_shift_v2i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: constant_shift_v2i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: constant_shift_v2i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15]
+; SSE41-NEXT: pshufb %xmm1, %xmm0
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,256,256,256,256,256,256,8192]
+; SSE41-NEXT: pshufb %xmm1, %xmm0
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: movd {{.*#+}} xmm1 = [32,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: psubb %xmm1, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v2i8:
 ; AVX1: # %bb.0:
@@ -2130,12 +2143,14 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; AVX2-LABEL: constant_shift_v2i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15]
+; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32,16,16,0,32,0,16,0,32,0,16,0,32,0,0,0]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; XOP-LABEL: constant_shift_v2i8:
