Skip to content
This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 34f6b25

Browse files
committed
[X86] Remove isel patterns for MOVSS/MOVSD ISD opcodes with integer types.
Ideally our ISD node types going into the isel table would have types consistent with their instruction domain. This prevents us having to duplicate patterns with different types for the same instruction. Unfortunately, it seems our shuffle combining is currently relying on this a little to remove some bitcasts. This seems to enable some switching between shufps and shufd. Hopefully there's some way we can address this in the combining. Differential Revision: https://reviews.llvm.org/D49280 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@337590 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent a7a130c commit 34f6b25

File tree

8 files changed

+83
-153
lines changed

8 files changed

+83
-153
lines changed

lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -29400,13 +29400,13 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
2940029400
(AllowFloatDomain || !Subtarget.hasSSE41())) {
2940129401
std::swap(V1, V2);
2940229402
Shuffle = X86ISD::MOVSD;
29403-
SrcVT = DstVT = MaskVT;
29403+
SrcVT = DstVT = MVT::v2f64;
2940429404
return true;
2940529405
}
2940629406
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
2940729407
(AllowFloatDomain || !Subtarget.hasSSE41())) {
2940829408
Shuffle = X86ISD::MOVSS;
29409-
SrcVT = DstVT = MaskVT;
29409+
SrcVT = DstVT = MVT::v4f32;
2941029410
return true;
2941129411
}
2941229412
}
@@ -30715,28 +30715,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
3071530715
}
3071630716
}
3071730717

30718-
SDValue V0 = peekThroughBitcasts(N0);
30719-
SDValue V1 = peekThroughBitcasts(N1);
30720-
bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
30721-
bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
30722-
if (isZero0 && isZero1)
30723-
return SDValue();
30724-
30725-
// We often lower to MOVSD/MOVSS from integer as well as native float
30726-
// types; remove unnecessary domain-crossing bitcasts if we can to make it
30727-
// easier to combine shuffles later on. We've already accounted for the
30728-
// domain switching cost when we decided to lower with it.
30729-
bool isFloat = VT.isFloatingPoint();
30730-
bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
30731-
bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
30732-
if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
30733-
MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
30734-
: (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
30735-
V0 = DAG.getBitcast(NewVT, V0);
30736-
V1 = DAG.getBitcast(NewVT, V1);
30737-
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
30738-
}
30739-
3074030718
return SDValue();
3074130719
}
3074230720
case X86ISD::INSERTPS: {

lib/Target/X86/X86InstrAVX512.td

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4468,16 +4468,6 @@ let Predicates = [HasAVX512] in {
44684468
(VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
44694469
}
44704470

4471-
let Predicates = [HasAVX512, OptForSize] in {
4472-
// Shuffle with VMOVSS
4473-
def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
4474-
(VMOVSSZrr (v4i32 VR128X:$src1), VR128X:$src2)>;
4475-
4476-
// Shuffle with VMOVSD
4477-
def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
4478-
(VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
4479-
}
4480-
44814471
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
44824472
def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
44834473
(ins VR128X:$src),

lib/Target/X86/X86InstrFragmentsSIMD.td

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,8 @@ def X86insertqi : SDNode<"X86ISD::INSERTQI",
281281
def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
282282
def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
283283
SDTCisSameAs<0,2>]>;
284+
def SDTShuff2OpFP : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>,
285+
SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>;
284286

285287
def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
286288
SDTCisFP<0>, SDTCisInt<2>,
@@ -368,11 +370,11 @@ def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
368370
def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
369371
def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
370372

371-
def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>;
372-
def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
373+
def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2OpFP>;
374+
def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2OpFP>;
373375

374-
def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
375-
def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
376+
def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2OpFP>;
377+
def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2OpFP>;
376378

377379
def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
378380
SDTCisVec<1>, SDTCisInt<1>,

lib/Target/X86/X86InstrSSE.td

Lines changed: 0 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -317,14 +317,6 @@ let Predicates = [UseAVX, OptForSize] in {
317317
(v2i64 (VMOVSDrr (v2i64 (V_SET0)),
318318
(v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
319319
sub_xmm)>;
320-
321-
// Shuffle with VMOVSS
322-
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
323-
(VMOVSSrr VR128:$src1, VR128:$src2)>;
324-
325-
// Shuffle with VMOVSD
326-
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
327-
(VMOVSDrr VR128:$src1, VR128:$src2)>;
328320
}
329321

330322
let Predicates = [UseSSE1] in {
@@ -335,9 +327,6 @@ let Predicates = [UseSSE1] in {
335327
(MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
336328
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
337329
(MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
338-
// Shuffle with MOVSS
339-
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
340-
(MOVSSrr VR128:$src1, VR128:$src2)>;
341330
}
342331

343332
// MOVSSrm already zeros the high parts of the register.
@@ -364,12 +353,6 @@ let Predicates = [UseSSE2] in {
364353
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
365354
def : Pat<(v2f64 (X86vzload addr:$src)),
366355
(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
367-
368-
let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
369-
// Shuffle with MOVSD
370-
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
371-
(MOVSDrr VR128:$src1, VR128:$src2)>;
372-
}
373356
}
374357

375358
// Aliases to help the assembler pick two byte VEX encodings by swapping the
@@ -6427,25 +6410,13 @@ let Predicates = [HasAVX, OptForSpeed] in {
64276410
(VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
64286411
def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
64296412
(VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6430-
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
6431-
(VPBLENDWrri VR128:$src1, VR128:$src2, (i8 3))>;
6432-
def : Pat<(v4i32 (X86Movss VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
6433-
(VPBLENDWrmi VR128:$src1, addr:$src2, (i8 3))>;
6434-
def : Pat<(v4i32 (X86Movss (bc_v4i32 (loadv2i64 addr:$src2)), VR128:$src1)),
6435-
(VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xfc))>;
64366413

64376414
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
64386415
(VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
64396416
def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
64406417
(VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
64416418
def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
64426419
(VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6443-
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
6444-
(VPBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>;
6445-
def : Pat<(v2i64 (X86Movsd VR128:$src1, (loadv2i64 addr:$src2))),
6446-
(VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf))>;
6447-
def : Pat<(v2i64 (X86Movsd (loadv2i64 addr:$src2), VR128:$src1)),
6448-
(VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf0))>;
64496420

64506421
// Move low f32 and clear high bits.
64516422
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
@@ -6487,25 +6458,13 @@ let Predicates = [UseSSE41, OptForSpeed] in {
64876458
(BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
64886459
def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
64896460
(BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6490-
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
6491-
(PBLENDWrri VR128:$src1, VR128:$src2, (i8 3))>;
6492-
def : Pat<(v4i32 (X86Movss VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))),
6493-
(PBLENDWrmi VR128:$src1, addr:$src2, (i8 3))>;
6494-
def : Pat<(v4i32 (X86Movss (bc_v4i32 (memopv2i64 addr:$src2)), VR128:$src1)),
6495-
(PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xfc))>;
64966461

64976462
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
64986463
(BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
64996464
def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
65006465
(BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
65016466
def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
65026467
(BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6503-
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
6504-
(PBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>;
6505-
def : Pat<(v2i64 (X86Movsd VR128:$src1, (memopv2i64 addr:$src2))),
6506-
(PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf))>;
6507-
def : Pat<(v2i64 (X86Movsd (memopv2i64 addr:$src2), VR128:$src1)),
6508-
(PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf0))>;
65096468
}
65106469

65116470

test/CodeGen/X86/oddshuffles.ll

Lines changed: 35 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1277,46 +1277,44 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
12771277
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
12781278
; SSE2-LABEL: interleave_24i32_out:
12791279
; SSE2: # %bb.0:
1280-
; SSE2-NEXT: movups 80(%rdi), %xmm5
1281-
; SSE2-NEXT: movups 64(%rdi), %xmm8
1280+
; SSE2-NEXT: movups 80(%rdi), %xmm9
1281+
; SSE2-NEXT: movups 64(%rdi), %xmm10
12821282
; SSE2-NEXT: movups (%rdi), %xmm0
1283-
; SSE2-NEXT: movups 16(%rdi), %xmm6
1284-
; SSE2-NEXT: movups 32(%rdi), %xmm2
1285-
; SSE2-NEXT: movups 48(%rdi), %xmm1
1286-
; SSE2-NEXT: movaps %xmm1, %xmm3
1287-
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[2,3]
1288-
; SSE2-NEXT: movaps %xmm5, %xmm4
1289-
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[2,0]
1290-
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
1291-
; SSE2-NEXT: movaps %xmm0, %xmm4
1292-
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[2,3]
1293-
; SSE2-NEXT: movaps %xmm2, %xmm7
1294-
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[2,0]
1295-
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,0]
1296-
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
1297-
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
1298-
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[3,3]
1299-
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3]
1300-
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0]
1301-
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
1302-
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
1303-
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0]
1304-
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[3,3]
1305-
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,1,0,3]
1306-
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0]
1307-
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
1308-
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
1309-
; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
1310-
; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1]
1311-
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
1312-
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1313-
; SSE2-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
1283+
; SSE2-NEXT: movups 16(%rdi), %xmm11
1284+
; SSE2-NEXT: movups 32(%rdi), %xmm8
1285+
; SSE2-NEXT: movups 48(%rdi), %xmm2
1286+
; SSE2-NEXT: movaps %xmm2, %xmm3
1287+
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm10[2,3]
1288+
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
1289+
; SSE2-NEXT: movaps %xmm9, %xmm6
1290+
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,2,3]
1291+
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
1292+
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,3]
1293+
; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm3[2,0]
1294+
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0]
1295+
; SSE2-NEXT: movaps %xmm0, %xmm5
1296+
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm11[2,3]
1297+
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1298+
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,2,3]
1299+
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
1300+
; SSE2-NEXT: movaps %xmm8, %xmm4
1301+
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
1302+
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[2,0]
1303+
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
1304+
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
1305+
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[3,3]
1306+
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0]
1307+
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
1308+
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
1309+
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[3,3]
1310+
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0]
1311+
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,0]
13141312
; SSE2-NEXT: movups %xmm3, 16(%rsi)
1315-
; SSE2-NEXT: movups %xmm4, (%rsi)
1316-
; SSE2-NEXT: movups %xmm1, 16(%rdx)
1313+
; SSE2-NEXT: movups %xmm5, (%rsi)
1314+
; SSE2-NEXT: movups %xmm2, 16(%rdx)
13171315
; SSE2-NEXT: movups %xmm0, (%rdx)
1318-
; SSE2-NEXT: movupd %xmm7, 16(%rcx)
1319-
; SSE2-NEXT: movupd %xmm9, (%rcx)
1316+
; SSE2-NEXT: movups %xmm7, 16(%rcx)
1317+
; SSE2-NEXT: movups %xmm1, (%rcx)
13201318
; SSE2-NEXT: retq
13211319
;
13221320
; SSE42-LABEL: interleave_24i32_out:

test/CodeGen/X86/vector-shift-ashr-128.ll

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,16 +1233,17 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
12331233
; SSE2-NEXT: movdqa %xmm0, %xmm1
12341234
; SSE2-NEXT: psraw $4, %xmm1
12351235
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1236-
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
1236+
; SSE2-NEXT: movapd %xmm1, %xmm2
1237+
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
12371238
; SSE2-NEXT: psraw $2, %xmm1
12381239
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
1239-
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1240-
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
1241-
; SSE2-NEXT: movdqa %xmm2, %xmm1
1242-
; SSE2-NEXT: pand %xmm0, %xmm1
1240+
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1241+
; SSE2-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
1242+
; SSE2-NEXT: movaps %xmm2, %xmm0
1243+
; SSE2-NEXT: andps %xmm1, %xmm0
12431244
; SSE2-NEXT: psraw $1, %xmm2
1244-
; SSE2-NEXT: pandn %xmm2, %xmm0
1245-
; SSE2-NEXT: por %xmm1, %xmm0
1245+
; SSE2-NEXT: andnps %xmm2, %xmm1
1246+
; SSE2-NEXT: orps %xmm1, %xmm0
12461247
; SSE2-NEXT: retq
12471248
;
12481249
; SSE41-LABEL: constant_shift_v8i16:
@@ -1318,16 +1319,17 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
13181319
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
13191320
; X32-SSE-NEXT: psraw $4, %xmm1
13201321
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1321-
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
1322+
; X32-SSE-NEXT: movapd %xmm1, %xmm2
1323+
; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
13221324
; X32-SSE-NEXT: psraw $2, %xmm1
13231325
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
1324-
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1325-
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
1326-
; X32-SSE-NEXT: movdqa %xmm2, %xmm1
1327-
; X32-SSE-NEXT: pand %xmm0, %xmm1
1326+
; X32-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1327+
; X32-SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
1328+
; X32-SSE-NEXT: movaps %xmm2, %xmm0
1329+
; X32-SSE-NEXT: andps %xmm1, %xmm0
13281330
; X32-SSE-NEXT: psraw $1, %xmm2
1329-
; X32-SSE-NEXT: pandn %xmm2, %xmm0
1330-
; X32-SSE-NEXT: por %xmm1, %xmm0
1331+
; X32-SSE-NEXT: andnps %xmm2, %xmm1
1332+
; X32-SSE-NEXT: orps %xmm1, %xmm0
13311333
; X32-SSE-NEXT: retl
13321334
%shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
13331335
ret <8 x i16> %shift

test/CodeGen/X86/vector-shift-lshr-128.ll

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -993,16 +993,17 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
993993
; SSE2-NEXT: movdqa %xmm0, %xmm1
994994
; SSE2-NEXT: psrlw $4, %xmm1
995995
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
996-
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
996+
; SSE2-NEXT: movapd %xmm1, %xmm2
997+
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
997998
; SSE2-NEXT: psrlw $2, %xmm1
998999
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
999-
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1000-
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
1001-
; SSE2-NEXT: movdqa %xmm2, %xmm1
1002-
; SSE2-NEXT: pand %xmm0, %xmm1
1000+
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1001+
; SSE2-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
1002+
; SSE2-NEXT: movaps %xmm2, %xmm0
1003+
; SSE2-NEXT: andps %xmm1, %xmm0
10031004
; SSE2-NEXT: psrlw $1, %xmm2
1004-
; SSE2-NEXT: pandn %xmm2, %xmm0
1005-
; SSE2-NEXT: por %xmm1, %xmm0
1005+
; SSE2-NEXT: andnps %xmm2, %xmm1
1006+
; SSE2-NEXT: orps %xmm1, %xmm0
10061007
; SSE2-NEXT: retq
10071008
;
10081009
; SSE41-LABEL: constant_shift_v8i16:
@@ -1079,16 +1080,17 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
10791080
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
10801081
; X32-SSE-NEXT: psrlw $4, %xmm1
10811082
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1082-
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
1083+
; X32-SSE-NEXT: movapd %xmm1, %xmm2
1084+
; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
10831085
; X32-SSE-NEXT: psrlw $2, %xmm1
10841086
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
1085-
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1086-
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
1087-
; X32-SSE-NEXT: movdqa %xmm2, %xmm1
1088-
; X32-SSE-NEXT: pand %xmm0, %xmm1
1087+
; X32-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1088+
; X32-SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
1089+
; X32-SSE-NEXT: movaps %xmm2, %xmm0
1090+
; X32-SSE-NEXT: andps %xmm1, %xmm0
10891091
; X32-SSE-NEXT: psrlw $1, %xmm2
1090-
; X32-SSE-NEXT: pandn %xmm2, %xmm0
1091-
; X32-SSE-NEXT: por %xmm1, %xmm0
1092+
; X32-SSE-NEXT: andnps %xmm2, %xmm1
1093+
; X32-SSE-NEXT: orps %xmm1, %xmm0
10921094
; X32-SSE-NEXT: retl
10931095
%shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
10941096
ret <8 x i16> %shift

test/CodeGen/X86/vector-shuffle-128-v8.ll

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,8 +1248,8 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
12481248
; SSE2-LABEL: shuffle_v8i16_032dXXXX:
12491249
; SSE2: # %bb.0:
12501250
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1251-
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
1252-
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1251+
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,0]
1252+
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,6,7]
12531253
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
12541254
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
12551255
; SSE2-NEXT: retq
@@ -1403,8 +1403,8 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
14031403
; SSE2-LABEL: shuffle_v8i16_012dcde3:
14041404
; SSE2: # %bb.0:
14051405
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1406-
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,1]
1407-
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
1406+
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3,2,1]
1407+
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,1,2,0,4,5,6,7]
14081408
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
14091409
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
14101410
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
@@ -1542,11 +1542,10 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
15421542
define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
15431543
; SSE2-LABEL: shuffle_v8i16_XX4X8acX:
15441544
; SSE2: # %bb.0:
1545-
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
1546-
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
1547-
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
1548-
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
1549-
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1545+
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1546+
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1547+
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
1548+
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[2,3]
15501549
; SSE2-NEXT: retq
15511550
;
15521551
; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:

0 commit comments

Comments
 (0)