Skip to content

Commit b937669

Browse files
committed
[X86] Improve lowering of v2i64 sign bit tests on pre-sse4.2 targets
Without SSE4.2, a v2i64 setlt has to be expanded into a pcmpgtd, a pcmpeqd, 3 shuffles, and 2 logic ops. But if we are only interested in the sign bit of each i64 element, we can use a single v4i32 pcmpgtd against zero and then shuffle the odd 32-bit elements (which hold the sign bits of the i64 elements) over the even elements.

Differential Revision: https://reviews.llvm.org/D72302
1 parent 2f1e5d9 commit b937669

File tree

6 files changed

+963
-1456
lines changed

6 files changed

+963
-1456
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21584,6 +21584,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
2158421584
if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
2158521585
assert(Subtarget.hasSSE2() && "Don't know how to lower!");
2158621586

21587+
// Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
21588+
// the odd elements over the even elements.
21589+
if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
21590+
Op0 = DAG.getConstant(0, dl, MVT::v4i32);
21591+
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
21592+
21593+
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
21594+
static const int MaskHi[] = { 1, 1, 3, 3 };
21595+
SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
21596+
21597+
return DAG.getBitcast(VT, Result);
21598+
}
21599+
2158721600
// Since SSE has no unsigned integer comparisons, we need to flip the sign
2158821601
// bits of the inputs before performing those operations. The lower
2158921602
// compare is always unsigned.

llvm/test/CodeGen/X86/bitcast-vector-bool.ll

Lines changed: 5 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -429,48 +429,11 @@ define i16 @bitcast_v32i8_to_v2i16(<32 x i8> %a0) nounwind {
429429
define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
430430
; SSE2-SSSE3-LABEL: bitcast_v8i64_to_v2i4:
431431
; SSE2-SSSE3: # %bb.0:
432-
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
433-
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3
434-
; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5
435-
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
436-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
437-
; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
438-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
439-
; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3
440-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
441-
; SSE2-SSSE3-NEXT: por %xmm3, %xmm5
442-
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm2
443-
; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm3
444-
; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
445-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
446-
; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
447-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
448-
; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7
449-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
450-
; SSE2-SSSE3-NEXT: por %xmm7, %xmm2
451-
; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm2
452-
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1
453-
; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm3
454-
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
455-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
456-
; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm1
457-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
458-
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
459-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
460-
; SSE2-SSSE3-NEXT: por %xmm1, %xmm3
461-
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0
462-
; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1
463-
; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
464-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
465-
; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm0
466-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
467-
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0
468-
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
469-
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
470-
; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm1
471-
; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm1
472-
; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm1
473-
; SSE2-SSSE3-NEXT: pmovmskb %xmm1, %eax
432+
; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
433+
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
434+
; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
435+
; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
436+
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
474437
; SSE2-SSSE3-NEXT: movzbl %al, %ecx
475438
; SSE2-SSSE3-NEXT: shrl $4, %ecx
476439
; SSE2-SSSE3-NEXT: movq %rcx, %xmm0

llvm/test/CodeGen/X86/movmsk-cmp.ll

Lines changed: 10 additions & 84 deletions
Original file line number | Diff line number | Diff line change
@@ -1015,48 +1015,11 @@ define i1 @allzeros_v4i64_sign(<4 x i64> %arg) {
10151015
define i1 @allones_v8i64_sign(<8 x i64> %arg) {
10161016
; SSE2-LABEL: allones_v8i64_sign:
10171017
; SSE2: # %bb.0:
1018-
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
1019-
; SSE2-NEXT: pxor %xmm4, %xmm3
1020-
; SSE2-NEXT: movdqa %xmm4, %xmm5
1021-
; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
1022-
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
1023-
; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
1024-
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1025-
; SSE2-NEXT: pand %xmm6, %xmm3
1026-
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1027-
; SSE2-NEXT: por %xmm3, %xmm5
1028-
; SSE2-NEXT: pxor %xmm4, %xmm2
1029-
; SSE2-NEXT: movdqa %xmm4, %xmm3
1030-
; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
1031-
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
1032-
; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
1033-
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
1034-
; SSE2-NEXT: pand %xmm6, %xmm7
1035-
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
1036-
; SSE2-NEXT: por %xmm7, %xmm2
1037-
; SSE2-NEXT: packssdw %xmm5, %xmm2
1038-
; SSE2-NEXT: pxor %xmm4, %xmm1
1039-
; SSE2-NEXT: movdqa %xmm4, %xmm3
1040-
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
1041-
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
1042-
; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
1043-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1044-
; SSE2-NEXT: pand %xmm5, %xmm1
1045-
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1046-
; SSE2-NEXT: por %xmm1, %xmm3
1047-
; SSE2-NEXT: pxor %xmm4, %xmm0
1048-
; SSE2-NEXT: movdqa %xmm4, %xmm1
1049-
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
1050-
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
1051-
; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
1052-
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1053-
; SSE2-NEXT: pand %xmm5, %xmm0
1054-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1055-
; SSE2-NEXT: por %xmm0, %xmm1
1056-
; SSE2-NEXT: packssdw %xmm3, %xmm1
1057-
; SSE2-NEXT: packssdw %xmm2, %xmm1
1058-
; SSE2-NEXT: packsswb %xmm0, %xmm1
1059-
; SSE2-NEXT: pmovmskb %xmm1, %eax
1018+
; SSE2-NEXT: packssdw %xmm3, %xmm2
1019+
; SSE2-NEXT: packssdw %xmm1, %xmm0
1020+
; SSE2-NEXT: packssdw %xmm2, %xmm0
1021+
; SSE2-NEXT: packsswb %xmm0, %xmm0
1022+
; SSE2-NEXT: pmovmskb %xmm0, %eax
10601023
; SSE2-NEXT: cmpb $-1, %al
10611024
; SSE2-NEXT: sete %al
10621025
; SSE2-NEXT: retq
@@ -1113,48 +1076,11 @@ define i1 @allones_v8i64_sign(<8 x i64> %arg) {
11131076
define i1 @allzeros_v8i64_sign(<8 x i64> %arg) {
11141077
; SSE2-LABEL: allzeros_v8i64_sign:
11151078
; SSE2: # %bb.0:
1116-
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
1117-
; SSE2-NEXT: pxor %xmm4, %xmm3
1118-
; SSE2-NEXT: movdqa %xmm4, %xmm5
1119-
; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
1120-
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
1121-
; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
1122-
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1123-
; SSE2-NEXT: pand %xmm6, %xmm3
1124-
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1125-
; SSE2-NEXT: por %xmm3, %xmm5
1126-
; SSE2-NEXT: pxor %xmm4, %xmm2
1127-
; SSE2-NEXT: movdqa %xmm4, %xmm3
1128-
; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
1129-
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
1130-
; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
1131-
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
1132-
; SSE2-NEXT: pand %xmm6, %xmm7
1133-
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
1134-
; SSE2-NEXT: por %xmm7, %xmm2
1135-
; SSE2-NEXT: packssdw %xmm5, %xmm2
1136-
; SSE2-NEXT: pxor %xmm4, %xmm1
1137-
; SSE2-NEXT: movdqa %xmm4, %xmm3
1138-
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
1139-
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
1140-
; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
1141-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1142-
; SSE2-NEXT: pand %xmm5, %xmm1
1143-
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1144-
; SSE2-NEXT: por %xmm1, %xmm3
1145-
; SSE2-NEXT: pxor %xmm4, %xmm0
1146-
; SSE2-NEXT: movdqa %xmm4, %xmm1
1147-
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
1148-
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
1149-
; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
1150-
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
1151-
; SSE2-NEXT: pand %xmm5, %xmm0
1152-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1153-
; SSE2-NEXT: por %xmm0, %xmm1
1154-
; SSE2-NEXT: packssdw %xmm3, %xmm1
1155-
; SSE2-NEXT: packssdw %xmm2, %xmm1
1156-
; SSE2-NEXT: packsswb %xmm0, %xmm1
1157-
; SSE2-NEXT: pmovmskb %xmm1, %eax
1079+
; SSE2-NEXT: packssdw %xmm3, %xmm2
1080+
; SSE2-NEXT: packssdw %xmm1, %xmm0
1081+
; SSE2-NEXT: packssdw %xmm2, %xmm0
1082+
; SSE2-NEXT: packsswb %xmm0, %xmm0
1083+
; SSE2-NEXT: pmovmskb %xmm0, %eax
11581084
; SSE2-NEXT: testb %al, %al
11591085
; SSE2-NEXT: sete %al
11601086
; SSE2-NEXT: retq

0 commit comments

Comments
 (0)