Skip to content

Commit c43ac32

Browse files
authored
[DAG] Expand vXi1 add/sub overflow operations as xor/and (#69191)
This is similar to what we already do for the add/sub and saturating add/sub variants. Scalar support will be added in a future patch that covers the other variants at the same time.
Alive2: https://alive2.llvm.org/ce/z/rBDrNE
Fixes #69080
1 parent 4ed0dfe commit c43ac32

File tree

7 files changed

+132
-144
lines changed

7 files changed

+132
-144
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9873,6 +9873,27 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
98739873
SDValue ZeroOverFlow = getConstant(0, DL, VTList.VTs[1]);
98749874
return getNode(ISD::MERGE_VALUES, DL, VTList, {N1, ZeroOverFlow}, Flags);
98759875
}
9876+
9877+
if (VTList.VTs[0].isVector() &&
9878+
VTList.VTs[0].getVectorElementType() == MVT::i1 &&
9879+
VTList.VTs[1].getVectorElementType() == MVT::i1) {
9880+
SDValue F1 = getFreeze(N1);
9881+
SDValue F2 = getFreeze(N2);
9882+
// {vXi1,vXi1} (u/s)addo(vXi1 x, vXi1 y) -> {xor(x,y),and(x,y)}
9883+
if (Opcode == ISD::UADDO || Opcode == ISD::SADDO)
9884+
return getNode(ISD::MERGE_VALUES, DL, VTList,
9885+
{getNode(ISD::XOR, DL, VTList.VTs[0], F1, F2),
9886+
getNode(ISD::AND, DL, VTList.VTs[1], F1, F2)},
9887+
Flags);
9888+
// {vXi1,vXi1} (u/s)subo(vXi1 x, vXi1 y) -> {xor(x,y),and(~x,y)}
9889+
if (Opcode == ISD::USUBO || Opcode == ISD::SSUBO) {
9890+
SDValue NotF1 = getNOT(DL, F1, VTList.VTs[0]);
9891+
return getNode(ISD::MERGE_VALUES, DL, VTList,
9892+
{getNode(ISD::XOR, DL, VTList.VTs[0], F1, F2),
9893+
getNode(ISD::AND, DL, VTList.VTs[1], NotF1, F2)},
9894+
Flags);
9895+
}
9896+
}
98769897
break;
98779898
}
98789899
case ISD::SMUL_LOHI:

llvm/test/CodeGen/AArch64/vec_uaddo.ll

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -245,21 +245,17 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
245245
define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
246246
; CHECK-LABEL: uaddo_v4i1:
247247
; CHECK: // %bb.0:
248-
; CHECK-NEXT: movi v2.4h, #1
248+
; CHECK-NEXT: eor v2.8b, v0.8b, v1.8b
249+
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
249250
; CHECK-NEXT: adrp x8, .LCPI10_0
251+
; CHECK-NEXT: shl v2.4h, v2.4h, #15
252+
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
253+
; CHECK-NEXT: cmlt v1.4h, v2.4h, #0
254+
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0]
255+
; CHECK-NEXT: shl v0.4s, v0.4s, #31
250256
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
251-
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
252-
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
253-
; CHECK-NEXT: fmov d1, d0
254-
; CHECK-NEXT: shl v2.4h, v0.4h, #15
255-
; CHECK-NEXT: cmlt v2.4h, v2.4h, #0
256-
; CHECK-NEXT: bic v1.4h, #2
257-
; CHECK-NEXT: cmeq v0.4h, v1.4h, v0.4h
258-
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0]
259-
; CHECK-NEXT: and v1.8b, v2.8b, v1.8b
260-
; CHECK-NEXT: mvn v0.8b, v0.8b
257+
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
261258
; CHECK-NEXT: addv h1, v1.4h
262-
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
263259
; CHECK-NEXT: fmov w8, s1
264260
; CHECK-NEXT: strb w8, [x0]
265261
; CHECK-NEXT: ret

llvm/test/CodeGen/X86/pr69080.ll

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2+
; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=SSE
3+
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
4+
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=AVX
5+
6+
define { <4 x i1>, <4 x i1> } @uaddo(<4 x i1> %a) {
7+
; SSE-LABEL: uaddo:
8+
; SSE: # %bb.0:
9+
; SSE-NEXT: movaps %xmm0, %xmm1
10+
; SSE-NEXT: xorps %xmm0, %xmm0
11+
; SSE-NEXT: retq
12+
;
13+
; AVX-LABEL: uaddo:
14+
; AVX: # %bb.0:
15+
; AVX-NEXT: vmovaps %xmm0, %xmm1
16+
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
17+
; AVX-NEXT: retq
18+
%f = call { <4 x i1>, <4 x i1> } @llvm.uadd.with.overflow.v4i1(<4 x i1> %a, <4 x i1> %a)
19+
ret { <4 x i1>, <4 x i1> } %f
20+
}
21+
declare { <4 x i1>, <4 x i1> } @llvm.uadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
22+
23+
define { <4 x i1>, <4 x i1> } @saddo(<4 x i1> %a) {
24+
; SSE-LABEL: saddo:
25+
; SSE: # %bb.0:
26+
; SSE-NEXT: movaps %xmm0, %xmm1
27+
; SSE-NEXT: xorps %xmm0, %xmm0
28+
; SSE-NEXT: retq
29+
;
30+
; AVX-LABEL: saddo:
31+
; AVX: # %bb.0:
32+
; AVX-NEXT: vmovaps %xmm0, %xmm1
33+
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
34+
; AVX-NEXT: retq
35+
%f = call { <4 x i1>, <4 x i1> } @llvm.sadd.with.overflow.v4i1(<4 x i1> %a, <4 x i1> %a)
36+
ret { <4 x i1>, <4 x i1> } %f
37+
}
38+
declare { <4 x i1>, <4 x i1> } @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)

llvm/test/CodeGen/X86/vec_saddo.ll

Lines changed: 12 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -976,46 +976,35 @@ define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
976976
define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
977977
; SSE-LABEL: saddo_v4i1:
978978
; SSE: # %bb.0:
979-
; SSE-NEXT: pslld $31, %xmm1
980-
; SSE-NEXT: psrad $31, %xmm1
979+
; SSE-NEXT: movdqa %xmm0, %xmm2
980+
; SSE-NEXT: pxor %xmm1, %xmm2
981+
; SSE-NEXT: pslld $31, %xmm2
982+
; SSE-NEXT: movmskps %xmm2, %eax
983+
; SSE-NEXT: pand %xmm1, %xmm0
981984
; SSE-NEXT: pslld $31, %xmm0
982985
; SSE-NEXT: psrad $31, %xmm0
983-
; SSE-NEXT: paddd %xmm1, %xmm0
984-
; SSE-NEXT: movdqa %xmm0, %xmm1
985-
; SSE-NEXT: pslld $31, %xmm1
986-
; SSE-NEXT: movmskps %xmm1, %eax
987-
; SSE-NEXT: psrad $31, %xmm1
988-
; SSE-NEXT: pcmpeqd %xmm1, %xmm0
989-
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
990-
; SSE-NEXT: pxor %xmm1, %xmm0
991986
; SSE-NEXT: movb %al, (%rdi)
992987
; SSE-NEXT: retq
993988
;
994989
; AVX-LABEL: saddo_v4i1:
995990
; AVX: # %bb.0:
996-
; AVX-NEXT: vpslld $31, %xmm1, %xmm1
997-
; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
991+
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
992+
; AVX-NEXT: vpslld $31, %xmm2, %xmm2
993+
; AVX-NEXT: vmovmskps %xmm2, %eax
994+
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
998995
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
999996
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
1000-
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1001-
; AVX-NEXT: vpslld $31, %xmm0, %xmm1
1002-
; AVX-NEXT: vpsrad $31, %xmm1, %xmm2
1003-
; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
1004-
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1005-
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
1006-
; AVX-NEXT: vmovmskps %xmm1, %eax
1007997
; AVX-NEXT: movb %al, (%rdi)
1008998
; AVX-NEXT: retq
1009999
;
10101000
; AVX512-LABEL: saddo_v4i1:
10111001
; AVX512: # %bb.0:
10121002
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
10131003
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
1014-
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
1015-
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
1004+
; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
1005+
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
10161006
; AVX512-NEXT: kxorw %k1, %k0, %k2
1017-
; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 {%k2}
1018-
; AVX512-NEXT: kxorw %k0, %k1, %k1
1007+
; AVX512-NEXT: kandw %k1, %k0, %k1
10191008
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
10201009
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
10211010
; AVX512-NEXT: kshiftlw $12, %k2, %k0

llvm/test/CodeGen/X86/vec_ssubo.ll

Lines changed: 13 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -985,34 +985,24 @@ define <4 x i32> @ssubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
985985
define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
986986
; SSE-LABEL: ssubo_v4i1:
987987
; SSE: # %bb.0:
988-
; SSE-NEXT: pslld $31, %xmm1
989-
; SSE-NEXT: psrad $31, %xmm1
988+
; SSE-NEXT: movdqa %xmm0, %xmm2
989+
; SSE-NEXT: pxor %xmm1, %xmm2
990+
; SSE-NEXT: pslld $31, %xmm2
991+
; SSE-NEXT: movmskps %xmm2, %eax
992+
; SSE-NEXT: pandn %xmm1, %xmm0
990993
; SSE-NEXT: pslld $31, %xmm0
991994
; SSE-NEXT: psrad $31, %xmm0
992-
; SSE-NEXT: psubd %xmm1, %xmm0
993-
; SSE-NEXT: movdqa %xmm0, %xmm1
994-
; SSE-NEXT: pslld $31, %xmm1
995-
; SSE-NEXT: movmskps %xmm1, %eax
996-
; SSE-NEXT: psrad $31, %xmm1
997-
; SSE-NEXT: pcmpeqd %xmm1, %xmm0
998-
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
999-
; SSE-NEXT: pxor %xmm1, %xmm0
1000995
; SSE-NEXT: movb %al, (%rdi)
1001996
; SSE-NEXT: retq
1002997
;
1003998
; AVX-LABEL: ssubo_v4i1:
1004999
; AVX: # %bb.0:
1005-
; AVX-NEXT: vpslld $31, %xmm1, %xmm1
1006-
; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
1000+
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
1001+
; AVX-NEXT: vpslld $31, %xmm2, %xmm2
1002+
; AVX-NEXT: vmovmskps %xmm2, %eax
1003+
; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
10071004
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
10081005
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
1009-
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
1010-
; AVX-NEXT: vpslld $31, %xmm0, %xmm1
1011-
; AVX-NEXT: vpsrad $31, %xmm1, %xmm2
1012-
; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
1013-
; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1014-
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
1015-
; AVX-NEXT: vmovmskps %xmm1, %eax
10161006
; AVX-NEXT: movb %al, (%rdi)
10171007
; AVX-NEXT: retq
10181008
;
@@ -1022,11 +1012,11 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
10221012
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
10231013
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
10241014
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
1025-
; AVX512-NEXT: kxorw %k1, %k0, %k1
1026-
; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1}
1015+
; AVX512-NEXT: kxorw %k1, %k0, %k0
1016+
; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
10271017
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1028-
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
1029-
; AVX512-NEXT: kshiftlw $12, %k1, %k0
1018+
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1019+
; AVX512-NEXT: kshiftlw $12, %k0, %k0
10301020
; AVX512-NEXT: kshiftrw $12, %k0, %k0
10311021
; AVX512-NEXT: kmovd %k0, %eax
10321022
; AVX512-NEXT: movb %al, (%rdi)

llvm/test/CodeGen/X86/vec_uaddo.ll

Lines changed: 20 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,61 +1075,38 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
10751075
define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
10761076
; SSE-LABEL: uaddo_v4i1:
10771077
; SSE: # %bb.0:
1078-
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
1079-
; SSE-NEXT: pand %xmm2, %xmm1
1080-
; SSE-NEXT: pand %xmm2, %xmm0
1081-
; SSE-NEXT: paddd %xmm1, %xmm0
1082-
; SSE-NEXT: pand %xmm0, %xmm2
1083-
; SSE-NEXT: pcmpeqd %xmm0, %xmm2
1084-
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
1085-
; SSE-NEXT: pxor %xmm2, %xmm1
1078+
; SSE-NEXT: movdqa %xmm0, %xmm2
1079+
; SSE-NEXT: pxor %xmm1, %xmm2
1080+
; SSE-NEXT: pslld $31, %xmm2
1081+
; SSE-NEXT: movmskps %xmm2, %eax
1082+
; SSE-NEXT: pand %xmm1, %xmm0
10861083
; SSE-NEXT: pslld $31, %xmm0
1087-
; SSE-NEXT: movmskps %xmm0, %eax
1084+
; SSE-NEXT: psrad $31, %xmm0
10881085
; SSE-NEXT: movb %al, (%rdi)
1089-
; SSE-NEXT: movdqa %xmm1, %xmm0
10901086
; SSE-NEXT: retq
10911087
;
1092-
; AVX1-LABEL: uaddo_v4i1:
1093-
; AVX1: # %bb.0:
1094-
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
1095-
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1096-
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1097-
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1098-
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
1099-
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
1100-
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1101-
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
1102-
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
1103-
; AVX1-NEXT: vmovmskps %xmm1, %eax
1104-
; AVX1-NEXT: movb %al, (%rdi)
1105-
; AVX1-NEXT: retq
1106-
;
1107-
; AVX2-LABEL: uaddo_v4i1:
1108-
; AVX2: # %bb.0:
1109-
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
1110-
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
1111-
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
1112-
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1113-
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
1114-
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
1115-
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1116-
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
1117-
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
1118-
; AVX2-NEXT: vmovmskps %xmm1, %eax
1119-
; AVX2-NEXT: movb %al, (%rdi)
1120-
; AVX2-NEXT: retq
1088+
; AVX-LABEL: uaddo_v4i1:
1089+
; AVX: # %bb.0:
1090+
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
1091+
; AVX-NEXT: vpslld $31, %xmm2, %xmm2
1092+
; AVX-NEXT: vmovmskps %xmm2, %eax
1093+
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
1094+
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
1095+
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
1096+
; AVX-NEXT: movb %al, (%rdi)
1097+
; AVX-NEXT: retq
11211098
;
11221099
; AVX512-LABEL: uaddo_v4i1:
11231100
; AVX512: # %bb.0:
11241101
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
11251102
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
11261103
; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
11271104
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
1128-
; AVX512-NEXT: kxorw %k1, %k0, %k1
1129-
; AVX512-NEXT: kandnw %k0, %k1, %k2
1105+
; AVX512-NEXT: kxorw %k1, %k0, %k2
1106+
; AVX512-NEXT: kandw %k1, %k0, %k1
11301107
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1131-
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
1132-
; AVX512-NEXT: kshiftlw $12, %k1, %k0
1108+
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1109+
; AVX512-NEXT: kshiftlw $12, %k2, %k0
11331110
; AVX512-NEXT: kshiftrw $12, %k0, %k0
11341111
; AVX512-NEXT: kmovd %k0, %eax
11351112
; AVX512-NEXT: movb %al, (%rdi)

llvm/test/CodeGen/X86/vec_usubo.ll

Lines changed: 20 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1122,61 +1122,38 @@ define <4 x i32> @usubo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
11221122
define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
11231123
; SSE-LABEL: usubo_v4i1:
11241124
; SSE: # %bb.0:
1125-
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
1126-
; SSE-NEXT: pand %xmm2, %xmm1
1127-
; SSE-NEXT: pand %xmm2, %xmm0
1128-
; SSE-NEXT: psubd %xmm1, %xmm0
1129-
; SSE-NEXT: pand %xmm0, %xmm2
1130-
; SSE-NEXT: pcmpeqd %xmm0, %xmm2
1131-
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
1132-
; SSE-NEXT: pxor %xmm2, %xmm1
1125+
; SSE-NEXT: movdqa %xmm0, %xmm2
1126+
; SSE-NEXT: pxor %xmm1, %xmm2
1127+
; SSE-NEXT: pslld $31, %xmm2
1128+
; SSE-NEXT: movmskps %xmm2, %eax
1129+
; SSE-NEXT: pandn %xmm1, %xmm0
11331130
; SSE-NEXT: pslld $31, %xmm0
1134-
; SSE-NEXT: movmskps %xmm0, %eax
1131+
; SSE-NEXT: psrad $31, %xmm0
11351132
; SSE-NEXT: movb %al, (%rdi)
1136-
; SSE-NEXT: movdqa %xmm1, %xmm0
11371133
; SSE-NEXT: retq
11381134
;
1139-
; AVX1-LABEL: usubo_v4i1:
1140-
; AVX1: # %bb.0:
1141-
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
1142-
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1143-
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1144-
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
1145-
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0
1146-
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
1147-
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1148-
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
1149-
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
1150-
; AVX1-NEXT: vmovmskps %xmm1, %eax
1151-
; AVX1-NEXT: movb %al, (%rdi)
1152-
; AVX1-NEXT: retq
1153-
;
1154-
; AVX2-LABEL: usubo_v4i1:
1155-
; AVX2: # %bb.0:
1156-
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
1157-
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
1158-
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
1159-
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
1160-
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm0
1161-
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
1162-
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1163-
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
1164-
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
1165-
; AVX2-NEXT: vmovmskps %xmm1, %eax
1166-
; AVX2-NEXT: movb %al, (%rdi)
1167-
; AVX2-NEXT: retq
1135+
; AVX-LABEL: usubo_v4i1:
1136+
; AVX: # %bb.0:
1137+
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm2
1138+
; AVX-NEXT: vpslld $31, %xmm2, %xmm2
1139+
; AVX-NEXT: vmovmskps %xmm2, %eax
1140+
; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
1141+
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
1142+
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
1143+
; AVX-NEXT: movb %al, (%rdi)
1144+
; AVX-NEXT: retq
11681145
;
11691146
; AVX512-LABEL: usubo_v4i1:
11701147
; AVX512: # %bb.0:
11711148
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
11721149
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
11731150
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
11741151
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
1175-
; AVX512-NEXT: kxorw %k1, %k0, %k1
1176-
; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k2 {%k1}
1152+
; AVX512-NEXT: kxorw %k1, %k0, %k0
1153+
; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
11771154
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1178-
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
1179-
; AVX512-NEXT: kshiftlw $12, %k1, %k0
1155+
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1156+
; AVX512-NEXT: kshiftlw $12, %k0, %k0
11801157
; AVX512-NEXT: kshiftrw $12, %k0, %k0
11811158
; AVX512-NEXT: kmovd %k0, %eax
11821159
; AVX512-NEXT: movb %al, (%rdi)

0 commit comments

Comments
 (0)