Skip to content

Commit 5bd374d

Browse files
committed
[X86] psadbw.ll - add AVX2 target test coverage
1 parent 12ade6f commit 5bd374d

File tree

1 file changed

+134
-90
lines changed

1 file changed

+134
-90
lines changed

llvm/test/CodeGen/X86/psadbw.ll

Lines changed: 134 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,44 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86
3-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64
2+
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
4+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
45

56
; PSADBW only sets the bottom 16 bits of each i64 result element - the upper 48 bits are zero,
; so the logical shift right by 48 below is expected to fold to an all-zeros vector.
67
define <2 x i64> @combine_psadbw_shift(<16 x i8> %0, <16 x i8> %1) nounwind {
7-
; CHECK-LABEL: combine_psadbw_shift:
8-
; CHECK: # %bb.0:
9-
; CHECK-NEXT: xorps %xmm0, %xmm0
10-
; CHECK-NEXT: ret{{[l|q]}}
8+
; SSE-LABEL: combine_psadbw_shift:
9+
; SSE: # %bb.0:
10+
; SSE-NEXT: xorps %xmm0, %xmm0
11+
; SSE-NEXT: ret{{[l|q]}}
12+
;
13+
; AVX2-LABEL: combine_psadbw_shift:
14+
; AVX2: # %bb.0:
15+
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
16+
; AVX2-NEXT: retq
1117
%3 = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, <16 x i8> %1)
1218
%4 = lshr <2 x i64> %3, <i64 48, i64 48>
1319
ret <2 x i64> %4
1420
}
1521

1622
; Propagate the demanded result elements to the 8 aliasing source elements.
1723
define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) nounwind {
18-
; X86-LABEL: combine_psadbw_demandedelt:
19-
; X86: # %bb.0:
20-
; X86-NEXT: psadbw %xmm1, %xmm0
21-
; X86-NEXT: movd %xmm0, %eax
22-
; X86-NEXT: xorl %edx, %edx
23-
; X86-NEXT: retl
24+
; X86-SSE-LABEL: combine_psadbw_demandedelt:
25+
; X86-SSE: # %bb.0:
26+
; X86-SSE-NEXT: psadbw %xmm1, %xmm0
27+
; X86-SSE-NEXT: movd %xmm0, %eax
28+
; X86-SSE-NEXT: xorl %edx, %edx
29+
; X86-SSE-NEXT: retl
30+
;
31+
; X64-SSE-LABEL: combine_psadbw_demandedelt:
32+
; X64-SSE: # %bb.0:
33+
; X64-SSE-NEXT: psadbw %xmm1, %xmm0
34+
; X64-SSE-NEXT: movq %xmm0, %rax
35+
; X64-SSE-NEXT: retq
2436
;
25-
; X64-LABEL: combine_psadbw_demandedelt:
26-
; X64: # %bb.0:
27-
; X64-NEXT: psadbw %xmm1, %xmm0
28-
; X64-NEXT: movq %xmm0, %rax
29-
; X64-NEXT: retq
37+
; AVX2-LABEL: combine_psadbw_demandedelt:
38+
; AVX2: # %bb.0:
39+
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
40+
; AVX2-NEXT: vmovq %xmm0, %rax
41+
; AVX2-NEXT: retq
3042
%3 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
3143
%4 = shufflevector <16 x i8> %1, <16 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
3244
%5 = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %3, <16 x i8> %4)
@@ -36,25 +48,33 @@ define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) nounwind {
3648

3749
; TODO: Each PSADBW source element has a maximum value of 3 - so the max sum-of-diffs for each <8 x i8> should be 24, which means the signed greater-than compare against 32 below can never be true.
3850
define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
39-
; X86-LABEL: combine_psadbw_cmp_knownbits:
40-
; X86: # %bb.0:
41-
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
42-
; X86-NEXT: pxor %xmm1, %xmm1
43-
; X86-NEXT: psadbw %xmm0, %xmm1
44-
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
45-
; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
46-
; X86-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
47-
; X86-NEXT: retl
51+
; X86-SSE-LABEL: combine_psadbw_cmp_knownbits:
52+
; X86-SSE: # %bb.0:
53+
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
54+
; X86-SSE-NEXT: pxor %xmm1, %xmm1
55+
; X86-SSE-NEXT: psadbw %xmm0, %xmm1
56+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
57+
; X86-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
58+
; X86-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
59+
; X86-SSE-NEXT: retl
60+
;
61+
; X64-SSE-LABEL: combine_psadbw_cmp_knownbits:
62+
; X64-SSE: # %bb.0:
63+
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
64+
; X64-SSE-NEXT: pxor %xmm1, %xmm1
65+
; X64-SSE-NEXT: psadbw %xmm0, %xmm1
66+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
67+
; X64-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
68+
; X64-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
69+
; X64-SSE-NEXT: retq
4870
;
49-
; X64-LABEL: combine_psadbw_cmp_knownbits:
50-
; X64: # %bb.0:
51-
; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
52-
; X64-NEXT: pxor %xmm1, %xmm1
53-
; X64-NEXT: psadbw %xmm0, %xmm1
54-
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
55-
; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
56-
; X64-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
57-
; X64-NEXT: retq
71+
; AVX2-LABEL: combine_psadbw_cmp_knownbits:
72+
; AVX2: # %bb.0:
73+
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
74+
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
75+
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
76+
; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
77+
; AVX2-NEXT: retq
5878
%mask = and <16 x i8> %a0, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
5979
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
6080
%cmp = icmp sgt <2 x i64> %sad, <i64 32, i64 32>
@@ -64,42 +84,53 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
6484

6585
; TODO: No need to scalarize the sitofp as the PSADBW results are smaller than i32.
6686
define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
67-
; X86-LABEL: combine_psadbw_sitofp_knownbits:
68-
; X86: # %bb.0:
69-
; X86-NEXT: pushl %ebp
70-
; X86-NEXT: movl %esp, %ebp
71-
; X86-NEXT: andl $-8, %esp
72-
; X86-NEXT: subl $32, %esp
73-
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
74-
; X86-NEXT: pxor %xmm1, %xmm1
75-
; X86-NEXT: psadbw %xmm0, %xmm1
76-
; X86-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
77-
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
78-
; X86-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
79-
; X86-NEXT: fildll {{[0-9]+}}(%esp)
80-
; X86-NEXT: fstpl {{[0-9]+}}(%esp)
81-
; X86-NEXT: fildll {{[0-9]+}}(%esp)
82-
; X86-NEXT: fstpl (%esp)
83-
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
84-
; X86-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
85-
; X86-NEXT: movl %ebp, %esp
86-
; X86-NEXT: popl %ebp
87-
; X86-NEXT: retl
87+
; X86-SSE-LABEL: combine_psadbw_sitofp_knownbits:
88+
; X86-SSE: # %bb.0:
89+
; X86-SSE-NEXT: pushl %ebp
90+
; X86-SSE-NEXT: movl %esp, %ebp
91+
; X86-SSE-NEXT: andl $-8, %esp
92+
; X86-SSE-NEXT: subl $32, %esp
93+
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
94+
; X86-SSE-NEXT: pxor %xmm1, %xmm1
95+
; X86-SSE-NEXT: psadbw %xmm0, %xmm1
96+
; X86-SSE-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
97+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
98+
; X86-SSE-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
99+
; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp)
100+
; X86-SSE-NEXT: fstpl {{[0-9]+}}(%esp)
101+
; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp)
102+
; X86-SSE-NEXT: fstpl (%esp)
103+
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
104+
; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
105+
; X86-SSE-NEXT: movl %ebp, %esp
106+
; X86-SSE-NEXT: popl %ebp
107+
; X86-SSE-NEXT: retl
88108
;
89-
; X64-LABEL: combine_psadbw_sitofp_knownbits:
90-
; X64: # %bb.0:
91-
; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
92-
; X64-NEXT: pxor %xmm1, %xmm1
93-
; X64-NEXT: psadbw %xmm0, %xmm1
94-
; X64-NEXT: movd %xmm1, %eax
95-
; X64-NEXT: xorps %xmm0, %xmm0
96-
; X64-NEXT: cvtsi2sd %eax, %xmm0
97-
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
98-
; X64-NEXT: movd %xmm1, %eax
99-
; X64-NEXT: xorps %xmm1, %xmm1
100-
; X64-NEXT: cvtsi2sd %eax, %xmm1
101-
; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
102-
; X64-NEXT: retq
109+
; X64-SSE-LABEL: combine_psadbw_sitofp_knownbits:
110+
; X64-SSE: # %bb.0:
111+
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
112+
; X64-SSE-NEXT: pxor %xmm1, %xmm1
113+
; X64-SSE-NEXT: psadbw %xmm0, %xmm1
114+
; X64-SSE-NEXT: movd %xmm1, %eax
115+
; X64-SSE-NEXT: xorps %xmm0, %xmm0
116+
; X64-SSE-NEXT: cvtsi2sd %eax, %xmm0
117+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
118+
; X64-SSE-NEXT: movd %xmm1, %eax
119+
; X64-SSE-NEXT: xorps %xmm1, %xmm1
120+
; X64-SSE-NEXT: cvtsi2sd %eax, %xmm1
121+
; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
122+
; X64-SSE-NEXT: retq
123+
;
124+
; AVX2-LABEL: combine_psadbw_sitofp_knownbits:
125+
; AVX2: # %bb.0:
126+
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
127+
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
128+
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
129+
; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm1
130+
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
131+
; AVX2-NEXT: vcvtsi2sd %eax, %xmm2, %xmm0
132+
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
133+
; AVX2-NEXT: retq
103134
%mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
104135
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
105136
%cvt = sitofp <2 x i64> %sad to <2 x double>
@@ -108,27 +139,40 @@ define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
108139

109140
; TODO: Convert from uitofp to sitofp as the PSADBW results are zero-extended.
110141
define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
111-
; X86-LABEL: combine_psadbw_uitofp_knownbits:
112-
; X86: # %bb.0:
113-
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
114-
; X86-NEXT: pxor %xmm1, %xmm1
115-
; X86-NEXT: psadbw %xmm1, %xmm0
116-
; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
117-
; X86-NEXT: movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
118-
; X86-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
119-
; X86-NEXT: addpd %xmm1, %xmm0
120-
; X86-NEXT: retl
142+
; X86-SSE-LABEL: combine_psadbw_uitofp_knownbits:
143+
; X86-SSE: # %bb.0:
144+
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
145+
; X86-SSE-NEXT: pxor %xmm1, %xmm1
146+
; X86-SSE-NEXT: psadbw %xmm1, %xmm0
147+
; X86-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
148+
; X86-SSE-NEXT: movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
149+
; X86-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
150+
; X86-SSE-NEXT: addpd %xmm1, %xmm0
151+
; X86-SSE-NEXT: retl
152+
;
153+
; X64-SSE-LABEL: combine_psadbw_uitofp_knownbits:
154+
; X64-SSE: # %bb.0:
155+
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
156+
; X64-SSE-NEXT: pxor %xmm1, %xmm1
157+
; X64-SSE-NEXT: psadbw %xmm1, %xmm0
158+
; X64-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
159+
; X64-SSE-NEXT: movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
160+
; X64-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
161+
; X64-SSE-NEXT: addpd %xmm1, %xmm0
162+
; X64-SSE-NEXT: retq
121163
;
122-
; X64-LABEL: combine_psadbw_uitofp_knownbits:
123-
; X64: # %bb.0:
124-
; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
125-
; X64-NEXT: pxor %xmm1, %xmm1
126-
; X64-NEXT: psadbw %xmm1, %xmm0
127-
; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
128-
; X64-NEXT: movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
129-
; X64-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
130-
; X64-NEXT: addpd %xmm1, %xmm0
131-
; X64-NEXT: retq
164+
; AVX2-LABEL: combine_psadbw_uitofp_knownbits:
165+
; AVX2: # %bb.0:
166+
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
167+
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
168+
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
169+
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
170+
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
171+
; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
172+
; AVX2-NEXT: # xmm1 = mem[0,0]
173+
; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
174+
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
175+
; AVX2-NEXT: retq
132176
%mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
133177
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
134178
%cvt = uitofp <2 x i64> %sad to <2 x double>

0 commit comments

Comments
 (0)