; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
- ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86
- ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64
+ ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
+ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
+ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2

; Only bottom 16 bits are set - upper 48 bits are zero.
define <2 x i64> @combine_psadbw_shift(<16 x i8> %0, <16 x i8> %1) nounwind {
- ; CHECK-LABEL: combine_psadbw_shift:
- ; CHECK: # %bb.0:
- ; CHECK-NEXT: xorps %xmm0, %xmm0
- ; CHECK-NEXT: ret{{[l|q]}}
+ ; SSE-LABEL: combine_psadbw_shift:
+ ; SSE: # %bb.0:
+ ; SSE-NEXT: xorps %xmm0, %xmm0
+ ; SSE-NEXT: ret{{[l|q]}}
+ ;
+ ; AVX2-LABEL: combine_psadbw_shift:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+ ; AVX2-NEXT: retq
  %3 = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, <16 x i8> %1)
  %4 = lshr <2 x i64> %3, <i64 48, i64 48>
  ret <2 x i64> %4
}

; Propagate the demanded result elements to the 8 aliasing source elements.
define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) nounwind {
- ; X86-LABEL: combine_psadbw_demandedelt:
- ; X86: # %bb.0:
- ; X86-NEXT: psadbw %xmm1, %xmm0
- ; X86-NEXT: movd %xmm0, %eax
- ; X86-NEXT: xorl %edx, %edx
- ; X86-NEXT: retl
+ ; X86-SSE-LABEL: combine_psadbw_demandedelt:
+ ; X86-SSE: # %bb.0:
+ ; X86-SSE-NEXT: psadbw %xmm1, %xmm0
+ ; X86-SSE-NEXT: movd %xmm0, %eax
+ ; X86-SSE-NEXT: xorl %edx, %edx
+ ; X86-SSE-NEXT: retl
+ ;
+ ; X64-SSE-LABEL: combine_psadbw_demandedelt:
+ ; X64-SSE: # %bb.0:
+ ; X64-SSE-NEXT: psadbw %xmm1, %xmm0
+ ; X64-SSE-NEXT: movq %xmm0, %rax
+ ; X64-SSE-NEXT: retq
;
- ; X64-LABEL: combine_psadbw_demandedelt:
- ; X64: # %bb.0:
- ; X64-NEXT: psadbw %xmm1, %xmm0
- ; X64-NEXT: movq %xmm0, %rax
- ; X64-NEXT: retq
+ ; AVX2-LABEL: combine_psadbw_demandedelt:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+ ; AVX2-NEXT: vmovq %xmm0, %rax
+ ; AVX2-NEXT: retq
  %3 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
  %4 = shufflevector <16 x i8> %1, <16 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
  %5 = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %3, <16 x i8> %4)
@@ -36,25 +48,33 @@ define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) nounwind {

; TODO: Each PSADBW source element has a maximum value of 3 - so max sum-of-diffs for each <8 x i8> should be 24.
define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
- ; X86-LABEL: combine_psadbw_cmp_knownbits:
- ; X86: # %bb.0:
- ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
- ; X86-NEXT: pxor %xmm1, %xmm1
- ; X86-NEXT: psadbw %xmm0, %xmm1
- ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
- ; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
- ; X86-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
- ; X86-NEXT: retl
+ ; X86-SSE-LABEL: combine_psadbw_cmp_knownbits:
+ ; X86-SSE: # %bb.0:
+ ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+ ; X86-SSE-NEXT: pxor %xmm1, %xmm1
+ ; X86-SSE-NEXT: psadbw %xmm0, %xmm1
+ ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+ ; X86-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+ ; X86-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+ ; X86-SSE-NEXT: retl
+ ;
+ ; X64-SSE-LABEL: combine_psadbw_cmp_knownbits:
+ ; X64-SSE: # %bb.0:
+ ; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+ ; X64-SSE-NEXT: pxor %xmm1, %xmm1
+ ; X64-SSE-NEXT: psadbw %xmm0, %xmm1
+ ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
+ ; X64-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+ ; X64-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+ ; X64-SSE-NEXT: retq
;
- ; X64-LABEL: combine_psadbw_cmp_knownbits:
- ; X64: # %bb.0:
- ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
- ; X64-NEXT: pxor %xmm1, %xmm1
- ; X64-NEXT: psadbw %xmm0, %xmm1
- ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
- ; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
- ; X64-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
- ; X64-NEXT: retq
+ ; AVX2-LABEL: combine_psadbw_cmp_knownbits:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+ ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+ ; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+ ; AVX2-NEXT: retq
  %mask = and <16 x i8> %a0, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
  %cmp = icmp sgt <2 x i64> %sad, <i64 32, i64 32>
@@ -64,42 +84,53 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {

; TODO: No need to scalarize the sitofp as the PSADBW results are smaller than i32.
define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
- ; X86-LABEL: combine_psadbw_sitofp_knownbits:
- ; X86: # %bb.0:
- ; X86-NEXT: pushl %ebp
- ; X86-NEXT: movl %esp, %ebp
- ; X86-NEXT: andl $-8, %esp
- ; X86-NEXT: subl $32, %esp
- ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
- ; X86-NEXT: pxor %xmm1, %xmm1
- ; X86-NEXT: psadbw %xmm0, %xmm1
- ; X86-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
- ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
- ; X86-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
- ; X86-NEXT: fildll {{[0-9]+}}(%esp)
- ; X86-NEXT: fstpl {{[0-9]+}}(%esp)
- ; X86-NEXT: fildll {{[0-9]+}}(%esp)
- ; X86-NEXT: fstpl (%esp)
- ; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
- ; X86-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
- ; X86-NEXT: movl %ebp, %esp
- ; X86-NEXT: popl %ebp
- ; X86-NEXT: retl
+ ; X86-SSE-LABEL: combine_psadbw_sitofp_knownbits:
+ ; X86-SSE: # %bb.0:
+ ; X86-SSE-NEXT: pushl %ebp
+ ; X86-SSE-NEXT: movl %esp, %ebp
+ ; X86-SSE-NEXT: andl $-8, %esp
+ ; X86-SSE-NEXT: subl $32, %esp
+ ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+ ; X86-SSE-NEXT: pxor %xmm1, %xmm1
+ ; X86-SSE-NEXT: psadbw %xmm0, %xmm1
+ ; X86-SSE-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
+ ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+ ; X86-SSE-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
+ ; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp)
+ ; X86-SSE-NEXT: fstpl {{[0-9]+}}(%esp)
+ ; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp)
+ ; X86-SSE-NEXT: fstpl (%esp)
+ ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+ ; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+ ; X86-SSE-NEXT: movl %ebp, %esp
+ ; X86-SSE-NEXT: popl %ebp
+ ; X86-SSE-NEXT: retl
;
- ; X64-LABEL: combine_psadbw_sitofp_knownbits:
- ; X64: # %bb.0:
- ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
- ; X64-NEXT: pxor %xmm1, %xmm1
- ; X64-NEXT: psadbw %xmm0, %xmm1
- ; X64-NEXT: movd %xmm1, %eax
- ; X64-NEXT: xorps %xmm0, %xmm0
- ; X64-NEXT: cvtsi2sd %eax, %xmm0
- ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
- ; X64-NEXT: movd %xmm1, %eax
- ; X64-NEXT: xorps %xmm1, %xmm1
- ; X64-NEXT: cvtsi2sd %eax, %xmm1
- ; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X64-NEXT: retq
+ ; X64-SSE-LABEL: combine_psadbw_sitofp_knownbits:
+ ; X64-SSE: # %bb.0:
+ ; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+ ; X64-SSE-NEXT: pxor %xmm1, %xmm1
+ ; X64-SSE-NEXT: psadbw %xmm0, %xmm1
+ ; X64-SSE-NEXT: movd %xmm1, %eax
+ ; X64-SSE-NEXT: xorps %xmm0, %xmm0
+ ; X64-SSE-NEXT: cvtsi2sd %eax, %xmm0
+ ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+ ; X64-SSE-NEXT: movd %xmm1, %eax
+ ; X64-SSE-NEXT: xorps %xmm1, %xmm1
+ ; X64-SSE-NEXT: cvtsi2sd %eax, %xmm1
+ ; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+ ; X64-SSE-NEXT: retq
+ ;
+ ; AVX2-LABEL: combine_psadbw_sitofp_knownbits:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+ ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+ ; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm1
+ ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+ ; AVX2-NEXT: vcvtsi2sd %eax, %xmm2, %xmm0
+ ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+ ; AVX2-NEXT: retq
  %mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
  %cvt = sitofp <2 x i64> %sad to <2 x double>
@@ -108,27 +139,40 @@ define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {

; TODO: Convert from uitofp to sitofp as the PSADBW results are zero-extended.
define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
- ; X86-LABEL: combine_psadbw_uitofp_knownbits:
- ; X86: # %bb.0:
- ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
- ; X86-NEXT: pxor %xmm1, %xmm1
- ; X86-NEXT: psadbw %xmm1, %xmm0
- ; X86-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
- ; X86-NEXT: movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
- ; X86-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
- ; X86-NEXT: addpd %xmm1, %xmm0
- ; X86-NEXT: retl
+ ; X86-SSE-LABEL: combine_psadbw_uitofp_knownbits:
+ ; X86-SSE: # %bb.0:
+ ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+ ; X86-SSE-NEXT: pxor %xmm1, %xmm1
+ ; X86-SSE-NEXT: psadbw %xmm1, %xmm0
+ ; X86-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+ ; X86-SSE-NEXT: movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
+ ; X86-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+ ; X86-SSE-NEXT: addpd %xmm1, %xmm0
+ ; X86-SSE-NEXT: retl
+ ;
+ ; X64-SSE-LABEL: combine_psadbw_uitofp_knownbits:
+ ; X64-SSE: # %bb.0:
+ ; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+ ; X64-SSE-NEXT: pxor %xmm1, %xmm1
+ ; X64-SSE-NEXT: psadbw %xmm1, %xmm0
+ ; X64-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+ ; X64-SSE-NEXT: movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
+ ; X64-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+ ; X64-SSE-NEXT: addpd %xmm1, %xmm0
+ ; X64-SSE-NEXT: retq
;
- ; X64-LABEL: combine_psadbw_uitofp_knownbits:
- ; X64: # %bb.0:
- ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
- ; X64-NEXT: pxor %xmm1, %xmm1
- ; X64-NEXT: psadbw %xmm1, %xmm0
- ; X64-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
- ; X64-NEXT: movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
- ; X64-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
- ; X64-NEXT: addpd %xmm1, %xmm0
- ; X64-NEXT: retq
+ ; AVX2-LABEL: combine_psadbw_uitofp_knownbits:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+ ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+ ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+ ; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+ ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
+ ; AVX2-NEXT: # xmm1 = mem[0,0]
+ ; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+ ; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+ ; AVX2-NEXT: retq
  %mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
  %cvt = uitofp <2 x i64> %sad to <2 x double>