@@ -141,56 +141,61 @@ declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
141
141
define <8 x half > @fmul_pow2_8xhalf (<8 x i16 > %i ) {
142
142
; CHECK-SSE-LABEL: fmul_pow2_8xhalf:
143
143
; CHECK-SSE: # %bb.0:
144
- ; CHECK-SSE-NEXT: subq $88, %rsp
145
- ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 96
144
+ ; CHECK-SSE-NEXT: subq $104, %rsp
145
+ ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 112
146
146
; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1
147
147
; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
148
148
; CHECK-SSE-NEXT: pslld $23, %xmm1
149
149
; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
150
150
; CHECK-SSE-NEXT: paddd %xmm2, %xmm1
151
151
; CHECK-SSE-NEXT: cvttps2dq %xmm1, %xmm1
152
- ; CHECK-SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
152
+ ; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
153
+ ; CHECK-SSE-NEXT: pslld $16, %xmm1
154
+ ; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
153
155
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
154
156
; CHECK-SSE-NEXT: pslld $23, %xmm0
155
157
; CHECK-SSE-NEXT: paddd %xmm2, %xmm0
156
158
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
159
+ ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
157
160
; CHECK-SSE-NEXT: pslld $16, %xmm0
158
- ; CHECK-SSE-NEXT: psrld $16, %xmm0
159
161
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
160
- ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
162
+ ; CHECK-SSE-NEXT: psrld $16, %xmm0
161
163
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
162
164
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
163
165
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
164
- ; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
166
+ ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
167
+ ; CHECK-SSE-NEXT: psrlq $48, %xmm0
168
+ ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
165
169
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
166
170
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
167
- ; CHECK-SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
168
- ; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3]
171
+ ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
172
+ ; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
169
173
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
170
174
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
171
175
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
172
- ; CHECK-SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
173
- ; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3]
174
- ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
176
+ ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
177
+ ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
178
+ ; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
179
+ ; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
175
180
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
176
181
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
177
182
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
178
- ; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
179
- ; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
180
- ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
183
+ ; CHECK-SSE-NEXT: psrld $16, %xmm0
181
184
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
182
185
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
183
186
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
184
- ; CHECK-SSE-NEXT: cvtdq2ps (%rsp), %xmm0 # 16-byte Folded Reload
187
+ ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
188
+ ; CHECK-SSE-NEXT: psrlq $48, %xmm0
189
+ ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
185
190
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
186
191
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
187
- ; CHECK-SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
188
- ; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3]
192
+ ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
193
+ ; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
189
194
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
190
195
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
191
196
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
192
- ; CHECK-SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
193
- ; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3]
197
+ ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
198
+ ; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
194
199
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
195
200
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
196
201
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
@@ -202,39 +207,39 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
202
207
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
203
208
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
204
209
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
205
- ; CHECK-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
206
- ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
207
- ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
210
+ ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
211
+ ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
212
+ ; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
208
213
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
209
214
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
210
215
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
211
216
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
212
217
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
213
- ; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
218
+ ; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
214
219
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
215
220
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
216
221
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
217
222
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
218
223
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
219
- ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
220
- ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
221
- ; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
222
- ; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
223
- ; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
224
+ ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
225
+ ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
226
+ ; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
227
+ ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
228
+ ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
224
229
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
225
230
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
226
231
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
227
232
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
228
233
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
229
- ; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
234
+ ; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
230
235
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
231
236
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
232
237
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
233
238
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
234
239
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
235
- ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
236
- ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
237
- ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
240
+ ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
241
+ ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
242
+ ; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
238
243
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
239
244
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
240
245
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
@@ -246,14 +251,13 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
246
251
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
247
252
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
248
253
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
249
- ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
250
- ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
251
- ; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
252
- ; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
253
- ; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
254
- ; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0]
255
- ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
256
- ; CHECK-SSE-NEXT: addq $88, %rsp
254
+ ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
255
+ ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
256
+ ; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
257
+ ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
258
+ ; CHECK-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
259
+ ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0]
260
+ ; CHECK-SSE-NEXT: addq $104, %rsp
257
261
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
258
262
; CHECK-SSE-NEXT: retq
259
263
;
@@ -1028,17 +1032,17 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
1028
1032
; CHECK-SSE-NEXT: pslld $23, %xmm0
1029
1033
; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1030
1034
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
1031
- ; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
1032
- ; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,2,u,u,u,u,u,u]
1033
- ; CHECK-SSE-NEXT: pxor %xmm0, %xmm0
1034
- ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1035
- ; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1036
- ; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
1035
+ ; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1036
+ ; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,2,u,u,u,u,u,u]
1037
+ ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1038
+ ; CHECK-SSE-NEXT: psrld $16, %xmm0
1039
+ ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1037
1040
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
1038
1041
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1039
- ; CHECK-SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1040
- ; CHECK-SSE-NEXT: # xmm0 = mem[1,1,1,1]
1041
- ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1042
+ ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
1043
+ ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1044
+ ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1045
+ ; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
1042
1046
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
1043
1047
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
1044
1048
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1049,8 +1053,9 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
1049
1053
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
1050
1054
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1051
1055
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
1052
- ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1053
- ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1056
+ ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1057
+ ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1058
+ ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
1054
1059
; CHECK-SSE-NEXT: addq $40, %rsp
1055
1060
; CHECK-SSE-NEXT: retq
1056
1061
;
0 commit comments