Skip to content

Commit b4b97ec

Browse files
committed
[x86] try harder to scalarize a vector load with extracted integer op uses
extract_vec_elt (load X), C --> scalar load (X+C)

As noted in the comment, DAGCombiner has this fold -- and the code in this patch is adapted from DAGCombiner::scalarizeExtractedVectorLoad() -- but x86 should benefit even if the loaded vector has other uses as long as we apply some other x86-specific conditions. The motivating example from #50310 is shown in vec_int_to_fp.ll.

Fixes #50310

Differential Revision: https://reviews.llvm.org/D118376
1 parent 588f121 commit b4b97ec

11 files changed

+410
-597
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43086,6 +43086,38 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
4308643086
}
4308743087
}
4308843088

43089+
// If this extract is from a loaded vector value and will be used as an
43090+
// integer, that requires a potentially expensive XMM -> GPR transfer.
43091+
// Additionally, if we can convert to a scalar integer load, that will likely
43092+
// be folded into a subsequent integer op.
43093+
// Note: Unlike the related fold for this in DAGCombiner, this is not limited
43094+
// to a single-use of the loaded vector. For the reasons above, we
43095+
// expect this to be profitable even if it creates an extra load.
43096+
bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
43097+
return Use->getOpcode() == ISD::STORE ||
43098+
Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
43099+
Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
43100+
});
43101+
auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
43102+
if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
43103+
SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
43104+
!LikelyUsedAsVector) {
43105+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43106+
SDValue NewPtr =
43107+
TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
43108+
unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
43109+
MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
43110+
Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
43111+
SDValue Load =
43112+
DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
43113+
LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
43114+
SDValue Chain = Load.getValue(1);
43115+
SDValue From[] = {SDValue(N, 0), SDValue(LoadVec, 1)};
43116+
SDValue To[] = {Load, Chain};
43117+
DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
43118+
return SDValue(N, 0);
43119+
}
43120+
4308943121
return SDValue();
4309043122
}
4309143123

llvm/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@
1010
define <4 x i32> @test(<4 x i32>* %p) {
1111
; CHECK-LABEL: test:
1212
; CHECK: # %bb.0:
13-
; CHECK-NEXT: movaps (%rdi), %xmm0
14-
; CHECK-NEXT: extractps $2, %xmm0, %eax
15-
; CHECK-NEXT: cmpl $3, %eax
16-
; CHECK-NEXT: je .LBB0_2
17-
; CHECK-NEXT: # %bb.1:
13+
; CHECK-NEXT: cmpl $3, 8(%rdi)
14+
; CHECK-NEXT: je .LBB0_1
15+
; CHECK-NEXT: # %bb.2:
1816
; CHECK-NEXT: xorps %xmm0, %xmm0
19-
; CHECK-NEXT: .LBB0_2:
17+
; CHECK-NEXT: retq
18+
; CHECK-NEXT: .LBB0_1:
19+
; CHECK-NEXT: movaps (%rdi), %xmm0
2020
; CHECK-NEXT: retq
2121
%v = load <4 x i32>, <4 x i32>* %p
2222
%e = extractelement <4 x i32> %v, i32 2

llvm/test/CodeGen/X86/avx512-cvt.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -148,18 +148,12 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
148148
define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
149149
; NODQ-LABEL: slto4f32_mem:
150150
; NODQ: # %bb.0:
151-
; NODQ-NEXT: vmovdqu (%rdi), %xmm0
152-
; NODQ-NEXT: vmovdqu 16(%rdi), %xmm1
153-
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
154-
; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
155-
; NODQ-NEXT: vmovq %xmm0, %rax
156-
; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
157-
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
158-
; NODQ-NEXT: vmovq %xmm1, %rax
159-
; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
160-
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
161-
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
162-
; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
151+
; NODQ-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
152+
; NODQ-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1
153+
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
154+
; NODQ-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
155+
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
156+
; NODQ-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
163157
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
164158
; NODQ-NEXT: retq
165159
;

llvm/test/CodeGen/X86/bitcast-vector-bool.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -542,10 +542,8 @@ define i32 @bitcast_v64i8_to_v2i32(<64 x i8> %a0) nounwind {
542542
; AVX512: # %bb.0:
543543
; AVX512-NEXT: vpmovb2m %zmm0, %k0
544544
; AVX512-NEXT: kmovq %k0, -{{[0-9]+}}(%rsp)
545-
; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
546-
; AVX512-NEXT: vmovd %xmm0, %ecx
547-
; AVX512-NEXT: vpextrd $1, %xmm0, %eax
548-
; AVX512-NEXT: addl %ecx, %eax
545+
; AVX512-NEXT: movl -{{[0-9]+}}(%rsp), %eax
546+
; AVX512-NEXT: addl -{{[0-9]+}}(%rsp), %eax
549547
; AVX512-NEXT: vzeroupper
550548
; AVX512-NEXT: retq
551549
%1 = icmp slt <64 x i8> %a0, zeroinitializer

llvm/test/CodeGen/X86/extractelement-load.ll

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -301,33 +301,35 @@ define void @subextract_broadcast_load_constant(<2 x i16>* nocapture %0, i16* no
301301
ret void
302302
}
303303

304+
; A scalar load is favored over a XMM->GPR register transfer in this example.
305+
304306
define i32 @multi_use_load_scalarization(<4 x i32>* %p) {
305307
; X32-SSE2-LABEL: multi_use_load_scalarization:
306308
; X32-SSE2: # %bb.0:
307309
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
310+
; X32-SSE2-NEXT: movl (%ecx), %eax
308311
; X32-SSE2-NEXT: movdqu (%ecx), %xmm0
309312
; X32-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
310-
; X32-SSE2-NEXT: movd %xmm0, %eax
311313
; X32-SSE2-NEXT: psubd %xmm1, %xmm0
312314
; X32-SSE2-NEXT: movdqa %xmm0, (%ecx)
313315
; X32-SSE2-NEXT: retl
314316
;
315317
; X64-SSSE3-LABEL: multi_use_load_scalarization:
316318
; X64-SSSE3: # %bb.0:
319+
; X64-SSSE3-NEXT: movl (%rdi), %eax
317320
; X64-SSSE3-NEXT: movdqu (%rdi), %xmm0
318321
; X64-SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
319-
; X64-SSSE3-NEXT: movd %xmm0, %eax
320322
; X64-SSSE3-NEXT: psubd %xmm1, %xmm0
321323
; X64-SSSE3-NEXT: movdqa %xmm0, (%rdi)
322324
; X64-SSSE3-NEXT: retq
323325
;
324326
; X64-AVX-LABEL: multi_use_load_scalarization:
325327
; X64-AVX: # %bb.0:
328+
; X64-AVX-NEXT: movl (%rdi), %eax
326329
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
327330
; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
328-
; X64-AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm1
329-
; X64-AVX-NEXT: vmovdqa %xmm1, (%rdi)
330-
; X64-AVX-NEXT: vmovd %xmm0, %eax
331+
; X64-AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
332+
; X64-AVX-NEXT: vmovdqa %xmm0, (%rdi)
331333
; X64-AVX-NEXT: retq
332334
%v = load <4 x i32>, <4 x i32>* %p, align 1
333335
%v1 = add <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>

llvm/test/CodeGen/X86/oddsubvector.ll

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -161,46 +161,46 @@ define <16 x i32> @PR42819(<8 x i32>* %a0) {
161161
define void @PR42833() {
162162
; SSE2-LABEL: PR42833:
163163
; SSE2: # %bb.0:
164-
; SSE2-NEXT: movdqa c+144(%rip), %xmm1
165-
; SSE2-NEXT: movdqa c+128(%rip), %xmm0
166-
; SSE2-NEXT: movd %xmm0, %eax
167-
; SSE2-NEXT: addl b(%rip), %eax
164+
; SSE2-NEXT: movl b(%rip), %eax
165+
; SSE2-NEXT: movdqa c+144(%rip), %xmm0
166+
; SSE2-NEXT: movdqa c+128(%rip), %xmm1
167+
; SSE2-NEXT: addl c+128(%rip), %eax
168168
; SSE2-NEXT: movd %eax, %xmm2
169169
; SSE2-NEXT: movd %eax, %xmm3
170-
; SSE2-NEXT: paddd %xmm0, %xmm3
170+
; SSE2-NEXT: paddd %xmm1, %xmm3
171171
; SSE2-NEXT: movdqa d+144(%rip), %xmm4
172-
; SSE2-NEXT: psubd %xmm1, %xmm4
173-
; SSE2-NEXT: paddd %xmm1, %xmm1
174-
; SSE2-NEXT: movdqa %xmm0, %xmm5
175-
; SSE2-NEXT: paddd %xmm0, %xmm5
172+
; SSE2-NEXT: psubd %xmm0, %xmm4
173+
; SSE2-NEXT: paddd %xmm0, %xmm0
174+
; SSE2-NEXT: movdqa %xmm1, %xmm5
175+
; SSE2-NEXT: paddd %xmm1, %xmm5
176176
; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
177-
; SSE2-NEXT: movdqa %xmm1, c+144(%rip)
177+
; SSE2-NEXT: movdqa %xmm0, c+144(%rip)
178178
; SSE2-NEXT: movaps %xmm5, c+128(%rip)
179-
; SSE2-NEXT: movdqa c+160(%rip), %xmm1
179+
; SSE2-NEXT: movdqa c+160(%rip), %xmm0
180180
; SSE2-NEXT: movdqa c+176(%rip), %xmm3
181181
; SSE2-NEXT: movdqa d+160(%rip), %xmm5
182182
; SSE2-NEXT: movdqa d+176(%rip), %xmm6
183183
; SSE2-NEXT: movdqa d+128(%rip), %xmm7
184-
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
185-
; SSE2-NEXT: psubd %xmm0, %xmm7
184+
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
185+
; SSE2-NEXT: psubd %xmm1, %xmm7
186186
; SSE2-NEXT: psubd %xmm3, %xmm6
187-
; SSE2-NEXT: psubd %xmm1, %xmm5
187+
; SSE2-NEXT: psubd %xmm0, %xmm5
188188
; SSE2-NEXT: movdqa %xmm5, d+160(%rip)
189189
; SSE2-NEXT: movdqa %xmm6, d+176(%rip)
190190
; SSE2-NEXT: movdqa %xmm4, d+144(%rip)
191191
; SSE2-NEXT: movdqa %xmm7, d+128(%rip)
192192
; SSE2-NEXT: paddd %xmm3, %xmm3
193-
; SSE2-NEXT: paddd %xmm1, %xmm1
194-
; SSE2-NEXT: movdqa %xmm1, c+160(%rip)
193+
; SSE2-NEXT: paddd %xmm0, %xmm0
194+
; SSE2-NEXT: movdqa %xmm0, c+160(%rip)
195195
; SSE2-NEXT: movdqa %xmm3, c+176(%rip)
196196
; SSE2-NEXT: retq
197197
;
198198
; SSE42-LABEL: PR42833:
199199
; SSE42: # %bb.0:
200+
; SSE42-NEXT: movl b(%rip), %eax
200201
; SSE42-NEXT: movdqa c+144(%rip), %xmm0
201202
; SSE42-NEXT: movdqa c+128(%rip), %xmm1
202-
; SSE42-NEXT: movd %xmm1, %eax
203-
; SSE42-NEXT: addl b(%rip), %eax
203+
; SSE42-NEXT: addl c+128(%rip), %eax
204204
; SSE42-NEXT: movd %eax, %xmm2
205205
; SSE42-NEXT: paddd %xmm1, %xmm2
206206
; SSE42-NEXT: movdqa d+144(%rip), %xmm3
@@ -232,20 +232,20 @@ define void @PR42833() {
232232
;
233233
; AVX1-LABEL: PR42833:
234234
; AVX1: # %bb.0:
235-
; AVX1-NEXT: vmovdqa c+128(%rip), %xmm0
236-
; AVX1-NEXT: vmovd %xmm0, %eax
237-
; AVX1-NEXT: addl b(%rip), %eax
238-
; AVX1-NEXT: vmovd %eax, %xmm1
239-
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
240-
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2
235+
; AVX1-NEXT: movl b(%rip), %eax
236+
; AVX1-NEXT: addl c+128(%rip), %eax
237+
; AVX1-NEXT: vmovd %eax, %xmm0
238+
; AVX1-NEXT: vmovdqa c+128(%rip), %xmm1
239+
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
240+
; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm2
241241
; AVX1-NEXT: vmovdqa c+144(%rip), %xmm3
242242
; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm3
243243
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
244-
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
244+
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
245245
; AVX1-NEXT: vmovdqa d+144(%rip), %xmm2
246246
; AVX1-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2
247-
; AVX1-NEXT: vmovups %ymm1, c+128(%rip)
248-
; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
247+
; AVX1-NEXT: vmovups %ymm0, c+128(%rip)
248+
; AVX1-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0
249249
; AVX1-NEXT: vmovdqa d+128(%rip), %xmm1
250250
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
251251
; AVX1-NEXT: vmovdqa d+176(%rip), %xmm1
@@ -314,20 +314,20 @@ define void @PR42833() {
314314
;
315315
; XOP-LABEL: PR42833:
316316
; XOP: # %bb.0:
317-
; XOP-NEXT: vmovdqa c+128(%rip), %xmm0
318-
; XOP-NEXT: vmovd %xmm0, %eax
319-
; XOP-NEXT: addl b(%rip), %eax
320-
; XOP-NEXT: vmovd %eax, %xmm1
321-
; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1
322-
; XOP-NEXT: vpaddd %xmm0, %xmm0, %xmm2
317+
; XOP-NEXT: movl b(%rip), %eax
318+
; XOP-NEXT: addl c+128(%rip), %eax
319+
; XOP-NEXT: vmovd %eax, %xmm0
320+
; XOP-NEXT: vmovdqa c+128(%rip), %xmm1
321+
; XOP-NEXT: vpaddd %xmm0, %xmm1, %xmm0
322+
; XOP-NEXT: vpaddd %xmm1, %xmm1, %xmm2
323323
; XOP-NEXT: vmovdqa c+144(%rip), %xmm3
324324
; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm3
325325
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
326-
; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
326+
; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
327327
; XOP-NEXT: vmovdqa d+144(%rip), %xmm2
328328
; XOP-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2
329-
; XOP-NEXT: vmovups %ymm1, c+128(%rip)
330-
; XOP-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
329+
; XOP-NEXT: vmovups %ymm0, c+128(%rip)
330+
; XOP-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0
331331
; XOP-NEXT: vmovdqa d+128(%rip), %xmm1
332332
; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0
333333
; XOP-NEXT: vmovdqa d+176(%rip), %xmm1

llvm/test/CodeGen/X86/pr45378.ll

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -76,28 +76,23 @@ define i1 @parseHeaders2_scalar_and(i64 * %ptr) nounwind {
7676
; SSE2-LABEL: parseHeaders2_scalar_and:
7777
; SSE2: # %bb.0:
7878
; SSE2-NEXT: movdqu (%rdi), %xmm0
79-
; SSE2-NEXT: movq %xmm0, %rax
8079
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
81-
; SSE2-NEXT: movq %xmm0, %rcx
82-
; SSE2-NEXT: testq %rcx, %rax
80+
; SSE2-NEXT: movq %xmm0, %rax
81+
; SSE2-NEXT: testq %rax, (%rdi)
8382
; SSE2-NEXT: sete %al
8483
; SSE2-NEXT: retq
8584
;
8685
; SSE41-LABEL: parseHeaders2_scalar_and:
8786
; SSE41: # %bb.0:
88-
; SSE41-NEXT: movdqu (%rdi), %xmm0
89-
; SSE41-NEXT: movq %xmm0, %rax
90-
; SSE41-NEXT: pextrq $1, %xmm0, %rcx
91-
; SSE41-NEXT: testq %rcx, %rax
87+
; SSE41-NEXT: movq (%rdi), %rax
88+
; SSE41-NEXT: testq %rax, 8(%rdi)
9289
; SSE41-NEXT: sete %al
9390
; SSE41-NEXT: retq
9491
;
9592
; AVX-LABEL: parseHeaders2_scalar_and:
9693
; AVX: # %bb.0:
97-
; AVX-NEXT: vmovdqu (%rdi), %xmm0
98-
; AVX-NEXT: vmovq %xmm0, %rax
99-
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
100-
; AVX-NEXT: testq %rcx, %rax
94+
; AVX-NEXT: movq (%rdi), %rax
95+
; AVX-NEXT: testq %rax, 8(%rdi)
10196
; AVX-NEXT: sete %al
10297
; AVX-NEXT: retq
10398
%vptr = bitcast i64 * %ptr to <2 x i64> *

llvm/test/CodeGen/X86/scalar_widen_div.ll

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -403,32 +403,29 @@ define void @test_int_div(<3 x i32>* %dest, <3 x i32>* %old, i32 %n) {
403403
; CHECK-NEXT: testl %edx, %edx
404404
; CHECK-NEXT: jle .LBB12_3
405405
; CHECK-NEXT: # %bb.1: # %bb.nph
406-
; CHECK-NEXT: movl %edx, %r9d
406+
; CHECK-NEXT: movl %edx, %r11d
407407
; CHECK-NEXT: xorl %ecx, %ecx
408408
; CHECK-NEXT: .p2align 4, 0x90
409409
; CHECK-NEXT: .LBB12_2: # %for.body
410410
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
411-
; CHECK-NEXT: movdqa (%rdi,%rcx), %xmm0
412-
; CHECK-NEXT: movdqa (%rsi,%rcx), %xmm1
413-
; CHECK-NEXT: pextrd $1, %xmm0, %eax
414-
; CHECK-NEXT: pextrd $1, %xmm1, %r8d
411+
; CHECK-NEXT: movl 8(%rdi,%rcx), %r8d
412+
; CHECK-NEXT: movl (%rdi,%rcx), %r9d
413+
; CHECK-NEXT: movl 4(%rdi,%rcx), %eax
415414
; CHECK-NEXT: cltd
416-
; CHECK-NEXT: idivl %r8d
417-
; CHECK-NEXT: movl %eax, %r8d
418-
; CHECK-NEXT: movd %xmm0, %eax
419-
; CHECK-NEXT: movd %xmm1, %r10d
415+
; CHECK-NEXT: idivl 4(%rsi,%rcx)
416+
; CHECK-NEXT: movl %eax, %r10d
417+
; CHECK-NEXT: movl %r9d, %eax
420418
; CHECK-NEXT: cltd
421-
; CHECK-NEXT: idivl %r10d
422-
; CHECK-NEXT: movd %eax, %xmm2
423-
; CHECK-NEXT: pinsrd $1, %r8d, %xmm2
424-
; CHECK-NEXT: pextrd $2, %xmm0, %eax
425-
; CHECK-NEXT: pextrd $2, %xmm1, %r8d
419+
; CHECK-NEXT: idivl (%rsi,%rcx)
420+
; CHECK-NEXT: movd %eax, %xmm0
421+
; CHECK-NEXT: pinsrd $1, %r10d, %xmm0
422+
; CHECK-NEXT: movl %r8d, %eax
426423
; CHECK-NEXT: cltd
427-
; CHECK-NEXT: idivl %r8d
424+
; CHECK-NEXT: idivl 8(%rsi,%rcx)
428425
; CHECK-NEXT: movl %eax, 8(%rdi,%rcx)
429-
; CHECK-NEXT: movq %xmm2, (%rdi,%rcx)
426+
; CHECK-NEXT: movq %xmm0, (%rdi,%rcx)
430427
; CHECK-NEXT: addq $16, %rcx
431-
; CHECK-NEXT: decl %r9d
428+
; CHECK-NEXT: decl %r11d
432429
; CHECK-NEXT: jne .LBB12_2
433430
; CHECK-NEXT: .LBB12_3: # %for.end
434431
; CHECK-NEXT: retq

0 commit comments

Comments
 (0)