Commit c486b82

[x86] try harder to scalarize a vector load with extracted integer op uses
This is a retry of b4b97ec - that was reverted because it could cause
miscompiles by illegally reordering memory operations. A new test based on
#53695 is added here to verify we do not have that same problem.

extract_vec_elt (load X), C --> scalar load (X+C)

As noted in the comment, DAGCombiner has this fold -- and the code in this
patch is adapted from DAGCombiner::scalarizeExtractedVectorLoad() -- but x86
should benefit even if the loaded vector has other uses as long as we apply
some other x86-specific conditions. The motivating example from #50310 is
shown in vec_int_to_fp.ll.

Fixes #50310
Fixes #53695

Differential Revision: https://reviews.llvm.org/D118376
1 parent 83ccce6 commit c486b82
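As a minimal sketch (the actual transform happens on SelectionDAG nodes, so
the GEP form below is only illustrative), the fold rewrites IR like the first
test in this commit (2011-12-26-extractelement-duplicate-load.ll):

  %v = load <4 x i32>, <4 x i32>* %p
  %e = extractelement <4 x i32> %v, i32 2

  ; becomes, conceptually:
  %gep = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i64 0, i64 2
  %e = load i32, i32* %gep

The scalar load can then fold into a subsequent integer op; in that test the
result is a single cmpl $3, 8(%rdi) instead of movaps plus extractps.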

11 files changed: 430 additions & 615 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 29 additions & 0 deletions
@@ -43231,6 +43231,35 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // If this extract is from a loaded vector value and will be used as an
+  // integer, that requires a potentially expensive XMM -> GPR transfer.
+  // Additionally, if we can convert to a scalar integer load, that will likely
+  // be folded into a subsequent integer op.
+  // Note: Unlike the related fold for this in DAGCombiner, this is not limited
+  //       to a single-use of the loaded vector. For the reasons above, we
+  //       expect this to be profitable even if it creates an extra load.
+  bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
+    return Use->getOpcode() == ISD::STORE ||
+           Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
+           Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
+  });
+  auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
+  if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
+      SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
+      !LikelyUsedAsVector) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    SDValue NewPtr =
+        TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
+    unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
+    MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
+    Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
+    SDValue Load =
+        DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
+                    LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
+    DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
+    return Load;
+  }
+
   return SDValue();
 }
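As a worked example of the PtrOff computation above: extracting element 2 from
a loaded <4 x i32> gives PtrOff = 32 * 2 / 8 = 8 bytes, which is why the tests
below fold the extract into an 8-byte displacement such as 8(%rdi).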

llvm/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll

Lines changed: 6 additions & 6 deletions
@@ -10,13 +10,13 @@
 define <4 x i32> @test(<4 x i32>* %p) {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movaps (%rdi), %xmm0
-; CHECK-NEXT:    extractps $2, %xmm0, %eax
-; CHECK-NEXT:    cmpl $3, %eax
-; CHECK-NEXT:    je .LBB0_2
-; CHECK-NEXT:    # %bb.1:
+; CHECK-NEXT:    cmpl $3, 8(%rdi)
+; CHECK-NEXT:    je .LBB0_1
+; CHECK-NEXT:    # %bb.2:
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
-; CHECK-NEXT:    .LBB0_2:
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    movaps (%rdi), %xmm0
 ; CHECK-NEXT:    retq
   %v = load <4 x i32>, <4 x i32>* %p
   %e = extractelement <4 x i32> %v, i32 2

llvm/test/CodeGen/X86/avx512-cvt.ll

Lines changed: 6 additions & 12 deletions
@@ -148,18 +148,12 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
 define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
 ; NODQ-LABEL: slto4f32_mem:
 ; NODQ:       # %bb.0:
-; NODQ-NEXT:    vmovdqu (%rdi), %xmm0
-; NODQ-NEXT:    vmovdqu 16(%rdi), %xmm1
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
+; NODQ-NEXT:    vcvtsi2ssq (%rdi), %xmm1, %xmm1
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; NODQ-NEXT:    vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; NODQ-NEXT:    vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; NODQ-NEXT:    retq
 ;

llvm/test/CodeGen/X86/bitcast-vector-bool.ll

Lines changed: 2 additions & 4 deletions
@@ -542,10 +542,8 @@ define i32 @bitcast_v64i8_to_v2i32(<64 x i8> %a0) nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512-NEXT:    kmovq %k0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %ecx
-; AVX512-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX512-NEXT:    addl %ecx, %eax
+; AVX512-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; AVX512-NEXT:    addl -{{[0-9]+}}(%rsp), %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = icmp slt <64 x i8> %a0, zeroinitializer

llvm/test/CodeGen/X86/extractelement-load.ll

Lines changed: 31 additions & 23 deletions
@@ -301,33 +301,35 @@ define void @subextract_broadcast_load_constant(<2 x i16>* nocapture %0, i16* no
   ret void
 }
 
+; A scalar load is favored over a XMM->GPR register transfer in this example.
+
 define i32 @multi_use_load_scalarization(<4 x i32>* %p) nounwind {
 ; X32-SSE2-LABEL: multi_use_load_scalarization:
 ; X32-SSE2:       # %bb.0:
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE2-NEXT:    movl (%ecx), %eax
 ; X32-SSE2-NEXT:    movdqu (%ecx), %xmm0
 ; X32-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; X32-SSE2-NEXT:    movd %xmm0, %eax
 ; X32-SSE2-NEXT:    psubd %xmm1, %xmm0
 ; X32-SSE2-NEXT:    movdqa %xmm0, (%ecx)
 ; X32-SSE2-NEXT:    retl
 ;
 ; X64-SSSE3-LABEL: multi_use_load_scalarization:
 ; X64-SSSE3:       # %bb.0:
+; X64-SSSE3-NEXT:    movl (%rdi), %eax
 ; X64-SSSE3-NEXT:    movdqu (%rdi), %xmm0
 ; X64-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
-; X64-SSSE3-NEXT:    movd %xmm0, %eax
 ; X64-SSSE3-NEXT:    psubd %xmm1, %xmm0
 ; X64-SSSE3-NEXT:    movdqa %xmm0, (%rdi)
 ; X64-SSSE3-NEXT:    retq
 ;
 ; X64-AVX-LABEL: multi_use_load_scalarization:
 ; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    movl (%rdi), %eax
 ; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
 ; X64-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
-; X64-AVX-NEXT:    vmovdqa %xmm1, (%rdi)
-; X64-AVX-NEXT:    vmovd %xmm0, %eax
+; X64-AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovdqa %xmm0, (%rdi)
 ; X64-AVX-NEXT:    retq
   %v = load <4 x i32>, <4 x i32>* %p, align 1
   %v1 = add <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>
@@ -336,6 +338,12 @@ define i32 @multi_use_load_scalarization(<4 x i32>* %p) nounwind {
   ret i32 %r
 }
 
+; This test is reduced from a C source example that showed a miscompile:
+; https://github.com/llvm/llvm-project/issues/53695
+; The scalarized loads from 'zero' in the AVX asm must occur before
+; the vector store to 'zero' overwrites the values.
+; If compiled to a binary, this test should return 0 if correct.
+
 @n1 = local_unnamed_addr global <8 x i32> <i32 0, i32 42, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0>, align 32
 @zero = internal unnamed_addr global <8 x i32> zeroinitializer, align 32
 
@@ -419,21 +427,21 @@ define i32 @main() nounwind {
 ; X64-AVX1-NEXT:    subq $64, %rsp
 ; X64-AVX1-NEXT:    movq n1@GOTPCREL(%rip), %rax
 ; X64-AVX1-NEXT:    vmovaps (%rax), %ymm0
-; X64-AVX1-NEXT:    vmovaps zero(%rip), %xmm1
+; X64-AVX1-NEXT:    movl zero+4(%rip), %ecx
+; X64-AVX1-NEXT:    movl zero+8(%rip), %eax
 ; X64-AVX1-NEXT:    vmovaps %ymm0, zero(%rip)
 ; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
 ; X64-AVX1-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-AVX1-NEXT:    vmovaps (%rsp), %ymm0
-; X64-AVX1-NEXT:    vextractps $2, %xmm1, %eax
-; X64-AVX1-NEXT:    vextractps $2, %xmm0, %ecx
-; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %eax, %ecx
-; X64-AVX1-NEXT:    vextractps $1, %xmm1, %eax
-; X64-AVX1-NEXT:    vextractps $1, %xmm0, %esi
+; X64-AVX1-NEXT:    vextractps $2, %xmm0, %esi
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %esi
-; X64-AVX1-NEXT:    addl %ecx, %eax
+; X64-AVX1-NEXT:    movl %eax, %esi
+; X64-AVX1-NEXT:    vextractps $1, %xmm0, %edi
+; X64-AVX1-NEXT:    movl %ecx, %eax
+; X64-AVX1-NEXT:    xorl %edx, %edx
+; X64-AVX1-NEXT:    divl %edi
+; X64-AVX1-NEXT:    addl %esi, %eax
 ; X64-AVX1-NEXT:    movq %rbp, %rsp
 ; X64-AVX1-NEXT:    popq %rbp
 ; X64-AVX1-NEXT:    vzeroupper
@@ -447,21 +455,21 @@ define i32 @main() nounwind {
 ; X64-AVX2-NEXT:    subq $64, %rsp
 ; X64-AVX2-NEXT:    movq n1@GOTPCREL(%rip), %rax
 ; X64-AVX2-NEXT:    vmovaps (%rax), %ymm0
-; X64-AVX2-NEXT:    vmovaps zero(%rip), %xmm1
+; X64-AVX2-NEXT:    movl zero+4(%rip), %ecx
+; X64-AVX2-NEXT:    movl zero+8(%rip), %eax
 ; X64-AVX2-NEXT:    vmovaps %ymm0, zero(%rip)
 ; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
 ; X64-AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; X64-AVX2-NEXT:    vmovaps (%rsp), %ymm0
-; X64-AVX2-NEXT:    vextractps $2, %xmm1, %eax
-; X64-AVX2-NEXT:    vextractps $2, %xmm0, %ecx
-; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl %ecx
-; X64-AVX2-NEXT:    movl %eax, %ecx
-; X64-AVX2-NEXT:    vextractps $1, %xmm1, %eax
-; X64-AVX2-NEXT:    vextractps $1, %xmm0, %esi
+; X64-AVX2-NEXT:    vextractps $2, %xmm0, %esi
 ; X64-AVX2-NEXT:    xorl %edx, %edx
 ; X64-AVX2-NEXT:    divl %esi
-; X64-AVX2-NEXT:    addl %ecx, %eax
+; X64-AVX2-NEXT:    movl %eax, %esi
+; X64-AVX2-NEXT:    vextractps $1, %xmm0, %edi
+; X64-AVX2-NEXT:    movl %ecx, %eax
+; X64-AVX2-NEXT:    xorl %edx, %edx
+; X64-AVX2-NEXT:    divl %edi
+; X64-AVX2-NEXT:    addl %esi, %eax
 ; X64-AVX2-NEXT:    movq %rbp, %rsp
 ; X64-AVX2-NEXT:    popq %rbp
 ; X64-AVX2-NEXT:    vzeroupper
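The ordering hazard described by the comment in this test can be sketched in a
few lines of IR (a simplification, not the exact reduction; %n stands for the
vector copied from @n1): once the extract is scalarized into a load from
@zero, that load must stay ordered before the store that clobbers @zero, which
is what the makeEquivalentMemoryOrdering call in the new combine ensures.

  %v = load <8 x i32>, <8 x i32>* @zero, align 32
  store <8 x i32> %n, <8 x i32>* @zero, align 32    ; overwrites @zero
  %e1 = extractelement <8 x i32> %v, i64 1          ; must see the pre-store value
  %e2 = extractelement <8 x i32> %v, i64 2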

llvm/test/CodeGen/X86/oddsubvector.ll

Lines changed: 36 additions & 36 deletions
@@ -161,46 +161,46 @@ define <16 x i32> @PR42819(<8 x i32>* %a0) {
 define void @PR42833() {
 ; SSE2-LABEL: PR42833:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa c+144(%rip), %xmm1
-; SSE2-NEXT:    movdqa c+128(%rip), %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    addl b(%rip), %eax
+; SSE2-NEXT:    movl b(%rip), %eax
+; SSE2-NEXT:    movdqa c+144(%rip), %xmm0
+; SSE2-NEXT:    movdqa c+128(%rip), %xmm1
+; SSE2-NEXT:    addl c+128(%rip), %eax
 ; SSE2-NEXT:    movd %eax, %xmm2
 ; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    paddd %xmm0, %xmm3
+; SSE2-NEXT:    paddd %xmm1, %xmm3
 ; SSE2-NEXT:    movdqa d+144(%rip), %xmm4
-; SSE2-NEXT:    psubd %xmm1, %xmm4
-; SSE2-NEXT:    paddd %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    paddd %xmm0, %xmm5
+; SSE2-NEXT:    psubd %xmm0, %xmm4
+; SSE2-NEXT:    paddd %xmm0, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    paddd %xmm1, %xmm5
 ; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
-; SSE2-NEXT:    movdqa %xmm1, c+144(%rip)
+; SSE2-NEXT:    movdqa %xmm0, c+144(%rip)
 ; SSE2-NEXT:    movaps %xmm5, c+128(%rip)
-; SSE2-NEXT:    movdqa c+160(%rip), %xmm1
+; SSE2-NEXT:    movdqa c+160(%rip), %xmm0
 ; SSE2-NEXT:    movdqa c+176(%rip), %xmm3
 ; SSE2-NEXT:    movdqa d+160(%rip), %xmm5
 ; SSE2-NEXT:    movdqa d+176(%rip), %xmm6
 ; SSE2-NEXT:    movdqa d+128(%rip), %xmm7
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; SSE2-NEXT:    psubd %xmm0, %xmm7
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE2-NEXT:    psubd %xmm1, %xmm7
 ; SSE2-NEXT:    psubd %xmm3, %xmm6
-; SSE2-NEXT:    psubd %xmm1, %xmm5
+; SSE2-NEXT:    psubd %xmm0, %xmm5
 ; SSE2-NEXT:    movdqa %xmm5, d+160(%rip)
 ; SSE2-NEXT:    movdqa %xmm6, d+176(%rip)
 ; SSE2-NEXT:    movdqa %xmm4, d+144(%rip)
 ; SSE2-NEXT:    movdqa %xmm7, d+128(%rip)
 ; SSE2-NEXT:    paddd %xmm3, %xmm3
-; SSE2-NEXT:    paddd %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, c+160(%rip)
+; SSE2-NEXT:    paddd %xmm0, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, c+160(%rip)
 ; SSE2-NEXT:    movdqa %xmm3, c+176(%rip)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: PR42833:
 ; SSE42:       # %bb.0:
+; SSE42-NEXT:    movl b(%rip), %eax
 ; SSE42-NEXT:    movdqa c+144(%rip), %xmm0
 ; SSE42-NEXT:    movdqa c+128(%rip), %xmm1
-; SSE42-NEXT:    movd %xmm1, %eax
-; SSE42-NEXT:    addl b(%rip), %eax
+; SSE42-NEXT:    addl c+128(%rip), %eax
 ; SSE42-NEXT:    movd %eax, %xmm2
 ; SSE42-NEXT:    paddd %xmm1, %xmm2
 ; SSE42-NEXT:    movdqa d+144(%rip), %xmm3
@@ -232,20 +232,20 @@ define void @PR42833() {
 ;
 ; AVX1-LABEL: PR42833:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa c+128(%rip), %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    addl b(%rip), %eax
-; AVX1-NEXT:    vmovd %eax, %xmm1
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    movl b(%rip), %eax
+; AVX1-NEXT:    addl c+128(%rip), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    vmovdqa c+128(%rip), %xmm1
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddd %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vmovdqa c+144(%rip), %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vmovdqa d+144(%rip), %xmm2
 ; AVX1-NEXT:    vpsubd c+144(%rip), %xmm2, %xmm2
-; AVX1-NEXT:    vmovups %ymm1, c+128(%rip)
-; AVX1-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vmovups %ymm0, c+128(%rip)
+; AVX1-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm0
 ; AVX1-NEXT:    vmovdqa d+128(%rip), %xmm1
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vmovdqa d+176(%rip), %xmm1
@@ -314,20 +314,20 @@ define void @PR42833() {
 ;
 ; XOP-LABEL: PR42833:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vmovdqa c+128(%rip), %xmm0
-; XOP-NEXT:    vmovd %xmm0, %eax
-; XOP-NEXT:    addl b(%rip), %eax
-; XOP-NEXT:    vmovd %eax, %xmm1
-; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
-; XOP-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
+; XOP-NEXT:    movl b(%rip), %eax
+; XOP-NEXT:    addl c+128(%rip), %eax
+; XOP-NEXT:    vmovd %eax, %xmm0
+; XOP-NEXT:    vmovdqa c+128(%rip), %xmm1
+; XOP-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; XOP-NEXT:    vpaddd %xmm1, %xmm1, %xmm2
 ; XOP-NEXT:    vmovdqa c+144(%rip), %xmm3
 ; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
 ; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
 ; XOP-NEXT:    vmovdqa d+144(%rip), %xmm2
 ; XOP-NEXT:    vpsubd c+144(%rip), %xmm2, %xmm2
-; XOP-NEXT:    vmovups %ymm1, c+128(%rip)
-; XOP-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
+; XOP-NEXT:    vmovups %ymm0, c+128(%rip)
+; XOP-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm0
 ; XOP-NEXT:    vmovdqa d+128(%rip), %xmm1
 ; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; XOP-NEXT:    vmovdqa d+176(%rip), %xmm1

llvm/test/CodeGen/X86/pr45378.ll

Lines changed: 6 additions & 11 deletions
@@ -76,28 +76,23 @@ define i1 @parseHeaders2_scalar_and(i64 * %ptr) nounwind {
 ; SSE2-LABEL: parseHeaders2_scalar_and:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqu (%rdi), %xmm0
-; SSE2-NEXT:    movq %xmm0, %rax
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT:    movq %xmm0, %rcx
-; SSE2-NEXT:    testq %rcx, %rax
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    testq %rax, (%rdi)
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: parseHeaders2_scalar_and:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqu (%rdi), %xmm0
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
-; SSE41-NEXT:    testq %rcx, %rax
+; SSE41-NEXT:    movq (%rdi), %rax
+; SSE41-NEXT:    testq %rax, 8(%rdi)
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: parseHeaders2_scalar_and:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
-; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX-NEXT:    testq %rcx, %rax
+; AVX-NEXT:    movq (%rdi), %rax
+; AVX-NEXT:    testq %rax, 8(%rdi)
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
   %vptr = bitcast i64 * %ptr to <2 x i64> *
