Skip to content

Commit dc1fade

Browse files
authored
[MCP] Enhance MCP copy Instruction removal for special case(reapply) (#74239)
Machine Copy Propagation Pass may lose some opportunities to further remove the redundant copy instructions during the ForwardCopyPropagateBlock procedure. When we Clobber a "Def" register, we also need to remove the record from the copy maps that indicates "Src" defined "Def" to ensure the correct semantics of the ClobberRegister function. This patch reapplies #70778 and addresses the corner-case bug #73512 specific to the AMDGPU backend. Additionally, it refines the criteria for removing empty records from the copy maps, thereby enhancing overall safety. For more information, please see the code generated for the C++ test case in "vector.body" after the MCP Pass: https://gcc.godbolt.org/z/nK4oMaWv5.
1 parent c019ed9 commit dc1fade

14 files changed

+98
-58
lines changed

llvm/lib/CodeGen/MachineCopyPropagation.cpp

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,8 +175,46 @@ class CopyTracker {
175175
if (MachineInstr *MI = I->second.MI) {
176176
std::optional<DestSourcePair> CopyOperands =
177177
isCopyInstr(*MI, TII, UseCopyInstr);
178-
markRegsUnavailable({CopyOperands->Destination->getReg().asMCReg()},
179-
TRI);
178+
179+
MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
180+
MCRegister Src = CopyOperands->Source->getReg().asMCReg();
181+
182+
markRegsUnavailable(Def, TRI);
183+
184+
// Since we clobber the destination of a copy, the semantics of Src's
185+
// "DefRegs" to contain Def is no longer effectual. We will also need
186+
// to remove the record from the copy maps that indicates Src defined
187+
// Def. Failing to do so might cause the target to miss some
188+
// opportunities to further eliminate redundant copy instructions.
189+
// Consider the following sequence during the
190+
// ForwardCopyPropagateBlock procedure:
191+
// L1: r0 = COPY r9 <- TrackMI
192+
// L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker)
193+
// L3: use r0 <- Remove L2 from MaybeDeadCopies
194+
// L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker)
195+
// L5: r0 = COPY r8 <- Remove NopCopy
196+
for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
197+
auto SrcCopy = Copies.find(SrcUnit);
198+
if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) {
199+
// If SrcCopy defines multiple values, we only need
200+
// to erase the record for Def in DefRegs.
201+
for (auto itr = SrcCopy->second.DefRegs.begin();
202+
itr != SrcCopy->second.DefRegs.end(); itr++) {
203+
if (*itr == Def) {
204+
SrcCopy->second.DefRegs.erase(itr);
205+
// If DefRegs becomes empty after removal, we can remove the
206+
// SrcCopy from the tracker's copy maps. We only remove those
207+
// entries that solely record that Def is defined by Src. If an
208+
// entry also contains the definition record of other Def
209+
// registers, it cannot be cleared.
210+
if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) {
211+
Copies.erase(SrcCopy);
212+
}
213+
break;
214+
}
215+
}
216+
}
217+
}
180218
}
181219
// Now we can erase the copy.
182220
Copies.erase(I);
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
2+
# RUN: llc -march=amdgcn -mcpu=gfx900 %s -o - -run-pass machine-cp -verify-machineinstrs | FileCheck %s
3+
4+
# The MachineCopyPropagation Pass should not treat the subsequent
5+
# instruction "$sgpr2_sgpr3 = COPY $sgpr6_sgpr7" as a NopCopy.
6+
# For detailed information, please refer to issue 73512.
7+
---
8+
name: foo
9+
body: |
10+
bb.0.entry:
11+
liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
12+
13+
; CHECK-LABEL: name: foo
14+
; CHECK: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
15+
; CHECK-NEXT: {{ $}}
16+
; CHECK-NEXT: $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
17+
; CHECK-NEXT: S_NOP 0, implicit-def $sgpr0
18+
; CHECK-NEXT: $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
19+
; CHECK-NEXT: S_NOP 0, implicit $sgpr2_sgpr3
20+
$sgpr2_sgpr3 = COPY $sgpr6_sgpr7
21+
$sgpr0 = COPY $sgpr3
22+
S_NOP 0, implicit-def $sgpr0
23+
$sgpr3 = COPY killed $sgpr5
24+
$sgpr2_sgpr3 = COPY $sgpr6_sgpr7
25+
S_NOP 0, implicit $sgpr2_sgpr3
26+
...

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -637,7 +637,6 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
637637
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
638638
; CHECK-NEXT: vfabs.v v16, v24, v0.t
639639
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
640-
; CHECK-NEXT: vmv1r.v v0, v1
641640
; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t
642641
; CHECK-NEXT: frflags a0
643642
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma

llvm/test/CodeGen/X86/shift-i128.ll

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,6 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
347347
; i686-NEXT: movl %edx, %ecx
348348
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
349349
; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
350-
; i686-NEXT: movl %edx, %ecx
351350
; i686-NEXT: shrl %cl, %esi
352351
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
353352
; i686-NEXT: movl %esi, 28(%ecx)
@@ -489,7 +488,6 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
489488
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
490489
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
491490
; i686-NEXT: shrdl %cl, %esi, %ebx
492-
; i686-NEXT: movl %edx, %ecx
493491
; i686-NEXT: sarl %cl, %ebp
494492
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
495493
; i686-NEXT: movl %ebp, 28(%ecx)
@@ -623,11 +621,9 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
623621
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
624622
; i686-NEXT: shll %cl, %edi
625623
; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
626-
; i686-NEXT: movl %ecx, %edi
627624
; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
628625
; i686-NEXT: negl %ebp
629626
; i686-NEXT: movl 64(%esp,%ebp), %esi
630-
; i686-NEXT: movl %edi, %ecx
631627
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
632628
; i686-NEXT: movl (%esp), %edi # 4-byte Reload
633629
; i686-NEXT: shldl %cl, %edi, %esi

llvm/test/CodeGen/X86/shift-i256.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,6 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
7878
; CHECK-NEXT: movl %eax, %ecx
7979
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
8080
; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
81-
; CHECK-NEXT: movl %eax, %ecx
8281
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
8382
; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
8483
; CHECK-NEXT: movl 28(%esp,%ebp), %edx

llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1201,7 +1201,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
12011201
; X86-NEXT: movl %edx, %ebp
12021202
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
12031203
; X86-NEXT: movl %eax, %ebx
1204-
; X86-NEXT: addl %ebp, %ebx
1204+
; X86-NEXT: addl %edx, %ebx
12051205
; X86-NEXT: adcl $0, %ebp
12061206
; X86-NEXT: movl %ecx, %eax
12071207
; X86-NEXT: movl %ecx, %esi

llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14447,7 +14447,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
1444714447
; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
1444814448
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm3
1444914449
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14450-
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm10
1445114450
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7]
1445214451
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
1445314452
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
@@ -14483,7 +14482,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
1448314482
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
1448414483
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1448514484
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm11
14486-
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
14485+
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7]
1448714486
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7
1448814487
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3],xmm3[4],xmm7[5,6,7]
1448914488
; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
@@ -14516,7 +14515,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
1451614515
; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1451714516
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0
1451814517
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14519-
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
14518+
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
1452014519
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
1452114520
; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
1452214521
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
@@ -14530,8 +14529,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
1453014529
; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
1453114530
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 {%k1} # 16-byte Folded Reload
1453214531
; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14533-
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7]
14534-
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm18
14532+
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
14533+
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm18
1453514534
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm25
1453614535
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
1453714536
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7]
@@ -14738,7 +14737,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
1473814737
; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm27
1473914738
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7]
1474014739
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm19
14741-
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16
1474214740
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
1474314741
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
1474414742
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7]
@@ -14747,7 +14745,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
1474714745
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
1474814746
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128>
1474914747
; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
14750-
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm1
1475114748
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7]
1475214749
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0]
1475314750
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
@@ -14823,14 +14820,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
1482314820
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12
1482414821
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
1482514822
; AVX512DQ-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14826-
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7]
14827-
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm13
14828-
; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14823+
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
14824+
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm13
14825+
; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1482914826
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[0,1,0,1]
1483014827
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7,8,9,10],ymm15[11],ymm12[12,13,14,15]
1483114828
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5
14832-
; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4
14833-
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7]
14829+
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm4
14830+
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
1483414831
; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm10
1483514832
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3,4,5],xmm10[6],xmm15[7]
1483614833
; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]

llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8490,12 +8490,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
84908490
; AVX512F-NEXT: vpermt2q %zmm31, %zmm23, %zmm12
84918491
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4]
84928492
; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8493-
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
8493+
; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0
84948494
; AVX512F-NEXT: vpermt2q %zmm31, %zmm24, %zmm0
84958495
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
84968496
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5]
84978497
; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8498-
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
8498+
; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0
84998499
; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm0
85008500
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
85018501
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6]

llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2480,7 +2480,6 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
24802480
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
24812481
; SSE-NEXT: psllq $48, %xmm0
24822482
; SSE-NEXT: packuswb %xmm1, %xmm0
2483-
; SSE-NEXT: movdqa %xmm7, %xmm4
24842483
; SSE-NEXT: movdqa %xmm7, %xmm1
24852484
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
24862485
; SSE-NEXT: pandn %xmm5, %xmm1
@@ -2537,7 +2536,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
25372536
; SSE-NEXT: pandn %xmm1, %xmm2
25382537
; SSE-NEXT: movdqa %xmm8, %xmm1
25392538
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2540-
; SSE-NEXT: movdqa %xmm4, %xmm0
2539+
; SSE-NEXT: movdqa %xmm7, %xmm0
25412540
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
25422541
; SSE-NEXT: pandn %xmm4, %xmm0
25432542
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill

llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,13 +1181,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
11811181
; SSE-NEXT: pandn %xmm9, %xmm4
11821182
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11831183
; SSE-NEXT: movdqa %xmm9, %xmm11
1184-
; SSE-NEXT: pand %xmm1, %xmm11
1185-
; SSE-NEXT: movdqa %xmm1, %xmm4
1184+
; SSE-NEXT: pand %xmm10, %xmm11
1185+
; SSE-NEXT: movdqa %xmm10, %xmm4
11861186
; SSE-NEXT: pandn %xmm0, %xmm4
11871187
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11881188
; SSE-NEXT: movdqa 96(%rdi), %xmm13
11891189
; SSE-NEXT: movdqa %xmm13, %xmm4
1190-
; SSE-NEXT: pand %xmm1, %xmm4
1190+
; SSE-NEXT: pand %xmm10, %xmm4
11911191
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11921192
; SSE-NEXT: movdqa 176(%rdi), %xmm4
11931193
; SSE-NEXT: movdqa %xmm4, %xmm10

llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1024,8 +1024,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
10241024
; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535]
10251025
; SSE-NEXT: movdqa %xmm9, %xmm7
10261026
; SSE-NEXT: pand %xmm14, %xmm7
1027-
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1028-
; SSE-NEXT: movdqa %xmm6, %xmm15
1027+
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1028+
; SSE-NEXT: movdqa %xmm5, %xmm15
10291029
; SSE-NEXT: pand %xmm14, %xmm15
10301030
; SSE-NEXT: movdqa %xmm11, %xmm3
10311031
; SSE-NEXT: pandn %xmm8, %xmm3
@@ -2148,7 +2148,6 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
21482148
; SSE-NEXT: movdqa %xmm5, %xmm9
21492149
; SSE-NEXT: pand %xmm13, %xmm9
21502150
; SSE-NEXT: por %xmm0, %xmm9
2151-
; SSE-NEXT: movdqa %xmm6, %xmm3
21522151
; SSE-NEXT: movdqa %xmm6, %xmm0
21532152
; SSE-NEXT: pand %xmm13, %xmm0
21542153
; SSE-NEXT: pandn %xmm10, %xmm13
@@ -2185,7 +2184,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
21852184
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
21862185
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
21872186
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2188-
; SSE-NEXT: pandn %xmm3, %xmm2
2187+
; SSE-NEXT: pandn %xmm6, %xmm2
21892188
; SSE-NEXT: por %xmm10, %xmm2
21902189
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
21912190
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
@@ -5451,19 +5450,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
54515450
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
54525451
; SSE-NEXT: pand %xmm14, %xmm6
54535452
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5454-
; SSE-NEXT: movdqa %xmm0, %xmm3
5453+
; SSE-NEXT: movdqa %xmm14, %xmm3
54555454
; SSE-NEXT: movdqa %xmm11, %xmm6
54565455
; SSE-NEXT: pandn %xmm11, %xmm3
54575456
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5458-
; SSE-NEXT: pand %xmm0, %xmm5
5457+
; SSE-NEXT: pand %xmm14, %xmm5
54595458
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
54605459
; SSE-NEXT: movdqa %xmm2, %xmm3
5461-
; SSE-NEXT: pand %xmm0, %xmm3
5460+
; SSE-NEXT: pand %xmm14, %xmm3
54625461
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5463-
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5464-
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5465-
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5466-
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5462+
; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5463+
; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5464+
; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5465+
; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
54675466
; SSE-NEXT: pandn %xmm1, %xmm0
54685467
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
54695468
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload

llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11212,7 +11212,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
1121211212
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1121311213
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9
1121411214
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15
11215-
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23
1121611215
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
1121711216
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
1121811217
; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm9
@@ -11289,7 +11288,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
1128911288
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10
1129011289
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9
1129111290
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15
11292-
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31
1129311291
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
1129411292
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1129511293
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
@@ -11302,7 +11300,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
1130211300
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
1130311301
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1130411302
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9
11305-
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm11
11303+
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11
1130611304
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15
1130711305
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
1130811306
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]

0 commit comments

Comments
 (0)