Skip to content

Commit 816b3cc

Browse files
committed
SplitKit: Fix rematerialization undoing subclass based split
This fixes an allocation failure in the new test. In cases where getLargestLegalSuperClass can inflate the register class, rematerialization could effectively undo a split which was done to inflate the register class, if the defining instruction can only write a subclass and the use can read the superclass. Some of the x86 test changes look like improvements, but some are likely regressions. I'm not entirely sure this is the correct place to fix this. It also seems more complicated than necessary, but the decision to change the register class is far removed from the point where the decision to split the virtual register is made. I'm also not sure if this should be considering the register classes of all the use indexes in getUseSlots, rather than just checking if this use index instruction reads the register.
1 parent 360630b commit 816b3cc

File tree

7 files changed

+271
-71
lines changed

7 files changed

+271
-71
lines changed

llvm/lib/CodeGen/SplitKit.cpp

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,38 @@ SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg,
588588
return Def;
589589
}
590590

591+
bool SplitEditor::rematWillIncreaseRestriction(const MachineInstr *DefMI,
592+
MachineBasicBlock &MBB,
593+
SlotIndex UseIdx) const {
594+
if (!DefMI)
595+
return false;
596+
597+
const MachineInstr *UseMI = LIS.getInstructionFromIndex(UseIdx);
598+
if (!UseMI)
599+
return false;
600+
601+
Register Reg = Edit->getReg();
602+
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
603+
604+
// We want to find the register class that can be inflated to after the split
605+
// occurs in recomputeRegClass
606+
const TargetRegisterClass *SuperRC =
607+
TRI.getLargestLegalSuperClass(RC, *MBB.getParent());
608+
609+
// We want to compute the static register class constraint for the instruction
610+
// def. If it is a smaller subclass than getLargestLegalSuperClass at the use
611+
// site, then rematerializing it will increase the constraints.
612+
const TargetRegisterClass *DefConstrainRC =
613+
DefMI->getRegClassConstraintEffectForVReg(Reg, SuperRC, &TII, &TRI,
614+
/*ExploreBundle=*/true);
615+
616+
const TargetRegisterClass *UseConstrainRC =
617+
UseMI->getRegClassConstraintEffectForVReg(Reg, SuperRC, &TII, &TRI,
618+
/*ExploreBundle=*/true);
619+
620+
return UseConstrainRC->hasSubClass(DefConstrainRC);
621+
}
622+
591623
VNInfo *SplitEditor::defFromParent(unsigned RegIdx, const VNInfo *ParentVNI,
592624
SlotIndex UseIdx, MachineBasicBlock &MBB,
593625
MachineBasicBlock::iterator I) {
@@ -609,9 +641,16 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx, const VNInfo *ParentVNI,
609641
LiveRangeEdit::Remat RM(ParentVNI);
610642
RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
611643
if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) {
612-
Def = Edit->rematerializeAt(MBB, I, Reg, RM, TRI, Late);
613-
++NumRemats;
614-
DidRemat = true;
644+
if (!rematWillIncreaseRestriction(RM.OrigMI, MBB, UseIdx)) {
645+
Def = Edit->rematerializeAt(MBB, I, Reg, RM, TRI, Late);
646+
++NumRemats;
647+
DidRemat = true;
648+
} else {
649+
LLVM_DEBUG(
650+
dbgs() << "skipping rematerialize of " << printReg(Reg) << " at "
651+
<< UseIdx
652+
<< " since it will increase register class restrictions\n");
653+
}
615654
}
616655
}
617656
if (!DidRemat) {

llvm/lib/CodeGen/SplitKit.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,12 @@ class LLVM_LIBRARY_VISIBILITY SplitEditor {
379379
/// predecessors in case of a phi definition.
380380
void forceRecomputeVNI(const VNInfo &ParentVNI);
381381

382+
/// \return true if rematerializing \p DefMI at \p UseIdx will make the
383+
/// register class requirements stricter at the use.
384+
bool rematWillIncreaseRestriction(const MachineInstr *DefMI,
385+
MachineBasicBlock &MBB,
386+
SlotIndex UseIdx) const;
387+
382388
/// defFromParent - Define Reg from ParentVNI at UseIdx using either
383389
/// rematerialization or a COPY from parent. Return the new value.
384390
VNInfo *defFromParent(unsigned RegIdx, const VNInfo *ParentVNI,
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-regalloc -start-before=greedy,2 -stop-before=virtregrewriter,2 -o - %s | FileCheck %s
3+
# FIXME: Assert if run to end
4+
5+
# The V_MFMA_F32_32X32X1F32_vgprcd_e64 as written requires 66 VGPRs
6+
# to allocate, but the register budget (with a forced AGPR usage and
7+
# occupancy 4) permits 64 VGPRs and 64 AGPRs, so we need to force a
8+
# copy from VGPR to AGPR. The minimal copies required for this need to
9+
# copy %3 and %4 V_MOV_B32s into temporary AGPRs for use by the MFMA.
10+
11+
# Previously we would attempt a register subclass based split, but
12+
# immediately rematerialize the V_MOV_B32 into the new temporary
13+
# register, defeating the point of introducing the split. The
14+
# allocation would fail since it's 2 registers short.
15+
16+
---
17+
name: temp_vgpr_to_agpr_should_not_undo_split_with_remat
18+
tracksRegLiveness: true
19+
machineFunctionInfo:
20+
isEntryFunction: true
21+
scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
22+
stackPtrOffsetReg: '$sgpr32'
23+
argumentInfo:
24+
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
25+
kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
26+
workGroupIDX: { reg: '$sgpr6' }
27+
privateSegmentWaveByteOffset: { reg: '$sgpr7' }
28+
workItemIDX: { reg: '$vgpr0' }
29+
occupancy: 4
30+
sgprForEXECCopy: '$sgpr100_sgpr101'
31+
body: |
32+
bb.0:
33+
liveins: $vgpr0, $sgpr4_sgpr5
34+
35+
; CHECK-LABEL: name: temp_vgpr_to_agpr_should_not_undo_split_with_remat
36+
; CHECK: liveins: $vgpr0, $sgpr4_sgpr5
37+
; CHECK-NEXT: {{ $}}
38+
; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
39+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
40+
; CHECK-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
41+
; CHECK-NEXT: [[V_LSHLREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 7, [[COPY]], implicit $exec
42+
; CHECK-NEXT: undef [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub28_sub29_sub30_sub31:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, [[V_LSHLREV_B32_e32_]], 112, 0, implicit $exec :: (load (s128), addrspace 1)
43+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub24_sub25_sub26_sub27:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, [[V_LSHLREV_B32_e32_]], 96, 0, implicit $exec :: (load (s128), align 32, addrspace 1)
44+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub20_sub21_sub22_sub23:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, [[V_LSHLREV_B32_e32_]], 80, 0, implicit $exec :: (load (s128), addrspace 1)
45+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub16_sub17_sub18_sub19:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, [[V_LSHLREV_B32_e32_]], 64, 0, implicit $exec :: (load (s128), align 64, addrspace 1)
46+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub12_sub13_sub14_sub15:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, [[V_LSHLREV_B32_e32_]], 48, 0, implicit $exec :: (load (s128), addrspace 1)
47+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub8_sub9_sub10_sub11:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, [[V_LSHLREV_B32_e32_]], 32, 0, implicit $exec :: (load (s128), align 32, addrspace 1)
48+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub4_sub5_sub6_sub7:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, [[V_LSHLREV_B32_e32_]], 16, 0, implicit $exec :: (load (s128), addrspace 1)
49+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, [[V_LSHLREV_B32_e32_]], 0, 0, implicit $exec :: (load (s128), align 128, addrspace 1)
50+
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
51+
; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
52+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_1024_align2 = V_MFMA_F32_32X32X1F32_mac_vgprcd_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[GLOBAL_LOAD_DWORDX4_SADDR]], 0, 0, 0, implicit $mode, implicit $exec
53+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_e32_1]]
54+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_e32_]]
55+
; CHECK-NEXT: early-clobber %5:vreg_1024_align2 = V_MFMA_F32_32X32X1F32_vgprcd_e64 [[COPY2]], [[COPY1]], [[GLOBAL_LOAD_DWORDX4_SADDR]], 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec
56+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub2:vreg_1024_align2 = COPY %5.sub0
57+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub3:vreg_1024_align2 = COPY %5.sub1
58+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub4:vreg_1024_align2 = COPY %5.sub2
59+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub5:vreg_1024_align2 = COPY %5.sub3
60+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub6:vreg_1024_align2 = COPY %5.sub4
61+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub7:vreg_1024_align2 = COPY %5.sub5
62+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub8:vreg_1024_align2 = COPY %5.sub6
63+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub9:vreg_1024_align2 = COPY %5.sub7
64+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub10:vreg_1024_align2 = COPY %5.sub8
65+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub11:vreg_1024_align2 = COPY %5.sub9
66+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub12:vreg_1024_align2 = COPY %5.sub10
67+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub13:vreg_1024_align2 = COPY %5.sub11
68+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub14:vreg_1024_align2 = COPY %5.sub12
69+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub15:vreg_1024_align2 = COPY %5.sub13
70+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub16:vreg_1024_align2 = COPY %5.sub14
71+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub17:vreg_1024_align2 = COPY %5.sub15
72+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub18:vreg_1024_align2 = COPY %5.sub16
73+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub19:vreg_1024_align2 = COPY %5.sub17
74+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub20:vreg_1024_align2 = COPY %5.sub18
75+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub21:vreg_1024_align2 = COPY %5.sub19
76+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub22:vreg_1024_align2 = COPY %5.sub20
77+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub23:vreg_1024_align2 = COPY %5.sub21
78+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub24:vreg_1024_align2 = COPY %5.sub22
79+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub25:vreg_1024_align2 = COPY %5.sub23
80+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub26:vreg_1024_align2 = COPY %5.sub24
81+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub27:vreg_1024_align2 = COPY %5.sub25
82+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub28:vreg_1024_align2 = COPY %5.sub26
83+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub29:vreg_1024_align2 = COPY %5.sub27
84+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub30:vreg_1024_align2 = COPY %5.sub28
85+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]].sub31:vreg_1024_align2 = COPY %5.sub29
86+
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_1024_align2 = V_MFMA_F32_32X32X1F32_mac_vgprcd_e64 [[COPY2]], [[COPY1]], [[GLOBAL_LOAD_DWORDX4_SADDR]], 0, 0, 0, implicit $mode, implicit $exec
87+
; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
88+
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_2]], [[GLOBAL_LOAD_DWORDX4_SADDR]].sub24_sub25_sub26_sub27, renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
89+
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_2]], [[GLOBAL_LOAD_DWORDX4_SADDR]].sub28_sub29_sub30_sub31, renamable $sgpr0_sgpr1, 112, 0, implicit $exec :: (store (s128), addrspace 1)
90+
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_2]], [[GLOBAL_LOAD_DWORDX4_SADDR]].sub16_sub17_sub18_sub19, renamable $sgpr0_sgpr1, 64, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
91+
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_2]], [[GLOBAL_LOAD_DWORDX4_SADDR]].sub20_sub21_sub22_sub23, renamable $sgpr0_sgpr1, 80, 0, implicit $exec :: (store (s128), addrspace 1)
92+
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_2]], [[GLOBAL_LOAD_DWORDX4_SADDR]].sub8_sub9_sub10_sub11, renamable $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
93+
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_2]], [[GLOBAL_LOAD_DWORDX4_SADDR]].sub12_sub13_sub14_sub15, renamable $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
94+
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_2]], [[GLOBAL_LOAD_DWORDX4_SADDR]].sub0_sub1_sub2_sub3, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
95+
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_2]], [[GLOBAL_LOAD_DWORDX4_SADDR]].sub4_sub5_sub6_sub7, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
96+
; CHECK-NEXT: S_ENDPGM 0
97+
S_NOP 0, implicit-def $agpr0
98+
%0:vgpr_32 = COPY $vgpr0
99+
renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
100+
%1:vgpr_32 = V_LSHLREV_B32_e32 7, %0, implicit $exec
101+
undef %2.sub28_sub29_sub30_sub31:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, %1, 112, 0, implicit $exec :: (load (s128), addrspace 1)
102+
%2.sub24_sub25_sub26_sub27:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, %1, 96, 0, implicit $exec :: (load (s128), align 32, addrspace 1)
103+
%2.sub20_sub21_sub22_sub23:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, %1, 80, 0, implicit $exec :: (load (s128), addrspace 1)
104+
%2.sub16_sub17_sub18_sub19:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, %1, 64, 0, implicit $exec :: (load (s128), align 64, addrspace 1)
105+
%2.sub12_sub13_sub14_sub15:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, %1, 48, 0, implicit $exec :: (load (s128), addrspace 1)
106+
%2.sub8_sub9_sub10_sub11:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, %1, 32, 0, implicit $exec :: (load (s128), align 32, addrspace 1)
107+
%2.sub4_sub5_sub6_sub7:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, %1, 16, 0, implicit $exec :: (load (s128), addrspace 1)
108+
%2.sub0_sub1_sub2_sub3:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4_SADDR renamable $sgpr0_sgpr1, %1, 0, 0, implicit $exec :: (load (s128), align 128, addrspace 1)
109+
%3:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
110+
%4:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
111+
%2:vreg_1024_align2 = V_MFMA_F32_32X32X1F32_mac_vgprcd_e64 %3, %4, %2, 0, 0, 0, implicit $mode, implicit $exec
112+
early-clobber %5:vreg_1024_align2 = V_MFMA_F32_32X32X1F32_vgprcd_e64 %3, %4, %2, 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec
113+
%2.sub2:vreg_1024_align2 = COPY %5.sub0
114+
%2.sub3:vreg_1024_align2 = COPY %5.sub1
115+
%2.sub4:vreg_1024_align2 = COPY %5.sub2
116+
%2.sub5:vreg_1024_align2 = COPY %5.sub3
117+
%2.sub6:vreg_1024_align2 = COPY %5.sub4
118+
%2.sub7:vreg_1024_align2 = COPY %5.sub5
119+
%2.sub8:vreg_1024_align2 = COPY %5.sub6
120+
%2.sub9:vreg_1024_align2 = COPY %5.sub7
121+
%2.sub10:vreg_1024_align2 = COPY %5.sub8
122+
%2.sub11:vreg_1024_align2 = COPY %5.sub9
123+
%2.sub12:vreg_1024_align2 = COPY %5.sub10
124+
%2.sub13:vreg_1024_align2 = COPY %5.sub11
125+
%2.sub14:vreg_1024_align2 = COPY %5.sub12
126+
%2.sub15:vreg_1024_align2 = COPY %5.sub13
127+
%2.sub16:vreg_1024_align2 = COPY %5.sub14
128+
%2.sub17:vreg_1024_align2 = COPY %5.sub15
129+
%2.sub18:vreg_1024_align2 = COPY %5.sub16
130+
%2.sub19:vreg_1024_align2 = COPY %5.sub17
131+
%2.sub20:vreg_1024_align2 = COPY %5.sub18
132+
%2.sub21:vreg_1024_align2 = COPY %5.sub19
133+
%2.sub22:vreg_1024_align2 = COPY %5.sub20
134+
%2.sub23:vreg_1024_align2 = COPY %5.sub21
135+
%2.sub24:vreg_1024_align2 = COPY %5.sub22
136+
%2.sub25:vreg_1024_align2 = COPY %5.sub23
137+
%2.sub26:vreg_1024_align2 = COPY %5.sub24
138+
%2.sub27:vreg_1024_align2 = COPY %5.sub25
139+
%2.sub28:vreg_1024_align2 = COPY %5.sub26
140+
%2.sub29:vreg_1024_align2 = COPY %5.sub27
141+
%2.sub30:vreg_1024_align2 = COPY %5.sub28
142+
%2.sub31:vreg_1024_align2 = COPY %5.sub29
143+
%2:vreg_1024_align2 = V_MFMA_F32_32X32X1F32_mac_vgprcd_e64 %3, %4, %2, 0, 0, 0, implicit $mode, implicit $exec
144+
%6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
145+
GLOBAL_STORE_DWORDX4_SADDR %6, %2.sub24_sub25_sub26_sub27, renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
146+
GLOBAL_STORE_DWORDX4_SADDR %6, %2.sub28_sub29_sub30_sub31, renamable $sgpr0_sgpr1, 112, 0, implicit $exec :: (store (s128), addrspace 1)
147+
GLOBAL_STORE_DWORDX4_SADDR %6, %2.sub16_sub17_sub18_sub19, renamable $sgpr0_sgpr1, 64, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
148+
GLOBAL_STORE_DWORDX4_SADDR %6, %2.sub20_sub21_sub22_sub23, renamable $sgpr0_sgpr1, 80, 0, implicit $exec :: (store (s128), addrspace 1)
149+
GLOBAL_STORE_DWORDX4_SADDR %6, %2.sub8_sub9_sub10_sub11, renamable $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
150+
GLOBAL_STORE_DWORDX4_SADDR %6, %2.sub12_sub13_sub14_sub15, renamable $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
151+
GLOBAL_STORE_DWORDX4_SADDR %6, %2.sub0_sub1_sub2_sub3, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
152+
GLOBAL_STORE_DWORDX4_SADDR %6, %2.sub4_sub5_sub6_sub7, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
153+
S_ENDPGM 0
154+
155+
...

llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,7 @@ define <4 x i32> @eq_or_eq_ult_2_fail_multiuse(<4 x i32> %x) {
106106
; AVX512: # %bb.0:
107107
; AVX512-NEXT: subq $24, %rsp
108108
; AVX512-NEXT: .cfi_def_cfa_offset 32
109-
; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
110-
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
109+
; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
111110
; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
112111
; AVX512-NEXT: callq use.v4.i32@PLT
113112
; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload

0 commit comments

Comments
 (0)