-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[PowerPC] Add phony subregisters to cover the high half of the VSX registers. #94628
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[PowerPC] Add phony subregisters to cover the high half of the VSX registers. #94628
Conversation
@llvm/pr-subscribers-llvm-regalloc @llvm/pr-subscribers-backend-powerpc Author: Stefan Pintilie (stefanp-ibm) ChangesOn PowerPC we have VSX registers which overlap with floating point registers. Note: This patch is still Work in Progress as there are a number of LIT Full diff: https://github.com/llvm/llvm-project/pull/94628.diff 4 Files Affected:
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 9e8da59615df..a2d089f32cab 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -435,7 +435,91 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
}
- assert(checkAllSuperRegsMarked(Reserved));
+ // Mark phony regsiters for the VSR high bits as reserved so that they are
+ // not used.
+ Reserved.set(PPC::FH0);
+ Reserved.set(PPC::FH1);
+ Reserved.set(PPC::FH2);
+ Reserved.set(PPC::FH3);
+ Reserved.set(PPC::FH4);
+ Reserved.set(PPC::FH5);
+ Reserved.set(PPC::FH6);
+ Reserved.set(PPC::FH7);
+ Reserved.set(PPC::FH8);
+ Reserved.set(PPC::FH9);
+ Reserved.set(PPC::FH10);
+ Reserved.set(PPC::FH11);
+ Reserved.set(PPC::FH12);
+ Reserved.set(PPC::FH13);
+ Reserved.set(PPC::FH14);
+ Reserved.set(PPC::FH15);
+ Reserved.set(PPC::FH16);
+ Reserved.set(PPC::FH17);
+ Reserved.set(PPC::FH18);
+ Reserved.set(PPC::FH19);
+ Reserved.set(PPC::FH20);
+ Reserved.set(PPC::FH21);
+ Reserved.set(PPC::FH22);
+ Reserved.set(PPC::FH23);
+ Reserved.set(PPC::FH24);
+ Reserved.set(PPC::FH25);
+ Reserved.set(PPC::FH26);
+ Reserved.set(PPC::FH27);
+ Reserved.set(PPC::FH28);
+ Reserved.set(PPC::FH29);
+ Reserved.set(PPC::FH30);
+ Reserved.set(PPC::FH31);
+
+ Reserved.set(PPC::VFH0);
+ Reserved.set(PPC::VFH1);
+ Reserved.set(PPC::VFH2);
+ Reserved.set(PPC::VFH3);
+ Reserved.set(PPC::VFH4);
+ Reserved.set(PPC::VFH5);
+ Reserved.set(PPC::VFH6);
+ Reserved.set(PPC::VFH7);
+ Reserved.set(PPC::VFH8);
+ Reserved.set(PPC::VFH9);
+ Reserved.set(PPC::VFH10);
+ Reserved.set(PPC::VFH11);
+ Reserved.set(PPC::VFH12);
+ Reserved.set(PPC::VFH13);
+ Reserved.set(PPC::VFH14);
+ Reserved.set(PPC::VFH15);
+ Reserved.set(PPC::VFH16);
+ Reserved.set(PPC::VFH17);
+ Reserved.set(PPC::VFH18);
+ Reserved.set(PPC::VFH19);
+ Reserved.set(PPC::VFH20);
+ Reserved.set(PPC::VFH21);
+ Reserved.set(PPC::VFH22);
+ Reserved.set(PPC::VFH23);
+ Reserved.set(PPC::VFH24);
+ Reserved.set(PPC::VFH25);
+ Reserved.set(PPC::VFH26);
+ Reserved.set(PPC::VFH27);
+ Reserved.set(PPC::VFH28);
+ Reserved.set(PPC::VFH29);
+ Reserved.set(PPC::VFH30);
+ Reserved.set(PPC::VFH31);
+
+ assert(checkAllSuperRegsMarked(Reserved,
+ {PPC::FH0, PPC::FH1, PPC::FH2, PPC::FH3,
+ PPC::FH4, PPC::FH5, PPC::FH6, PPC::FH7,
+ PPC::FH8, PPC::FH9, PPC::FH10, PPC::FH11,
+ PPC::FH12, PPC::FH13, PPC::FH14, PPC::FH15,
+ PPC::FH16, PPC::FH17, PPC::FH18, PPC::FH19,
+ PPC::FH20, PPC::FH21, PPC::FH22, PPC::FH23,
+ PPC::FH24, PPC::FH25, PPC::FH26, PPC::FH27,
+ PPC::FH28, PPC::FH29, PPC::FH30, PPC::FH31,
+ PPC::VFH0, PPC::VFH1, PPC::VFH2, PPC::VFH3,
+ PPC::VFH4, PPC::VFH5, PPC::VFH6, PPC::VFH7,
+ PPC::VFH8, PPC::VFH9, PPC::VFH10, PPC::VFH11,
+ PPC::VFH12, PPC::VFH13, PPC::VFH14, PPC::VFH15,
+ PPC::VFH16, PPC::VFH17, PPC::VFH18, PPC::VFH19,
+ PPC::VFH20, PPC::VFH21, PPC::VFH22, PPC::VFH23,
+ PPC::VFH24, PPC::VFH25, PPC::VFH26, PPC::VFH27,
+ PPC::VFH28, PPC::VFH29, PPC::VFH30, PPC::VFH31}));
return Reserved;
}
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index 8a37e40414ee..30e936a157e0 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -17,6 +17,7 @@ def sub_un : SubRegIndex<1, 3>;
def sub_32 : SubRegIndex<32>;
def sub_32_hi_phony : SubRegIndex<32,32>;
def sub_64 : SubRegIndex<64>;
+def sub_64_hi_phony : SubRegIndex<64,64>;
def sub_vsx0 : SubRegIndex<128>;
def sub_vsx1 : SubRegIndex<128, 128>;
def sub_gp8_x0 : SubRegIndex<64>;
@@ -77,19 +78,19 @@ class VF<bits<5> num, string n> : PPCReg<n> {
}
// VR - One of the 32 128-bit vector registers
-class VR<VF SubReg, string n> : PPCReg<n> {
+class VR<VF SubReg, VF SubRegH, string n> : PPCReg<n> {
let HWEncoding{4-0} = SubReg.HWEncoding{4-0};
let HWEncoding{5} = 0;
- let SubRegs = [SubReg];
- let SubRegIndices = [sub_64];
+ let SubRegs = [SubReg, SubRegH];
+ let SubRegIndices = [sub_64, sub_64_hi_phony];
}
// VSRL - One of the 32 128-bit VSX registers that overlap with the scalar
// floating-point registers.
-class VSRL<FPR SubReg, string n> : PPCReg<n> {
+class VSRL<FPR SubReg, FPR SubRegH, string n> : PPCReg<n> {
let HWEncoding = SubReg.HWEncoding;
- let SubRegs = [SubReg];
- let SubRegIndices = [sub_64];
+ let SubRegs = [SubReg, SubRegH];
+ let SubRegIndices = [sub_64, sub_64_hi_phony];
}
// VSXReg - One of the VSX registers in the range vs32-vs63 with numbering
@@ -155,6 +156,12 @@ foreach Index = 0-31 in {
DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
}
+let isArtificial = 1 in {
+ foreach Index = 0-31 in {
+ def FH#Index : FPR<-1, "">;
+ }
+}
+
// Floating-point pair registers
foreach Index = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 } in {
def Fpair#Index : FPPair<"fp"#Index, Index>;
@@ -168,15 +175,21 @@ foreach Index = 0-31 in {
DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
}
+let isArtificial = 1 in {
+ foreach Index = 0-31 in {
+ def VFH#Index : VF<-1, "">;
+ }
+}
+
// Vector registers
foreach Index = 0-31 in {
- def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>,
- DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
+ def V#Index : VR<!cast<VF>("VF"#Index), !cast<VF>("VFH"#Index), "v"#Index>,
+ DwarfRegNum<[!add(Index, 77)]>;
}
// VSX registers
foreach Index = 0-31 in {
- def VSL#Index : VSRL<!cast<FPR>("F"#Index), "vs"#Index>,
+ def VSL#Index : VSRL<!cast<FPR>("F"#Index), !cast<FPR>("FH"#Index), "vs"#Index>,
DwarfRegAlias<!cast<FPR>("F"#Index)>;
}
diff --git a/llvm/test/CodeGen/PowerPC/frem.ll b/llvm/test/CodeGen/PowerPC/frem.ll
index 8cb68e60f7f9..19b4b1c9cdf9 100644
--- a/llvm/test/CodeGen/PowerPC/frem.ll
+++ b/llvm/test/CodeGen/PowerPC/frem.ll
@@ -70,7 +70,6 @@ define <4 x float> @frem4x32(<4 x float> %a, <4 x float> %b) {
; CHECK-NEXT: xscvspdpn 2, 0
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: nop
-; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-NEXT: xxmrghd 0, 1, 61
; CHECK-NEXT: xscvspdpn 1, 62
; CHECK-NEXT: xscvspdpn 2, 63
@@ -84,7 +83,6 @@ define <4 x float> @frem4x32(<4 x float> %a, <4 x float> %b) {
; CHECK-NEXT: xscvspdpn 2, 0
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: nop
-; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-NEXT: xxmrghd 0, 61, 1
; CHECK-NEXT: lxv 63, 80(1) # 16-byte Folded Reload
; CHECK-NEXT: lxv 62, 64(1) # 16-byte Folded Reload
@@ -124,11 +122,8 @@ define <2 x double> @frem2x64(<2 x double> %a, <2 x double> %b) {
; CHECK-NEXT: xscpsgndp 61, 1, 1
; CHECK-NEXT: xxswapd 1, 62
; CHECK-NEXT: xxswapd 2, 63
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; CHECK-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; CHECK-NEXT: bl fmod
; CHECK-NEXT: nop
-; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-NEXT: xxmrghd 34, 61, 1
; CHECK-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
; CHECK-NEXT: lxv 62, 48(1) # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir b/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir
new file mode 100644
index 000000000000..28a4d0347f10
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir
@@ -0,0 +1,63 @@
+# RUN: llc -mcpu=pwr10 -O3 -ppc-track-subreg-liveness -verify-machineinstrs \
+# RUN: -mtriple=powerpc64le-unknown-linux-gnu -run-pass=greedy,virtregrewriter \
+# RUN: -debug-only=regalloc -o - %s 2>&1 >/dev/null | FileCheck %s
+
+# Keep track of all of the lanemasks for various subregsiters.
+#
+# CHECK: %3 [80r,80d:0) 0@80r L000000000000000C [80r,80d:0) 0@80r weight:0.000000e+00
+# CHECK: %4 [96r,96d:0) 0@96r L0000000000003000 [96r,96d:0) 0@96r weight:0.000000e+00
+# CHECK: %5 [112r,112d:0) 0@112r L000000000000000C [112r,112d:0) 0@112r weight:0.000000e+00
+# CHECK: %6 [128r,128d:0) 0@128r L0000000000003000 [128r,128d:0) 0@128r weight:0.000000e+00
+# CHECK: %7 [144r,144d:0) 0@144r L0000000000000004 [144r,144d:0) 0@144r weight:0.000000e+00
+# CHECK: %8 [160r,160d:0) 0@160r L0000000000001000 [160r,160d:0) 0@160r weight:0.000000e+00
+# CHECK: %9 [176r,176d:0) 0@176r L0000000000000004 [176r,176d:0) 0@176r weight:0.000000e+00
+# CHECK: %10 [192r,192d:0) 0@192r L0000000000001000 [192r,192d:0) 0@192r weight:0.000000e+00
+# CHECK: %11 [208r,208d:0) 0@208r L0000000000004000 [208r,208d:0) 0@208r weight:0.000000e+00
+# CHECK: %12 [224r,224d:0) 0@224r L0000000000010000 [224r,224d:0) 0@224r weight:0.000000e+00
+# CHECK: %13 [240r,240d:0) 0@240r L000000000000300C [240r,240d:0) 0@240r weight:0.000000e+00
+# CHECK: %14 [256r,256d:0) 0@256r L000000000003C000 [256r,256d:0) 0@256r weight:0.000000e+00
+
+
+# CHECK: 0B bb.0
+# CHECK-NEXT: liveins
+# CHECK-NEXT: 16B %0:vsrc = COPY $v2
+# CHECK-NEXT: 32B %float:fprc = COPY %0.sub_64:vsrc
+# CHECK-NEXT: 48B dead undef %pair.sub_vsx0:vsrprc = COPY $v2
+# CHECK-NEXT: 64B undef %15.sub_vsx1:vsrprc = COPY $v3
+# CHECK-NEXT: 80B dead undef %3.sub_vsx0:vsrprc = COPY %0:vsrc
+# CHECK-NEXT: 96B dead undef %4.sub_vsx1:vsrprc = COPY %0:vsrc
+# CHECK-NEXT: 112B dead undef %5.sub_vsx0:accrc = COPY %0:vsrc
+# CHECK-NEXT: 128B dead undef %6.sub_vsx1:accrc = COPY %0:vsrc
+# CHECK-NEXT: 144B dead undef %7.sub_64:vsrprc = COPY %float:fprc
+# CHECK-NEXT: 160B dead undef %8.sub_vsx1_then_sub_64:vsrprc = COPY %float:fprc
+# CHECK-NEXT: 176B dead undef %9.sub_64:accrc = COPY %float:fprc
+# CHECK-NEXT: 192B dead undef %10.sub_vsx1_then_sub_64:accrc = COPY %float:fprc
+# CHECK-NEXT: 208B dead undef %11.sub_pair1_then_sub_64:accrc = COPY %float:fprc
+# CHECK-NEXT: 224B dead undef %12.sub_pair1_then_sub_vsx1_then_sub_64:accrc = COPY %float:fprc
+# CHECK-NEXT: 240B dead undef %13.sub_pair0:accrc = COPY %15:vsrprc
+# CHECK-NEXT: 256B dead undef %14.sub_pair1:accrc = COPY %15:vsrprc
+
+
+---
+name: test
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $v2, $v3
+ %0:vsrc = COPY $v2
+ %float:fprc = COPY %0.sub_64
+ undef %pair.sub_vsx0:vsrprc = COPY $v2
+ undef %pair.sub_vsx1:vsrprc = COPY $v3
+ undef %1.sub_vsx0:vsrprc = COPY %0
+ undef %2.sub_vsx1:vsrprc = COPY %0
+ undef %3.sub_vsx0:accrc = COPY %0
+ undef %4.sub_vsx1:accrc = COPY %0
+ undef %5.sub_64:vsrprc = COPY %float
+ undef %6.sub_vsx1_then_sub_64:vsrprc = COPY %float
+ undef %7.sub_64:accrc = COPY %float
+ undef %8.sub_vsx1_then_sub_64:accrc = COPY %float
+ undef %9.sub_pair1_then_sub_64:accrc = COPY %float
+ undef %10.sub_pair1_then_sub_vsx1_then_sub_64:accrc = COPY %float
+ undef %11.sub_pair0:accrc = COPY %pair
+ undef %12.sub_pair1:accrc = COPY %pair
+...
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
LHSVals.pruneSubRegValues(LHS, ShrinkMask); | ||
RHSVals.pruneSubRegValues(LHS, ShrinkMask); | ||
} else if (TrackSubRegLiveness && !CP.getDstIdx() && CP.getSrcIdx()) { | ||
LHS.createSubRangeFrom(LIS->getVNInfoAllocator(), | ||
CP.getNewRC()->getLaneMask(), LHS); | ||
mergeSubRangeInto(LHS, RHS, TRI->getSubRegIndexLaneMask(CP.getSrcIdx()), CP, | ||
CP.getDstIdx()); | ||
LHSVals.pruneMainSegments(LHS, ShrinkMainRange); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needs a new MIR test?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have now added a test case with test/CodeGen/PowerPC/subreg-coalescer.mir
@@ -0,0 +1,63 @@ | |||
# RUN: llc -mcpu=pwr10 -O3 -ppc-track-subreg-liveness -verify-machineinstrs \ | |||
# RUN: -mtriple=powerpc64le-unknown-linux-gnu -run-pass=greedy,virtregrewriter \ | |||
# RUN: -debug-only=regalloc -o - %s 2>&1 >/dev/null | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you really need the debug output? Need REQUIRES: asserts if so.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I fixed this test in the patch that has gone in with #94363 .
@@ -0,0 +1,63 @@ | |||
# RUN: llc -mcpu=pwr10 -O3 -ppc-track-subreg-liveness -verify-machineinstrs \ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use -verify-regalloc instead of -verify-machineinstrs. Also shouldn't need -O3
CP.getNewRC()->getLaneMask(), LHS); | ||
mergeSubRangeInto(LHS, RHS, TRI->getSubRegIndexLaneMask(CP.getSrcIdx()), CP, | ||
CP.getDstIdx()); | ||
LHSVals.pruneMainSegments(LHS, ShrinkMainRange); | ||
LHSVals.pruneSubRegValues(LHS, ShrinkMask); | ||
RHSVals.pruneSubRegValues(LHS, ShrinkMask); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks no need to prune RHS
's subregvals, since RHS doesn't contain any subranges.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure what you mean by RHS doesn't contain any subregs. I think I remember seeing subregs on both the RHS and LHS in IR. Taken from one of our tests for example:
%48:vsrprc = INSERT_SUBREG %45, killed %47, %subreg.sub_vsx0
This is probably not what you mean...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure what you mean by RHS doesn't contain any subregs.
RHS
as a LiveInterval doesn't contain any subranges(if it does, another branch prior this one has handled that). Looking into pruneSubRegValues
, the method will iterate over LiveInterval's subranges and shrink range accordingly. Since RHS
doesn't contain any subranges, so RHS
should not be changed.
@@ -3672,6 +3672,14 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { | |||
// having stale segments. | |||
LHSVals.pruneMainSegments(LHS, ShrinkMainRange); | |||
|
|||
LHSVals.pruneSubRegValues(LHS, ShrinkMask); | |||
RHSVals.pruneSubRegValues(LHS, ShrinkMask); | |||
} else if (TrackSubRegLiveness && !CP.getDstIdx() && CP.getSrcIdx()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add comment like "MachineVerifier expects subrange to be created if a register is partially defined when subregliveness is enabled"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If CP has a src idx, I would expect the source value to have subranges when TrackSubRegLiveness
is on.
Bottom line, I don't see why we need this case and thus echoing what @arsenm said wrt adding a test.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIRC, it has something to do with SUBREG_TO_REG
in which neither Src nor Dst has subranges.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SUBREG_TO_REG coalescing is pretty broken. It would be great if you could migrate to not using it at all. It's fundamentally flawed and I think should be removed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure we can achieve it in short term. Grep a bit, PowerPC has the most lines of code involving SUBREG_TO_REG
comparing to X86 and AArch64.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure we can achieve it in short term. Grep a bit, PowerPC has the most lines of code involving
SUBREG_TO_REG
comparing to X86 and AArch64.
I agree we should be moving away from SUBREG_TO_REG
however I also agree that for PowerPC this is more of a long term goal. For this patch we will probably have to continue like this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible to split this change and test into its own PR
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible to split this change and test into its own PR
The issue with splitting this test and change away is that the test passes anyway unless it is accompanied by the subregsiter changes in this PR. So, unfortunately, if I split out the test and add it to top of trunk main the test will PASS even without the changes to RegisterCoalescer.cpp
. The error is exposed by the subregister changes that we made in PPCRegisterInfo.td
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's fine, pre-committing passing tests is good
6cb5de0
to
7740d7e
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this patch is ready for another review. I will also remove the [WIP] from the title as it is no longer a work in progress.
LHSVals.pruneSubRegValues(LHS, ShrinkMask); | ||
RHSVals.pruneSubRegValues(LHS, ShrinkMask); | ||
} else if (TrackSubRegLiveness && !CP.getDstIdx() && CP.getSrcIdx()) { | ||
LHS.createSubRangeFrom(LIS->getVNInfoAllocator(), | ||
CP.getNewRC()->getLaneMask(), LHS); | ||
mergeSubRangeInto(LHS, RHS, TRI->getSubRegIndexLaneMask(CP.getSrcIdx()), CP, | ||
CP.getDstIdx()); | ||
LHSVals.pruneMainSegments(LHS, ShrinkMainRange); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have now added a test case with test/CodeGen/PowerPC/subreg-coalescer.mir
@@ -3672,6 +3672,14 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { | |||
// having stale segments. | |||
LHSVals.pruneMainSegments(LHS, ShrinkMainRange); | |||
|
|||
LHSVals.pruneSubRegValues(LHS, ShrinkMask); | |||
RHSVals.pruneSubRegValues(LHS, ShrinkMask); | |||
} else if (TrackSubRegLiveness && !CP.getDstIdx() && CP.getSrcIdx()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure we can achieve it in short term. Grep a bit, PowerPC has the most lines of code involving
SUBREG_TO_REG
comparing to X86 and AArch64.
I agree we should be moving away from SUBREG_TO_REG
however I also agree that for PowerPC this is more of a long term goal. For this patch we will probably have to continue like this.
CP.getNewRC()->getLaneMask(), LHS); | ||
mergeSubRangeInto(LHS, RHS, TRI->getSubRegIndexLaneMask(CP.getSrcIdx()), CP, | ||
CP.getDstIdx()); | ||
LHSVals.pruneMainSegments(LHS, ShrinkMainRange); | ||
LHSVals.pruneSubRegValues(LHS, ShrinkMask); | ||
RHSVals.pruneSubRegValues(LHS, ShrinkMask); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure what you mean by RHS doesn't contain any subregs. I think I remember seeing subregs on both the RHS and LHS in IR. Taken from one of our tests for example:
%48:vsrprc = INSERT_SUBREG %45, killed %47, %subreg.sub_vsx0
This is probably not what you mean...
@@ -0,0 +1,27 @@ | |||
# RUN: llc -mtriple powerpc64le-unknown-linux-gnu -mcpu=pwr8 -x mir < %s \ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we pre-commit this test case? Without this PR, this case should fail verification?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we pre-commit this test case? Without this PR, this case should fail verification?
Well, the issue is that without the fix in RegisterCoalescer.cpp
the test will fail. However, without this entire patch this test will still pass. We need the fix in the register classes for this test to start failing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, I see. The HasDisjunctSubRegs
bit is false without this PR, so that MRI->shouldTrackSubRegLiveness(...)
returns false.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The change in the PPC part looks mostly good to me except one unexpected change for the dwarfregnum.
@@ -434,7 +434,6 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { | |||
} | |||
} | |||
} | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: remove this unrelated change?
def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>, | ||
DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>; | ||
def V#Index : VR<!cast<VF>("VF"#Index), !cast<VF>("VFH"#Index), "v"#Index>, | ||
DwarfRegNum<[!add(Index, 77)]>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why we remove the second !add(Index, 77)
? I think it is still needed for 32/64 bit?
bf8be4b
to
eb2dc76
Compare
@@ -750,25 +750,21 @@ entry: | |||
define <2 x double> @testDoubleImm1(<2 x double> %a, double %b) { | |||
; CHECK-64-LABEL: testDoubleImm1: | |||
; CHECK-64: # %bb.0: # %entry | |||
; CHECK-64-NEXT: # kill: def $f1 killed $f1 def $vsl1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this looks like degradation.
> renamable $f1 = COPY $f1, implicit-def $vsl1
Identity copy: renamable $f1 = COPY $f1, implicit-def $vsl1
replace by: renamable $f1 = KILL $f1, implicit-def $vsl1
The Kill
pseudo opcode is inserted from this register rewrite.
Kill
can tell the later pass that the f1/vsl1 is killed before instruction COPY
. See VirtRegRewriter::handleIdentityCopy()
. I don't think adding phony registers for the high part of vsl1 should break this part.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Kill can tell the later pass that the f1/vsl1 is killed before instruction COPY
I think f1/vsl1 is killed at the COPY, not before, so that this identity COPY is replaced by a KILL
instruction. I'm wondering how we get rid of the identity COPY after adding phony registers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I took a look at this and the copy has changed which is why the kill is no longer emitted.
Before the change:
renamable $f1 = COPY $f1, implicit-def $vsl1
After the change:
renamable $f1 = COPY $f1
Basically, we are missing the implicit-def
. There is nothing wrong with not having the kill if the implicit-def
is not there. However, I'm not sure if the implicit-def
is necessary here or not.
A few more details here. This is as intended. As you mentioned, the kill is added by handleIdentityCopy()
and is guarded by if (MI.getOperand(1).isUndef() || MI.getNumOperands() > 2) {
. If we are missing the implicit-def
we have exactly two operands and so we don't add the kill.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
However, I'm not sure if the implicit-def is necessary here or not.
Maybe the implicit-def is still necessary at least for the case where the other part of the super register is not allocatable, i.e., once f1 is defined, the vsl1 should also be defined as the phony part can not be allocated.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the phony lane isn't alive at the COPY, I think it's ok we don't have the implicit-def vsl1
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the phony lane isn't alive at the COPY, I think it's ok we don't have the
implicit-def vsl1
.
I think that is what is happening here. I took a look at one of the smaller tests combine-fneg.ll
to see and the MIR looks like this:
liveins: $f1, $v2, $v3
undef %3.sub_64:vsrc = COPY $f1
%4:vsrc = XXPERMDI killed %3:vsrc, %3:vsrc, 0 // This is the only use of %3
The only reason we use the VSRC register is because that is the type of register that XXPERMDI requires. The other lane is never used and the value in $f1
is just splatted into %4
.
eb2dc76
to
df2892d
Compare
Note that this change goes on top of: However, I can't seem to get git to show only the diff after that change. So, not compared to main but compared to [stefanp/RegCoalesceFix]. Therefore, this is the link to see that smaller diff: Once that patch goes in then I can rebase properly and compare against main. |
@@ -155,6 +156,18 @@ foreach Index = 0-31 in { | |||
DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>; | |||
} | |||
|
|||
let isArtificial = 1 in { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isArtificial field looks undocumented in Target.td, can you add some description to it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes! I have added a comment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I more meant it needs docs in Target.td, not just this one use
7980fea
to
6158697
Compare
The patch that this depended on has gone in. |
6158697
to
eaff954
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I looked through all the tests that this patch updated and it seem to be doing what was expected so that part LGTM.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know anything about ppc but seems fine
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The solution and test change look good to me.
…SX registers. On PowerPC we have VSX registers which overlap with floating point registers. However, the floating point registers only overlap with half of each VSX register while the other half is never used alone. This patch adds phony registers for the other half of the VSX registers in order to fully cover them and to make sure that the lane masks are not the same for the VSX and the floating point register. Note: This patch is still Work in Progress as there are a number of LIT failures that need to be investigated.
0b3478b
to
c1ef2be
Compare
/cherry-pick 53c37f3 |
Failed to cherry-pick: 53c37f3 https://github.com/llvm/llvm-project/actions/runs/10199739617 Please manually backport the fix and push it to your github fork. Once this is done, please create a pull request |
…gisters. (llvm#94628) On PowerPC there are 128 bit VSX registers. These registers are half overlapped with 64 bit floating point registers (FPR). The 64 bit half of the VXS register that does not overlap with the FPR does not overlap with any other register class. The FPR are the only subregisters of the VSX registers but they do not fully cover the 128 bit super register. This leads to incorrect lane masks being created. This patch adds phony registers for the other half of the VSX registers in order to fully cover them and to make sure that the lane masks are not the same for the VSX and the floating point register. (cherry picked from commit 53c37f3)
…gisters. (llvm#94628) On PowerPC there are 128 bit VSX registers. These registers are half overlapped with 64 bit floating point registers (FPR). The 64 bit half of the VXS register that does not overlap with the FPR does not overlap with any other register class. The FPR are the only subregisters of the VSX registers but they do not fully cover the 128 bit super register. This leads to incorrect lane masks being created. This patch adds phony registers for the other half of the VSX registers in order to fully cover them and to make sure that the lane masks are not the same for the VSX and the floating point register. (cherry picked from commit 53c37f3)
…gisters. (llvm#94628) On PowerPC there are 128 bit VSX registers. These registers are half overlapped with 64 bit floating point registers (FPR). The 64 bit half of the VXS register that does not overlap with the FPR does not overlap with any other register class. The FPR are the only subregisters of the VSX registers but they do not fully cover the 128 bit super register. This leads to incorrect lane masks being created. This patch adds phony registers for the other half of the VSX registers in order to fully cover them and to make sure that the lane masks are not the same for the VSX and the floating point register. (cherry picked from commit 53c37f3)
On PowerPC there are 128 bit VSX registers. These registers are half overlapped with 64 bit floating point registers (FPR). The 64 bit half of the VXS register that does not overlap with the FPR does not overlap with any other register class. The FPR are the only subregisters of the VSX registers but they do not fully cover the 128 bit super register. This leads to incorrect lane masks being created.
This patch adds phony registers for the other half of the VSX registers in order to fully cover them and to make sure that the lane masks are not the same for the VSX and the floating point register.