Skip to content
This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 045b217

Browse files
committed
[PPC64LE] More improvements to VSX swap optimization
This patch allows VSX swap optimization to succeed more frequently. Specifically, it is concerned with common code sequences that occur when copying a scalar floating-point value to a vector register. This patch currently handles cases where the floating-point value is already in a register, but does not yet handle loads (such as via an LXSDX scalar floating-point VSX load). That will be dealt with later.

A typical case is when a scalar value comes in as a floating-point parameter. The value is copied into a virtual VSFRC register, and then a sequence of SUBREG_TO_REG and/or COPY operations will convert it to a full vector register of the class required by the context. If this vector register is then used as part of a lane-permuted computation, the original scalar value will be in the wrong lane. We can fix this by adding a swap operation following any widening SUBREG_TO_REG operation. Additional COPY operations may be needed around the swap operation in order to keep register assignment happy, but these are pro forma operations that will be removed by coalescing.

If a scalar value is otherwise directly referenced in a computation (such as by one of the many XS* vector-scalar operations), we currently disable swap optimization. These operations are lane-sensitive by definition. A MentionsPartialVR flag is added for use in each swap table entry that mentions a scalar floating-point register without having special handling defined.

A common idiom for PPC64LE is to convert a double-precision scalar to a vector by performing a splat operation. This ensures that the value can be referenced as V[0], as it would be for big endian, whereas just converting the scalar to a vector with a SUBREG_TO_REG operation leaves this value only in V[1]. A doubleword splat operation is one form of an XXPERMDI instruction, which takes one doubleword from a first operand and another doubleword from a second operand, with a two-bit selector operand indicating which doublewords are chosen.
In the general case, an XXPERMDI can be permitted in a lane-swapped region provided that it is properly transformed to select the corresponding swapped values. This transformation is to reverse the order of the two input operands, and to reverse and complement the bits of the selector operand (derivation left as an exercise to the reader ;).

A new test case that exercises the scalar-to-vector and generalized XXPERMDI transformations is added as CodeGen/PowerPC/swaps-le-5.ll. The patch also requires a change to CodeGen/PowerPC/swaps-le-3.ll to use CHECK-DAG instead of CHECK for two independent instructions that now appear in reverse order.

There are two small unrelated changes that are added with this patch. First, the XXSLDWI instruction was incorrectly omitted from the list of lane-sensitive instructions; this is now fixed. Second, I observed that the same webs were being rejected over and over again for different reasons. Since it's sufficient to reject a web only once, I added a check for this to speed up the compilation time slightly.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242081 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 6d0b726 commit 045b217

File tree

3 files changed

+260
-23
lines changed

3 files changed

+260
-23
lines changed

lib/Target/PowerPC/PPCVSXSwapRemoval.cpp

Lines changed: 188 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ struct PPCVSXSwapEntry {
8080
unsigned int IsSwap : 1;
8181
unsigned int MentionsPhysVR : 1;
8282
unsigned int IsSwappable : 1;
83+
unsigned int MentionsPartialVR : 1;
8384
unsigned int SpecialHandling : 3;
8485
unsigned int WebRejected : 1;
8586
unsigned int WillRemove : 1;
@@ -91,7 +92,9 @@ enum SHValues {
9192
SH_INSERT,
9293
SH_NOSWAP_LD,
9394
SH_NOSWAP_ST,
94-
SH_SPLAT
95+
SH_SPLAT,
96+
SH_XXPERMDI,
97+
SH_COPYSCALAR
9598
};
9699

97100
struct PPCVSXSwapRemoval : public MachineFunctionPass {
@@ -167,6 +170,21 @@ struct PPCVSXSwapRemoval : public MachineFunctionPass {
167170
isRegInClass(Reg, &PPC::VRRCRegClass));
168171
}
169172

173+
// Return true iff Reg is a partial vector register, i.e. isRegInClass
// reports it as a member of the VSFRC or the VSSRC register class.
174+
bool isScalarVecReg(unsigned Reg) {
175+
return (isRegInClass(Reg, &PPC::VSFRCRegClass) ||
176+
isRegInClass(Reg, &PPC::VSSRCRegClass));
177+
}
178+
179+
// Return true iff Reg is either a partial (scalar) vector register per
180+
// isScalarVecReg or a full vector register per isVecReg. Partial is an
181+
// in/out flag: it is set to true when Reg is a partial register and is
// otherwise left untouched, so this function never resets it to false.
182+
bool isAnyVecReg(unsigned Reg, bool &Partial) {
183+
if (isScalarVecReg(Reg))
184+
Partial = true;
185+
return isScalarVecReg(Reg) || isVecReg(Reg);
186+
}
187+
170188
public:
171189
// Main entry point for this pass.
172190
bool runOnMachineFunction(MachineFunction &MF) override {
@@ -223,12 +241,13 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
223241
for (MachineInstr &MI : MBB) {
224242

225243
bool RelevantInstr = false;
244+
bool Partial = false;
226245

227246
for (const MachineOperand &MO : MI.operands()) {
228247
if (!MO.isReg())
229248
continue;
230249
unsigned Reg = MO.getReg();
231-
if (isVecReg(Reg)) {
250+
if (isAnyVecReg(Reg, Partial)) {
232251
RelevantInstr = true;
233252
break;
234253
}
@@ -250,8 +269,13 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
250269
// Unless noted otherwise, an instruction is considered
251270
// safe for the optimization. There are a large number of
252271
// such true-SIMD instructions (all vector math, logical,
253-
// select, compare, etc.).
254-
SwapVector[VecIdx].IsSwappable = 1;
272+
// select, compare, etc.). However, if the instruction
273+
// mentions a partial vector register and does not have
274+
// special handling defined, it is not swappable.
275+
if (Partial)
276+
SwapVector[VecIdx].MentionsPartialVR = 1;
277+
else
278+
SwapVector[VecIdx].IsSwappable = 1;
255279
break;
256280
case PPC::XXPERMDI: {
257281
// This is a swap if it is of the form XXPERMDI t, s, s, 2.
@@ -269,25 +293,37 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
269293
VecIdx);
270294
if (trueReg1 == trueReg2)
271295
SwapVector[VecIdx].IsSwap = 1;
272-
}
296+
else {
297+
// We can still handle these if the two registers are not
298+
// identical, by adjusting the form of the XXPERMDI.
299+
SwapVector[VecIdx].IsSwappable = 1;
300+
SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
301+
}
273302
// This is a doubleword splat if it is of the form
274303
// XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3. As above we
275304
// must look through chains of copy-likes to find the source
276305
// register. We turn off the marking for mention of a physical
277306
// register, because splatting it is safe; the optimization
278-
// will not swap the value in the physical register.
279-
else if (immed == 0 || immed == 3) {
307+
// will not swap the value in the physical register. Whether
308+
// or not the two input registers are identical, we can handle
309+
// these by adjusting the form of the XXPERMDI.
310+
} else if (immed == 0 || immed == 3) {
311+
312+
SwapVector[VecIdx].IsSwappable = 1;
313+
SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
314+
280315
unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
281316
VecIdx);
282317
unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
283318
VecIdx);
284-
if (trueReg1 == trueReg2) {
285-
SwapVector[VecIdx].IsSwappable = 1;
319+
if (trueReg1 == trueReg2)
286320
SwapVector[VecIdx].MentionsPhysVR = 0;
287-
}
321+
322+
} else {
323+
// We can still handle these by adjusting the form of the XXPERMDI.
324+
SwapVector[VecIdx].IsSwappable = 1;
325+
SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
288326
}
289-
// Any other form of XXPERMDI is lane-sensitive and unsafe
290-
// for the optimization.
291327
break;
292328
}
293329
case PPC::LVX:
@@ -324,7 +360,32 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
324360
if (isVecReg(MI.getOperand(0).getReg()) &&
325361
isVecReg(MI.getOperand(1).getReg()))
326362
SwapVector[VecIdx].IsSwappable = 1;
363+
// If we have a copy from one scalar floating-point register
364+
// to another, we can accept this even if it is a physical
365+
// register. The only way this gets involved is if it feeds
366+
// a SUBREG_TO_REG, which is handled by introducing a swap.
367+
else if (isScalarVecReg(MI.getOperand(0).getReg()) &&
368+
isScalarVecReg(MI.getOperand(1).getReg()))
369+
SwapVector[VecIdx].IsSwappable = 1;
370+
break;
371+
case PPC::SUBREG_TO_REG: {
372+
// These are fine provided they are moving between full vector
373+
// register classes. If they are moving from a scalar
374+
// floating-point class to a vector class, we can handle those
375+
// as well, provided we introduce a swap. It is generally the
376+
// case that we will introduce fewer swaps than we remove, but
377+
// (FIXME) a cost model could be used. However, introduced
378+
// swaps could potentially be CSEd, so this is not trivial.
379+
if (isVecReg(MI.getOperand(0).getReg()) &&
380+
isVecReg(MI.getOperand(2).getReg()))
381+
SwapVector[VecIdx].IsSwappable = 1;
382+
else if (isVecReg(MI.getOperand(0).getReg()) &&
383+
isScalarVecReg(MI.getOperand(2).getReg())) {
384+
SwapVector[VecIdx].IsSwappable = 1;
385+
SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYSCALAR;
386+
}
327387
break;
388+
}
328389
case PPC::VSPLTB:
329390
case PPC::VSPLTH:
330391
case PPC::VSPLTW:
@@ -425,6 +486,10 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
425486
case PPC::VUPKLSW:
426487
case PPC::XXMRGHW:
427488
case PPC::XXMRGLW:
489+
// XXSLDWI could be replaced by a general permute with one of three
490+
// permute control vectors (for shift values 1, 2, 3). However,
491+
// VPERM has a more restrictive register class.
492+
case PPC::XXSLDWI:
428493
case PPC::XXSPLTW:
429494
break;
430495
}
@@ -501,18 +566,20 @@ void PPCVSXSwapRemoval::formWebs() {
501566
DEBUG(MI->dump());
502567

503568
// It's sufficient to walk vector uses and join them to their unique
504-
// definitions. In addition, check *all* vector register operands
505-
// for physical regs.
569+
// definitions. In addition, check full vector register operands
570+
// for physical regs. We exclude partial-vector register operands
571+
// because we can handle them if copied to a full vector.
506572
for (const MachineOperand &MO : MI->operands()) {
507573
if (!MO.isReg())
508574
continue;
509575

510576
unsigned Reg = MO.getReg();
511-
if (!isVecReg(Reg))
577+
if (!isVecReg(Reg) && !isScalarVecReg(Reg))
512578
continue;
513579

514580
if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
515-
SwapVector[EntryIdx].MentionsPhysVR = 1;
581+
if (!(MI->isCopy() && isScalarVecReg(Reg)))
582+
SwapVector[EntryIdx].MentionsPhysVR = 1;
516583
continue;
517584
}
518585

@@ -545,15 +612,21 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
545612
for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
546613
int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
547614

548-
// Reject webs containing mentions of physical registers, or containing
549-
// operations that we don't know how to handle in a lane-permuted region.
615+
// If representative is already rejected, don't waste further time.
616+
if (SwapVector[Repr].WebRejected)
617+
continue;
618+
619+
// Reject webs containing mentions of physical or partial registers, or
620+
// containing operations that we don't know how to handle in a lane-
621+
// permuted region.
550622
if (SwapVector[EntryIdx].MentionsPhysVR ||
623+
SwapVector[EntryIdx].MentionsPartialVR ||
551624
!(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) {
552625

553626
SwapVector[Repr].WebRejected = 1;
554627

555628
DEBUG(dbgs() <<
556-
format("Web %d rejected for physreg, subreg, or not swap[pable]\n",
629+
format("Web %d rejected for physreg, partial reg, or not swap[pable]\n",
557630
Repr));
558631
DEBUG(dbgs() << " in " << EntryIdx << ": ");
559632
DEBUG(SwapVector[EntryIdx].VSEMI->dump());
@@ -588,7 +661,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
588661
}
589662
}
590663

591-
// Reject webs than contain swapping stores that are fed by something
664+
// Reject webs that contain swapping stores that are fed by something
592665
// other than a swap instruction.
593666
} else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
594667
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
@@ -670,7 +743,8 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() {
670743
// The identified swap entry requires special handling to allow its
671744
// containing computation to be optimized. Perform that handling
672745
// here.
673-
// FIXME: This code is to be phased in with subsequent patches.
746+
// FIXME: Additional opportunities will be phased in with subsequent
747+
// patches.
674748
void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
675749
switch (SwapVector[EntryIdx].SpecialHandling) {
676750

@@ -704,6 +778,91 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
704778
break;
705779
}
706780

781+
// For an XXPERMDI that isn't handled otherwise, we need to
782+
// reverse the order of the operands. If the selector operand
783+
// has a value of 0 or 3, we need to change it to 3 or 0,
784+
// respectively. Otherwise we should leave it alone. (This
785+
// is equivalent to reversing the two bits of the selector
786+
// operand and complementing the result.)
787+
case SHValues::SH_XXPERMDI: {
788+
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
789+
790+
DEBUG(dbgs() << "Changing XXPERMDI: ");
791+
DEBUG(MI->dump());
792+
793+
unsigned Selector = MI->getOperand(3).getImm();
794+
if (Selector == 0 || Selector == 3)
795+
Selector = 3 - Selector;
796+
MI->getOperand(3).setImm(Selector);
797+
798+
unsigned Reg1 = MI->getOperand(1).getReg();
799+
unsigned Reg2 = MI->getOperand(2).getReg();
800+
MI->getOperand(1).setReg(Reg2);
801+
MI->getOperand(2).setReg(Reg1);
802+
803+
DEBUG(dbgs() << " Into: ");
804+
DEBUG(MI->dump());
805+
break;
806+
}
807+
808+
// For a copy from a scalar floating-point register to a vector
809+
// register, removing swaps will leave the copied value in the
810+
// wrong lane. Insert a swap following the copy to fix this.
811+
case SHValues::SH_COPYSCALAR: {
812+
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
813+
814+
DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
815+
DEBUG(MI->dump());
816+
817+
unsigned DstReg = MI->getOperand(0).getReg();
818+
const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
819+
unsigned NewVReg = MRI->createVirtualRegister(DstRC);
820+
821+
MI->getOperand(0).setReg(NewVReg);
822+
DEBUG(dbgs() << " Into: ");
823+
DEBUG(MI->dump());
824+
825+
MachineBasicBlock::iterator InsertPoint = MI->getNextNode();
826+
827+
// Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG
828+
// is copying to a VRRC, we need to be careful to avoid a register
829+
// assignment problem. In this case we must copy from VRRC to VSRC
830+
// prior to the swap, and from VSRC to VRRC following the swap.
831+
// Coalescing will usually remove all this mess.
832+
833+
if (DstRC == &PPC::VRRCRegClass) {
834+
unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
835+
unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
836+
837+
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
838+
TII->get(PPC::COPY), VSRCTmp1)
839+
.addReg(NewVReg);
840+
DEBUG(MI->getNextNode()->dump());
841+
842+
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
843+
TII->get(PPC::XXPERMDI), VSRCTmp2)
844+
.addReg(VSRCTmp1)
845+
.addReg(VSRCTmp1)
846+
.addImm(2);
847+
DEBUG(MI->getNextNode()->getNextNode()->dump());
848+
849+
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
850+
TII->get(PPC::COPY), DstReg)
851+
.addReg(VSRCTmp2);
852+
DEBUG(MI->getNextNode()->getNextNode()->getNextNode()->dump());
853+
854+
} else {
855+
856+
BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
857+
TII->get(PPC::XXPERMDI), DstReg)
858+
.addReg(NewVReg)
859+
.addReg(NewVReg)
860+
.addImm(2);
861+
862+
DEBUG(MI->getNextNode()->dump());
863+
}
864+
break;
865+
}
707866
}
708867
}
709868

@@ -756,6 +915,8 @@ void PPCVSXSwapRemoval::dumpSwapVector() {
756915
DEBUG(dbgs() << "swap ");
757916
if (SwapVector[EntryIdx].MentionsPhysVR)
758917
DEBUG(dbgs() << "physreg ");
918+
if (SwapVector[EntryIdx].MentionsPartialVR)
919+
DEBUG(dbgs() << "partialreg ");
759920

760921
if (SwapVector[EntryIdx].IsSwappable) {
761922
DEBUG(dbgs() << "swappable ");
@@ -780,6 +941,12 @@ void PPCVSXSwapRemoval::dumpSwapVector() {
780941
case SH_SPLAT:
781942
DEBUG(dbgs() << "special:splat ");
782943
break;
944+
case SH_XXPERMDI:
945+
DEBUG(dbgs() << "special:xxpermdi ");
946+
break;
947+
case SH_COPYSCALAR:
948+
DEBUG(dbgs() << "special:copyscalar ");
949+
break;
783950
}
784951
}
785952

test/CodeGen/PowerPC/swaps-le-3.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ entry:
1717
}
1818

1919
; CHECK-LABEL: @test
20-
; CHECK: xxspltd
21-
; CHECK: lxvd2x
20+
; CHECK-DAG: xxspltd
21+
; CHECK-DAG: lxvd2x
2222
; CHECK: xvadddp
2323
; CHECK: stxvd2x
2424
; CHECK-NOT: xxswapd

0 commit comments

Comments
 (0)