Commit fd82cbc

GlobalISel: Merge and cleanup more AMDGPU call lowering code
This merges more AMDGPU ABI lowering code into the generic call lowering. Start cleaning up by factoring away more of the pack/unpack logic into the buildCopy{To|From}Parts functions. These could use more improvement, and the SelectionDAG versions are significantly more complex, and we'll eventually have to emulate all of those cases too.

This is mostly NFC, but does result in some minor instruction reordering. It also removes some of the limitations with mismatched sizes the old code had. However, similarly to the merge on the input, this is forcing gfx6/gfx7 to use the gfx8+ ABI (which is what we actually want, but SelectionDAG is stuck using the weird emergent ABI).

This also changes the load/store size for stack passed EVTs for AArch64, which makes it consistent with the DAG behavior.
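
To make the pack/unpack direction concrete, here is a rough MIR sketch of the outgoing case this enables (a hand-written illustration, not part of the commit; the <3 x s16> value split into two <2 x s16> registers mirrors the example in the new comments below, and the value names are invented):

    ; %val:_(<3 x s16>) is the outgoing value
    %undef:_(<3 x s16>) = G_IMPLICIT_DEF
    %concat:_(<6 x s16>) = G_CONCAT_VECTORS %val, %undef
    %part0:_(<2 x s16>), %part1:_(<2 x s16>), %dead:_(<2 x s16>) = G_UNMERGE_VALUES %concat
    ; %part0/%part1 are copied to the ABI registers; %dead is unused.
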
1 parent 14ccba2 commit fd82cbc

30 files changed: +764 -667 lines changed

llvm/lib/CodeGen/GlobalISel/CallLowering.cpp

Lines changed: 150 additions & 75 deletions
@@ -256,16 +256,32 @@ mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
     return B.buildConcatVectors(DstRegs[0], SrcRegs);
   }
 
-  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
-  Register Undef = B.buildUndef(PartLLT).getReg(0);
-
-  // Build vector of undefs.
-  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);
-
-  // Replace the first sources with the real registers.
-  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
+  // We need to create an unmerge to the result registers, which may require
+  // widening the original value.
+  Register UnmergeSrcReg;
+  if (LCMTy != PartLLT) {
+    // e.g. a <3 x s16> value was split to <2 x s16>
+    // %register_value0:_(<2 x s16>)
+    // %register_value1:_(<2 x s16>)
+    // %undef:_(<2 x s16>) = G_IMPLICIT_DEF
+    // %concat:_(<6 x s16>) = G_CONCAT_VECTORS %reg_value0, %reg_value1, %undef
+    // %dst_reg:_(<3 x s16>), %dead:_(<3 x s16>) = G_UNMERGE_VALUES %concat
+    const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
+    Register Undef = B.buildUndef(PartLLT).getReg(0);
+
+    // Build vector of undefs.
+    SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);
+
+    // Replace the first sources with the real registers.
+    std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
+    UnmergeSrcReg = B.buildConcatVectors(LCMTy, WidenedSrcs).getReg(0);
+  } else {
+    // We don't need to widen anything if we're extracting a scalar which was
+    // promoted to a vector, e.g. s8 -> v4s8 -> s8
+    assert(SrcRegs.size() == 1);
+    UnmergeSrcReg = SrcRegs[0];
+  }
 
-  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
   int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();
 
   SmallVector<Register, 8> PadDstRegs(NumDst);
@@ -275,17 +291,27 @@ mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
   for (int I = DstRegs.size(); I != NumDst; ++I)
     PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);
 
-  return B.buildUnmerge(PadDstRegs, Widened);
+  return B.buildUnmerge(PadDstRegs, UnmergeSrcReg);
 }
 
 /// Create a sequence of instructions to combine pieces split into register
 /// typed values to the original IR value. \p OrigRegs contains the destination
 /// value registers of type \p LLTy, and \p Regs contains the legalized pieces
-/// with type \p PartLLT.
-static void buildCopyToParts(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
-                             ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT) {
+/// with type \p PartLLT. This is used for incoming values (physregs to vregs).
+static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
+                              ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT) {
   MachineRegisterInfo &MRI = *B.getMRI();
 
+  // We could just insert a regular copy, but this is unreachable at the moment.
+  assert(LLTy != PartLLT && "identical part types shouldn't reach here");
+
+  if (PartLLT.isVector() == LLTy.isVector() &&
+      PartLLT.getScalarSizeInBits() > LLTy.getScalarSizeInBits()) {
+    assert(OrigRegs.size() == 1 && Regs.size() == 1);
+    B.buildTrunc(OrigRegs[0], Regs[0]);
+    return;
+  }
+
   if (!LLTy.isVector() && !PartLLT.isVector()) {
     assert(OrigRegs.size() == 1);
     LLT OrigTy = MRI.getType(OrigRegs[0]);
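
The new trunc fast path above handles promotions where the part type and the original type have the same shape. A hypothetical incoming example, assuming an s8 value the calling convention promoted to s32 (register names invented):

    %part:_(s32) = COPY $vgpr0   ; assigned by the value handler
    %val:_(s8) = G_TRUNC %part   ; emitted by buildCopyFromRegs
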
@@ -301,9 +327,9 @@ static void buildCopyToParts(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
     return;
   }
 
-  if (LLTy.isVector() && PartLLT.isVector()) {
-    assert(OrigRegs.size() == 1);
-    assert(LLTy.getElementType() == PartLLT.getElementType());
+  if (PartLLT.isVector()) {
+    assert(OrigRegs.size() == 1 &&
+           LLTy.getScalarType() == PartLLT.getElementType());
     mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
     return;
   }
@@ -353,6 +379,71 @@ static void buildCopyToParts(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
   }
 }
 
+/// Create a sequence of instructions to expand the value in \p SrcReg (of type
+/// \p SrcTy) to the types in \p DstRegs (of type \p PartTy). \p ExtendOp should
+/// contain the type of scalar value extension if necessary.
+///
+/// This is used for outgoing values (vregs to physregs)
+static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
+                            Register SrcReg, LLT SrcTy, LLT PartTy,
+                            unsigned ExtendOp = TargetOpcode::G_ANYEXT) {
+  // We could just insert a regular copy, but this is unreachable at the moment.
+  assert(SrcTy != PartTy && "identical part types shouldn't reach here");
+
+  const unsigned PartSize = PartTy.getSizeInBits();
+
+  if (PartTy.isVector() == SrcTy.isVector() &&
+      PartTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits()) {
+    assert(DstRegs.size() == 1);
+    B.buildInstr(ExtendOp, {DstRegs[0]}, {SrcReg});
+    return;
+  }
+
+  if (SrcTy.isVector() && !PartTy.isVector() &&
+      PartSize > SrcTy.getElementType().getSizeInBits()) {
+    // Vector was scalarized, and the elements extended.
+    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
+    for (int i = 0, e = DstRegs.size(); i != e; ++i)
+      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
+    return;
+  }
+
+  LLT GCDTy = getGCDType(SrcTy, PartTy);
+  if (GCDTy == PartTy) {
+    // If this is already evenly divisible, we can create a simple unmerge.
+    B.buildUnmerge(DstRegs, SrcReg);
+    return;
+  }
+
+  MachineRegisterInfo &MRI = *B.getMRI();
+  LLT DstTy = MRI.getType(DstRegs[0]);
+  LLT LCMTy = getLCMType(SrcTy, PartTy);
+
+  const unsigned LCMSize = LCMTy.getSizeInBits();
+  const unsigned DstSize = DstTy.getSizeInBits();
+  const unsigned SrcSize = SrcTy.getSizeInBits();
+
+  Register UnmergeSrc = SrcReg;
+  if (LCMSize != SrcSize) {
+    // Widen to the common type.
+    Register Undef = B.buildUndef(SrcTy).getReg(0);
+    SmallVector<Register, 8> MergeParts(1, SrcReg);
+    for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize)
+      MergeParts.push_back(Undef);
+
+    UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0);
+  }
+
+  // Unmerge to the original registers and pad with dead defs.
+  SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end());
+  for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize;
+       Size += DstSize) {
+    UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy));
+  }
+
+  B.buildUnmerge(UnmergeResults, UnmergeSrc);
+}
+
 bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,
                                      SmallVectorImpl<ArgInfo> &Args,
                                      ValueHandler &Handler,
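
A minimal caller-side sketch of the evenly divisible case in buildCopyToRegs above (variable names invented; assumes SrcReg is an s64 vreg and Part0/Part1 are s32 vregs):

    // GCD(s64, s32) == s32, so buildCopyToRegs emits a single
    //   %part0:_(s32), %part1:_(s32) = G_UNMERGE_VALUES %src(s64)
    buildCopyToRegs(B, {Part0, Part1}, SrcReg,
                    LLT::scalar(64), LLT::scalar(32));
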
@@ -367,13 +458,22 @@ bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,
                            ThisReturnReg);
 }
 
+static unsigned extendOpFromFlags(llvm::ISD::ArgFlagsTy Flags) {
+  if (Flags.isSExt())
+    return TargetOpcode::G_SEXT;
+  if (Flags.isZExt())
+    return TargetOpcode::G_ZEXT;
+  return TargetOpcode::G_ANYEXT;
+}
+
 bool CallLowering::handleAssignments(CCState &CCInfo,
                                      SmallVectorImpl<CCValAssign> &ArgLocs,
                                      MachineIRBuilder &MIRBuilder,
                                      SmallVectorImpl<ArgInfo> &Args,
                                      ValueHandler &Handler,
                                      Register ThisReturnReg) const {
   MachineFunction &MF = MIRBuilder.getMF();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = MF.getFunction();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
@@ -399,10 +499,20 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
       if (Handler.assignArg(i, NewVT, NewVT, CCValAssign::Full, Args[i],
                             Args[i].Flags[0], CCInfo))
         return false;
+
+      // If we couldn't directly assign this part, some casting may be
+      // necessary. Create the new register, but defer inserting the conversion
+      // instructions.
+      assert(Args[i].OrigRegs.empty());
+      Args[i].OrigRegs.push_back(Args[i].Regs[0]);
+      assert(Args[i].Regs.size() == 1);
+
+      const LLT VATy(NewVT);
+      Args[i].Regs[0] = MRI.createGenericVirtualRegister(VATy);
       continue;
     }
 
-    assert(NumParts > 1);
+    const LLT NewLLT(NewVT);
 
     // For incoming arguments (physregs to vregs), we could have values in
     // physregs (or memlocs) which we want to extract and copy to vregs.
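
For instance (a hypothetical case, not taken from the patch): a <2 x s8> value that the calling convention widens to a single <2 x s16> register now parks the original vreg in OrigRegs[0] and binds a fresh <2 x s16> vreg as Regs[0]; the conversion itself is emitted later, once the direction is known, e.g. on the incoming side:

    ; deferred conversion, inserted by buildCopyFromRegs after assignment
    %val:_(<2 x s8>) = G_TRUNC %part(<2 x s16>)
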
@@ -419,13 +529,11 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
       Args[i].OrigRegs.push_back(Args[i].Regs[0]);
       Args[i].Regs.clear();
       Args[i].Flags.clear();
-      LLT NewLLT = getLLTForMVT(NewVT);
       // For each split register, create and assign a vreg that will store
       // the incoming component of the larger value. These will later be
       // merged to form the final vreg.
       for (unsigned Part = 0; Part < NumParts; ++Part) {
-        Register Reg =
-            MIRBuilder.getMRI()->createGenericVirtualRegister(NewLLT);
+        Register Reg = MRI.createGenericVirtualRegister(NewLLT);
         ISD::ArgFlagsTy Flags = OrigFlags;
         if (Part == 0) {
           Flags.setSplit();
@@ -443,12 +551,13 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
         }
       }
     } else {
+      assert(Args[i].Regs.size() == 1);
+
       // This type is passed via multiple registers in the calling convention.
       // We need to extract the individual parts.
-      Register LargeReg = Args[i].Regs[0];
-      LLT SmallTy = LLT::scalar(NewVT.getSizeInBits());
-      auto Unmerge = MIRBuilder.buildUnmerge(SmallTy, LargeReg);
-      assert(Unmerge->getNumOperands() == NumParts + 1);
+      assert(Args[i].OrigRegs.empty());
+      Args[i].OrigRegs.push_back(Args[i].Regs[0]);
+
       ISD::ArgFlagsTy OrigFlags = Args[i].Flags[0];
       // We're going to replace the regs and flags with the split ones.
       Args[i].Regs.clear();
@@ -471,7 +580,9 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
           Flags.setReturned(false);
         }
 
-        Args[i].Regs.push_back(Unmerge.getReg(PartIdx));
+        Register NewReg = MRI.createGenericVirtualRegister(NewLLT);
+
+        Args[i].Regs.push_back(NewReg);
         Args[i].Flags.push_back(Flags);
         if (Handler.assignArg(i, NewVT, NewVT, CCValAssign::Full,
                               Args[i], Args[i].Flags[PartIdx], CCInfo))
@@ -495,20 +606,25 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
       continue;
     }
 
-    EVT OrigVT = EVT::getEVT(Args[i].Ty);
     EVT VAVT = VA.getValVT();
     const LLT OrigTy = getLLTForType(*Args[i].Ty, DL);
     const LLT VATy(VAVT.getSimpleVT());
 
     // Expected to be multiple regs for a single incoming arg.
     // There should be Regs.size() ArgLocs per argument.
     unsigned NumArgRegs = Args[i].Regs.size();
-    MachineRegisterInfo &MRI = MF.getRegInfo();
     assert((j + (NumArgRegs - 1)) < ArgLocs.size() &&
            "Too many regs for number of args");
+
+    // Coerce into outgoing value types before register assignment.
+    if (!Handler.isIncomingArgumentHandler() && OrigTy != VATy) {
+      assert(Args[i].OrigRegs.size() == 1);
+      buildCopyToRegs(MIRBuilder, Args[i].Regs, Args[i].OrigRegs[0], OrigTy,
+                      VATy, extendOpFromFlags(Args[i].Flags[0]));
+    }
+
     for (unsigned Part = 0; Part < NumArgRegs; ++Part) {
       Register ArgReg = Args[i].Regs[Part];
-      LLT ArgRegTy = MRI.getType(ArgReg);
       // There should be Regs.size() ArgLocs per argument.
       VA = ArgLocs[j + Part];
       if (VA.isMemLoc()) {
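
As an assumed example of this outgoing coercion: for a signext i16 argument assigned to a 32-bit location, extendOpFromFlags returns G_SEXT and buildCopyToRegs emits roughly:

    %coerced:_(s32) = G_SEXT %orig(s16)
    ; zeroext would give G_ZEXT; with no extension attribute, G_ANYEXT.
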
@@ -536,57 +652,16 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
         continue;
       }
 
-      // GlobalISel does not currently work for scalable vectors.
-      if (OrigVT.getFixedSizeInBits() >= VAVT.getFixedSizeInBits() ||
-          !Handler.isIncomingArgumentHandler()) {
-        // This is an argument that might have been split. There should be
-        // Regs.size() ArgLocs per argument.
-
-        // Insert the argument copies. If VAVT < OrigVT, we'll insert the merge
-        // to the original register after handling all of the parts.
-        Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA);
-        continue;
-      }
-
-      // This ArgLoc covers multiple pieces, so we need to split it.
-      Register NewReg = MRI.createGenericVirtualRegister(VATy);
-      Handler.assignValueToReg(NewReg, VA.getLocReg(), VA);
-      // If it's a vector type, we either need to truncate the elements
-      // or do an unmerge to get the lower block of elements.
-      if (VATy.isVector() &&
-          VATy.getNumElements() > OrigVT.getVectorNumElements()) {
-        // Just handle the case where the VA type is a multiple of original
-        // type.
-        if (VATy.getNumElements() % OrigVT.getVectorNumElements() != 0) {
-          LLVM_DEBUG(dbgs() << "Incoming promoted vector arg elts is not a "
-                               "multiple of orig type elt: "
-                            << VATy << " vs " << OrigTy);
-          return false;
-        }
-        SmallVector<Register, 4> DstRegs = {ArgReg};
-        unsigned NumParts =
-            VATy.getNumElements() / OrigVT.getVectorNumElements() - 1;
-        for (unsigned Idx = 0; Idx < NumParts; ++Idx)
-          DstRegs.push_back(
-              MIRBuilder.getMRI()->createGenericVirtualRegister(OrigTy));
-        MIRBuilder.buildUnmerge(DstRegs, {NewReg});
-      } else if (VATy.getScalarSizeInBits() > ArgRegTy.getScalarSizeInBits()) {
-        MIRBuilder.buildTrunc(ArgReg, {NewReg}).getReg(0);
-      } else {
-        MIRBuilder.buildCopy(ArgReg, NewReg);
-      }
+      Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA);
     }
 
-    // Now that all pieces have been handled, re-pack any arguments into any
-    // wider, original registers.
-    if (Handler.isIncomingArgumentHandler()) {
+    // Now that all pieces have been assigned, re-pack the register typed values
+    // into the original value typed registers.
+    if (Handler.isIncomingArgumentHandler() && OrigTy != VATy) {
       // Merge the split registers into the expected larger result vregs of
       // the original call.
-
-      if (OrigTy != VATy && !Args[i].OrigRegs.empty()) {
-        buildCopyToParts(MIRBuilder, Args[i].OrigRegs, Args[i].Regs, OrigTy,
-                         VATy);
-      }
+      buildCopyFromRegs(MIRBuilder, Args[i].OrigRegs, Args[i].Regs, OrigTy,
+                        VATy);
     }
 
     j += NumArgRegs - 1;
