
Commit cf5845d

[AMDGPU] Use multi-dword flat scratch for spilling
Differential Revision: https://reviews.llvm.org/D93067
1 parent 19d57b5 commit cf5845d
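
The diff below teaches SIRegisterInfo::buildSpillLoadStore to spill whole register tuples with multi-dword flat scratch instructions (SCRATCH_*_DWORDX2/X3/X4, in SADDR or ST form) instead of one dword per access, while AGPR spills stay at 4 bytes because they still need a scavenged temporary VGPR. As a minimal standalone sketch of the chunking arithmetic only — planFlatScratchSpill and everything in it are hypothetical illustration code, not part of the patch or the LLVM API:

// Sketch (assumed names): split a spill of RegWidth bytes into the widest
// flat scratch accesses available (up to DWORDX4, i.e. 16 bytes), mirroring
// the EltSize/RemSize computation added to buildSpillLoadStore.
#include <algorithm>
#include <cstdio>
#include <vector>

static std::vector<unsigned> planFlatScratchSpill(unsigned RegWidth) {
  std::vector<unsigned> Chunks;
  unsigned EltSize = std::min(RegWidth, 16u);        // 16 B == DWORDX4
  unsigned NumSubRegs = RegWidth / EltSize;          // full-width chunks
  unsigned RemSize = RegWidth - NumSubRegs * EltSize; // 4/8/12-byte tail
  for (unsigned I = 0; I != NumSubRegs; ++I)
    Chunks.push_back(EltSize);
  if (RemSize)
    Chunks.push_back(RemSize);
  return Chunks;
}

int main() {
  // A 7-dword (28-byte) tuple now takes two scratch accesses (16 B + 12 B)
  // instead of seven single-dword ones.
  for (unsigned Bytes : planFlatScratchSpill(28))
    std::printf("scratch access: %u bytes\n", Bytes);
  return 0;
}

In the patch itself the same split drives the new getFlatScratchSpillOpcode helper, which picks the dword width of the scratch load/store opcode for each chunk.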

File tree: 6 files changed, +624 -288 lines changed


llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 156 additions & 66 deletions
@@ -745,6 +745,41 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
   return true;
 }
 
+static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
+                                          unsigned LoadStoreOp,
+                                          unsigned EltSize) {
+  bool IsStore = TII->get(LoadStoreOp).mayStore();
+  bool UseST =
+      AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
+      AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0;
+
+  switch (EltSize) {
+  case 4:
+    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
+    break;
+  case 8:
+    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
+                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
+    break;
+  case 12:
+    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
+                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
+    break;
+  case 16:
+    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
+                          : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
+    break;
+  default:
+    llvm_unreachable("Unexpected spill load/store size!");
+  }
+
+  if (UseST)
+    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
+
+  return LoadStoreOp;
+}
+
 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                          unsigned LoadStoreOp,
                                          int Index,
@@ -768,18 +803,31 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
   bool Scavenged = false;
   MCRegister SOffset = ScratchOffsetReg;
 
-  const unsigned EltSize = 4;
   const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
-  unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
+  const bool IsAGPR = hasAGPRs(RC);
+  const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;
+
+  // Always use 4 byte operations for AGPRs because we need to scavenge
+  // a temporary VGPR.
+  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
+  unsigned NumSubRegs = RegWidth / EltSize;
   unsigned Size = NumSubRegs * EltSize;
+  unsigned RemSize = RegWidth - Size;
+  unsigned NumRemSubRegs = RemSize ? 1 : 0;
   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
-  int64_t MaxOffset = Offset + Size - EltSize;
+  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
   int64_t ScratchOffsetRegDelta = 0;
 
+  if (IsFlat && EltSize > 4) {
+    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
+    Desc = &TII->get(LoadStoreOp);
+  }
+
   Align Alignment = MFI.getObjectAlign(Index);
   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
 
-  assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
+  assert((IsFlat || ((Offset % EltSize) == 0)) &&
+         "unexpected VGPR spill offset");
 
   bool IsOffsetLegal = IsFlat
       ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true)
@@ -840,12 +888,19 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
 
   Register TmpReg;
 
-  // FIXME: Flat scratch does not have to be limited to a dword per store.
-  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
-    Register SubReg =
-        NumSubRegs == 1
+  for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
+       ++i, RegOffset += EltSize) {
+    if (i == NumSubRegs) {
+      EltSize = RemSize;
+      LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
+    }
+    Desc = &TII->get(LoadStoreOp);
+
+    unsigned NumRegs = EltSize / 4;
+    Register SubReg = e == 1
         ? ValueReg
-        : Register(getSubReg(ValueReg, getSubRegFromChannel(i)));
+        : Register(getSubReg(ValueReg,
+                             getSubRegFromChannel(RegOffset / 4, NumRegs)));
 
     unsigned SOffsetRegState = 0;
     unsigned SrcDstRegState = getDefRegState(!IsStore);
@@ -857,75 +912,110 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
 
     // Make sure the whole register is defined if there are undef components by
    // adding an implicit def of the super-reg on the first instruction.
-    const bool NeedSuperRegDef = NumSubRegs > 1 && IsStore && i == 0;
+    bool NeedSuperRegDef = e > 1 && IsStore && i == 0;
+    bool NeedSuperRegImpOperand = e > 1;
+
+    unsigned Lane = RegOffset / 4;
+    unsigned LaneE = (RegOffset + EltSize) / 4;
+    for ( ; Lane != LaneE; ++Lane) {
+      bool IsSubReg = e > 1 || EltSize > 4;
+      Register Sub = IsSubReg
+          ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
+          : ValueReg;
+      auto MIB = spillVGPRtoAGPR(ST, MI, Index, Lane, Sub, IsKill);
+      if (!MIB.getInstr())
+        break;
+      if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) {
+        MIB.addReg(ValueReg, RegState::ImplicitDefine);
+        NeedSuperRegDef = false;
+      }
+      if (IsSubReg || NeedSuperRegImpOperand) {
+        NeedSuperRegImpOperand = true;
+        unsigned State = SrcDstRegState;
+        if (Lane + 1 != LaneE)
+          State &= ~RegState::Kill;
+        MIB.addReg(ValueReg, RegState::Implicit | State);
+      }
+    }
 
-    auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill);
+    if (Lane == LaneE) // Fully spilled into AGPRs.
+      continue;
+
+    // Offset in bytes from the beginning of the ValueReg to its portion we
+    // still need to spill. It may differ from RegOffset if a portion of
+    // current SubReg has been already spilled into AGPRs by the loop above.
+    unsigned RemRegOffset = Lane * 4;
+    unsigned RemEltSize = EltSize - (RemRegOffset - RegOffset);
+    if (RemEltSize != EltSize) { // Partially spilled to AGPRs
+      assert(IsFlat && EltSize > 4);
+
+      unsigned NumRegs = RemEltSize / 4;
+      SubReg = Register(getSubReg(ValueReg,
+                        getSubRegFromChannel(RemRegOffset / 4, NumRegs)));
+      unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
+      Desc = &TII->get(Opc);
+    }
 
-    if (!MIB.getInstr()) {
-      unsigned FinalReg = SubReg;
+    unsigned FinalReg = SubReg;
 
-      const bool IsAGPR = hasAGPRs(RC);
-      if (IsAGPR) {
-        if (!TmpReg) {
-          assert(RS && "Needs to have RegScavenger to spill an AGPR!");
-          // FIXME: change to scavengeRegisterBackwards()
-          TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-          RS->setRegUsed(TmpReg);
-        }
-        if (IsStore) {
-          auto AccRead = BuildMI(*MBB, MI, DL,
-                                 TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
-                             .addReg(SubReg, getKillRegState(IsKill));
-          if (NeedSuperRegDef)
-            AccRead.addReg(ValueReg, RegState::ImplicitDefine);
-          AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
-        }
-        SubReg = TmpReg;
+    if (IsAGPR) {
+      assert(EltSize == 4);
+
+      if (!TmpReg) {
+        assert(RS && "Needs to have RegScavenger to spill an AGPR!");
+        // FIXME: change to scavengeRegisterBackwards()
+        TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+        RS->setRegUsed(TmpReg);
+      }
+      if (IsStore) {
+        auto AccRead = BuildMI(*MBB, MI, DL,
+                               TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
+                           .addReg(SubReg, getKillRegState(IsKill));
+        if (NeedSuperRegDef)
+          AccRead.addReg(ValueReg, RegState::ImplicitDefine);
+        AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
      }
+      SubReg = TmpReg;
+    }
 
-      MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
-      MachineMemOperand *NewMMO =
-          MF->getMachineMemOperand(PInfo, MMO->getFlags(), EltSize,
-                                   commonAlignment(Alignment, EltSize * i));
+    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RemRegOffset);
+    MachineMemOperand *NewMMO =
+        MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
+                                 commonAlignment(Alignment, RemRegOffset));
 
-      MIB = BuildMI(*MBB, MI, DL, *Desc)
-                .addReg(SubReg,
-                        getDefRegState(!IsStore) | getKillRegState(IsKill));
-      if (!IsFlat)
-        MIB.addReg(FuncInfo->getScratchRSrcReg());
+    auto MIB = BuildMI(*MBB, MI, DL, *Desc)
                   .addReg(SubReg,
+                           getDefRegState(!IsStore) | getKillRegState(IsKill));
+    if (!IsFlat)
+      MIB.addReg(FuncInfo->getScratchRSrcReg());
 
-      if (SOffset == AMDGPU::NoRegister) {
-        if (!IsFlat)
-          MIB.addImm(0);
-      } else {
-        MIB.addReg(SOffset, SOffsetRegState);
-      }
-      MIB.addImm(Offset)
-          .addImm(0) // glc
-          .addImm(0) // slc
-          .addImm(0); // tfe for MUBUF or dlc for FLAT
+    if (SOffset == AMDGPU::NoRegister) {
       if (!IsFlat)
-        MIB.addImm(0) // dlc
-            .addImm(0); // swz
-      MIB.addMemOperand(NewMMO);
-
-      if (!IsAGPR && NeedSuperRegDef)
-        MIB.addReg(ValueReg, RegState::ImplicitDefine);
-
-      if (!IsStore && TmpReg != AMDGPU::NoRegister) {
-        MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
-                      FinalReg)
-                  .addReg(TmpReg, RegState::Kill);
-        MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
-      }
+        MIB.addImm(0);
     } else {
-      if (NeedSuperRegDef)
-        MIB.addReg(ValueReg, RegState::ImplicitDefine);
+      MIB.addReg(SOffset, SOffsetRegState);
+    }
+    MIB.addImm(Offset + RemRegOffset)
+        .addImm(0) // glc
+        .addImm(0) // slc
+        .addImm(0); // tfe for MUBUF or dlc for FLAT
+    if (!IsFlat)
+      MIB.addImm(0) // dlc
+          .addImm(0); // swz
+    MIB.addMemOperand(NewMMO);
+
+    if (!IsAGPR && NeedSuperRegDef)
+      MIB.addReg(ValueReg, RegState::ImplicitDefine);
+
+    if (!IsStore && TmpReg != AMDGPU::NoRegister) {
+      MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
+                    FinalReg)
+                .addReg(TmpReg, RegState::Kill);
+      MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    }
 
-    if (NumSubRegs > 1) {
+    if (NeedSuperRegImpOperand)
      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
-    }
  }
 
   if (ScratchOffsetRegDelta != 0) {
