@@ -745,6 +745,41 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
   return true;
 }
 
+static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
+                                          unsigned LoadStoreOp,
+                                          unsigned EltSize) {
+  bool IsStore = TII->get(LoadStoreOp).mayStore();
+  bool UseST =
+    AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
+    AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0;
+
+  switch (EltSize) {
+  case 4:
+    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
+    break;
+  case 8:
+    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
+                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
+    break;
+  case 12:
+    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
+                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
+    break;
+  case 16:
+    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
+                          : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
+    break;
+  default:
+    llvm_unreachable("Unexpected spill load/store size!");
+  }
+
+  if (UseST)
+    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
+
+  return LoadStoreOp;
+}
+
 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                          unsigned LoadStoreOp,
                                          int Index,
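Note on the new helper: it picks the widest SADDR flat-scratch opcode for the requested element size, then falls back to the ST (no address operands) form when the incoming opcode carries neither a vaddr nor an saddr operand; AMDGPU::getNamedOperandIdx returns -1 for a missing named operand, which is what the two `< 0` checks probe. Below is a minimal standalone sketch of the same size-to-opcode dispatch, with placeholder enumerators standing in for the real AMDGPU opcode values:

#include <cassert>

// Placeholder enumerators; the real code uses AMDGPU::SCRATCH_* opcodes.
enum ScratchOp {
  STORE_DWORD_SADDR, LOAD_DWORD_SADDR,
  STORE_DWORDX2_SADDR, LOAD_DWORDX2_SADDR,
  STORE_DWORDX3_SADDR, LOAD_DWORDX3_SADDR,
  STORE_DWORDX4_SADDR, LOAD_DWORDX4_SADDR
};

// EltSize is the byte width of a single access: 4, 8, 12 or 16 bytes,
// i.e. one DWORD up to one DWORDX4.
static ScratchOp selectSpillOpcode(bool IsStore, unsigned EltSize) {
  switch (EltSize) {
  case 4:  return IsStore ? STORE_DWORD_SADDR   : LOAD_DWORD_SADDR;
  case 8:  return IsStore ? STORE_DWORDX2_SADDR : LOAD_DWORDX2_SADDR;
  case 12: return IsStore ? STORE_DWORDX3_SADDR : LOAD_DWORDX3_SADDR;
  case 16: return IsStore ? STORE_DWORDX4_SADDR : LOAD_DWORDX4_SADDR;
  }
  assert(false && "unexpected spill load/store size");
  return STORE_DWORD_SADDR;
}

int main() {
  // A 16-byte spill store selects the DWORDX4 store form.
  assert(selectSpillOpcode(true, 16) == STORE_DWORDX4_SADDR);
  return 0;
}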
@@ -768,18 +803,31 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
   bool Scavenged = false;
   MCRegister SOffset = ScratchOffsetReg;
 
-  const unsigned EltSize = 4;
   const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
-  unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
+  const bool IsAGPR = hasAGPRs(RC);
+  const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;
+
+  // Always use 4 byte operations for AGPRs because we need to scavenge
+  // a temporary VGPR.
+  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
+  unsigned NumSubRegs = RegWidth / EltSize;
   unsigned Size = NumSubRegs * EltSize;
+  unsigned RemSize = RegWidth - Size;
+  unsigned NumRemSubRegs = RemSize ? 1 : 0;
   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
-  int64_t MaxOffset = Offset + Size - EltSize;
+  int64_t MaxOffset = Offset + Size + RemSize - EltSize;
   int64_t ScratchOffsetRegDelta = 0;
 
+  if (IsFlat && EltSize > 4) {
+    LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
+    Desc = &TII->get(LoadStoreOp);
+  }
+
   Align Alignment = MFI.getObjectAlign(Index);
   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
 
-  assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
+  assert((IsFlat || ((Offset % EltSize) == 0)) &&
+         "unexpected VGPR spill offset");
 
   bool IsOffsetLegal = IsFlat
     ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true)
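A worked example of the width split this hunk computes, as a standalone sketch (the 20-byte width is illustrative, e.g. a 160-bit register class spilled via flat scratch):

#include <algorithm>
#include <cstdio>

int main() {
  // Illustrative inputs: a 20-byte (160-bit) value spilled via flat scratch.
  const unsigned RegWidth = 20;
  const bool IsFlat = true, IsAGPR = false;

  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize; // 20 / 16 == 1 wide access
  unsigned Size = NumSubRegs * EltSize;     // 16 bytes covered by wide accesses
  unsigned RemSize = RegWidth - Size;       // 4 bytes left over
  unsigned NumRemSubRegs = RemSize ? 1 : 0; // one extra narrower access

  // Prints "1 x 16 + 1 x 4": one DWORDX4 access plus one DWORD access.
  std::printf("%u x %u + %u x %u\n", NumSubRegs, EltSize, NumRemSubRegs,
              RemSize);
  return 0;
}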
@@ -840,12 +888,19 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
 
   Register TmpReg;
 
-  // FIXME: Flat scratch does not have to be limited to a dword per store.
-  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
-    Register SubReg =
-        NumSubRegs == 1
+  for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
+       ++i, RegOffset += EltSize) {
+    if (i == NumSubRegs) {
+      EltSize = RemSize;
+      LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
+    }
+    Desc = &TII->get(LoadStoreOp);
+
+    unsigned NumRegs = EltSize / 4;
+    Register SubReg = e == 1
             ? ValueReg
-            : Register(getSubReg(ValueReg, getSubRegFromChannel(i)));
+            : Register(getSubReg(ValueReg,
+                                 getSubRegFromChannel(RegOffset / 4, NumRegs)));
 
     unsigned SOffsetRegState = 0;
     unsigned SrcDstRegState = getDefRegState(!IsStore);
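Each iteration now covers EltSize bytes, i.e. EltSize / 4 consecutive 32-bit channels starting at channel RegOffset / 4, which is what getSubRegFromChannel(RegOffset / 4, NumRegs) encodes. A small sketch of that channel arithmetic, assuming an illustrative 32-byte register spilled with 16-byte accesses:

#include <cstdio>

int main() {
  // Illustrative inputs: a 32-byte (256-bit) value, 16-byte accesses.
  const unsigned RegWidth = 32, EltSize = 16;

  for (unsigned RegOffset = 0; RegOffset != RegWidth; RegOffset += EltSize) {
    unsigned FirstChannel = RegOffset / 4; // first 32-bit channel of the piece
    unsigned NumRegs = EltSize / 4;        // channels covered by one access
    // Iteration 0 covers channels 0..3 and iteration 1 covers channels 4..7,
    // mirroring getSubRegFromChannel(RegOffset / 4, NumRegs).
    std::printf("channels %u..%u\n", FirstChannel, FirstChannel + NumRegs - 1);
  }
  return 0;
}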
@@ -857,75 +912,110 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
 
     // Make sure the whole register is defined if there are undef components by
     // adding an implicit def of the super-reg on the first instruction.
-    const bool NeedSuperRegDef = NumSubRegs > 1 && IsStore && i == 0;
+    bool NeedSuperRegDef = e > 1 && IsStore && i == 0;
+    bool NeedSuperRegImpOperand = e > 1;
+
+    unsigned Lane = RegOffset / 4;
+    unsigned LaneE = (RegOffset + EltSize) / 4;
+    for ( ; Lane != LaneE; ++Lane) {
+      bool IsSubReg = e > 1 || EltSize > 4;
+      Register Sub = IsSubReg
+        ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
+        : ValueReg;
+      auto MIB = spillVGPRtoAGPR(ST, MI, Index, Lane, Sub, IsKill);
+      if (!MIB.getInstr())
+        break;
+      if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) {
+        MIB.addReg(ValueReg, RegState::ImplicitDefine);
+        NeedSuperRegDef = false;
+      }
+      if (IsSubReg || NeedSuperRegImpOperand) {
+        NeedSuperRegImpOperand = true;
+        unsigned State = SrcDstRegState;
+        if (Lane + 1 != LaneE)
+          State &= ~RegState::Kill;
+        MIB.addReg(ValueReg, RegState::Implicit | State);
+      }
+    }
 
-    auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill);
+    if (Lane == LaneE) // Fully spilled into AGPRs.
+      continue;
+
+    // Offset in bytes from the beginning of the ValueReg to its portion we
+    // still need to spill. It may differ from RegOffset if a portion of
+    // current SubReg has been already spilled into AGPRs by the loop above.
+    unsigned RemRegOffset = Lane * 4;
+    unsigned RemEltSize = EltSize - (RemRegOffset - RegOffset);
+    if (RemEltSize != EltSize) { // Partially spilled to AGPRs
+      assert(IsFlat && EltSize > 4);
+
+      unsigned NumRegs = RemEltSize / 4;
+      SubReg = Register(getSubReg(ValueReg,
+                        getSubRegFromChannel(RemRegOffset / 4, NumRegs)));
+      unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
+      Desc = &TII->get(Opc);
+    }
 
-    if (!MIB.getInstr()) {
-      unsigned FinalReg = SubReg;
+    unsigned FinalReg = SubReg;
 
-      const bool IsAGPR = hasAGPRs(RC);
-      if (IsAGPR) {
-        if (!TmpReg) {
-          assert(RS && "Needs to have RegScavenger to spill an AGPR!");
-          // FIXME: change to scavengeRegisterBackwards()
-          TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-          RS->setRegUsed(TmpReg);
-        }
-        if (IsStore) {
-          auto AccRead = BuildMI(*MBB, MI, DL,
-                                 TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
-            .addReg(SubReg, getKillRegState(IsKill));
-          if (NeedSuperRegDef)
-            AccRead.addReg(ValueReg, RegState::ImplicitDefine);
-          AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
-        }
-        SubReg = TmpReg;
+    if (IsAGPR) {
+      assert(EltSize == 4);
+
+      if (!TmpReg) {
+        assert(RS && "Needs to have RegScavenger to spill an AGPR!");
+        // FIXME: change to scavengeRegisterBackwards()
+        TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+        RS->setRegUsed(TmpReg);
+      }
+      if (IsStore) {
+        auto AccRead = BuildMI(*MBB, MI, DL,
+                               TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
+          .addReg(SubReg, getKillRegState(IsKill));
+        if (NeedSuperRegDef)
+          AccRead.addReg(ValueReg, RegState::ImplicitDefine);
+        AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
       }
+      SubReg = TmpReg;
+    }
 
-      MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
-      MachineMemOperand *NewMMO =
-          MF->getMachineMemOperand(PInfo, MMO->getFlags(), EltSize,
-                                   commonAlignment(Alignment, EltSize * i));
+    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RemRegOffset);
+    MachineMemOperand *NewMMO =
+        MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
+                                 commonAlignment(Alignment, RemRegOffset));
 
-      MIB = BuildMI(*MBB, MI, DL, *Desc)
-                .addReg(SubReg,
-                        getDefRegState(!IsStore) | getKillRegState(IsKill));
-      if (!IsFlat)
-        MIB.addReg(FuncInfo->getScratchRSrcReg());
+    auto MIB = BuildMI(*MBB, MI, DL, *Desc)
+                   .addReg(SubReg,
+                           getDefRegState(!IsStore) | getKillRegState(IsKill));
+    if (!IsFlat)
+      MIB.addReg(FuncInfo->getScratchRSrcReg());
 
-      if (SOffset == AMDGPU::NoRegister) {
-        if (!IsFlat)
-          MIB.addImm(0);
-      } else {
-        MIB.addReg(SOffset, SOffsetRegState);
-      }
-      MIB.addImm(Offset)
-          .addImm(0) // glc
-          .addImm(0) // slc
-          .addImm(0); // tfe for MUBUF or dlc for FLAT
+    if (SOffset == AMDGPU::NoRegister) {
       if (!IsFlat)
-        MIB.addImm(0) // dlc
-            .addImm(0); // swz
-      MIB.addMemOperand(NewMMO);
-
-      if (!IsAGPR && NeedSuperRegDef)
-        MIB.addReg(ValueReg, RegState::ImplicitDefine);
-
-      if (!IsStore && TmpReg != AMDGPU::NoRegister) {
-        MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
-                      FinalReg)
-                  .addReg(TmpReg, RegState::Kill);
-        MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
-      }
+        MIB.addImm(0);
     } else {
-      if (NeedSuperRegDef)
-        MIB.addReg(ValueReg, RegState::ImplicitDefine);
+      MIB.addReg(SOffset, SOffsetRegState);
+    }
+    MIB.addImm(Offset + RemRegOffset)
+        .addImm(0) // glc
+        .addImm(0) // slc
+        .addImm(0); // tfe for MUBUF or dlc for FLAT
+    if (!IsFlat)
+      MIB.addImm(0) // dlc
+          .addImm(0); // swz
+    MIB.addMemOperand(NewMMO);
+
+    if (!IsAGPR && NeedSuperRegDef)
+      MIB.addReg(ValueReg, RegState::ImplicitDefine);
+
+    if (!IsStore && TmpReg != AMDGPU::NoRegister) {
+      MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
+                    FinalReg)
+          .addReg(TmpReg, RegState::Kill);
+      MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
     }
 
-    if (NumSubRegs > 1) {
+    if (NeedSuperRegImpOperand)
       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
-    }
   }
 
   if (ScratchOffsetRegDelta != 0) {
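The RemRegOffset / RemEltSize arithmetic above handles the case where spillVGPRtoAGPR consumed only the leading lanes of the current piece, so only the tail still goes through memory. A worked example, assuming the first two 32-bit lanes of a 16-byte piece were already spilled to AGPRs:

#include <cassert>

int main() {
  // Illustrative state: a 16-byte piece at RegOffset 0; the lane loop spilled
  // lanes 0 and 1 to AGPRs and stopped at Lane == 2.
  const unsigned RegOffset = 0, EltSize = 16, Lane = 2;

  // Byte offset of the first lane that still has to go through memory, and
  // the number of bytes left: here an 8-byte (DWORDX2) access at offset 8.
  unsigned RemRegOffset = Lane * 4;
  unsigned RemEltSize = EltSize - (RemRegOffset - RegOffset);
  assert(RemRegOffset == 8 && RemEltSize == 8);
  return 0;
}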