@@ -637,6 +637,54 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
637
637
DefBuilder.addReg (ImpDefSuperReg, RegState::Define | RegState::Implicit);
638
638
}
639
639
640
+ static void expandSGPRCopy (const SIInstrInfo &TII, MachineBasicBlock &MBB,
641
+ MachineBasicBlock::iterator MI, const DebugLoc &DL,
642
+ MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
643
+ const TargetRegisterClass *RC, bool Forward) {
644
+ const SIRegisterInfo &RI = TII.getRegisterInfo ();
645
+ ArrayRef<int16_t > BaseIndices = RI.getRegSplitParts (RC, 4 );
646
+ MachineBasicBlock::iterator I = MI;
647
+ MachineInstr *FirstMI = nullptr , *LastMI = nullptr ;
648
+
649
+ for (unsigned Idx = 0 ; Idx < BaseIndices.size (); ++Idx) {
650
+ int16_t SubIdx = BaseIndices[Idx];
651
+ Register Reg = RI.getSubReg (DestReg, SubIdx);
652
+ unsigned Opcode = AMDGPU::S_MOV_B32;
653
+
654
+ // Is SGPR aligned? If so try to combine with next.
655
+ Register Src = RI.getSubReg (SrcReg, SubIdx);
656
+ bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2 ) == 0 ;
657
+ bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2 ) == 0 ;
658
+ if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size ())) {
659
+ // Can use SGPR64 copy
660
+ unsigned Channel = RI.getChannelFromSubReg (SubIdx);
661
+ SubIdx = RI.getSubRegFromChannel (Channel, 2 );
662
+ Opcode = AMDGPU::S_MOV_B64;
663
+ Idx++;
664
+ }
665
+
666
+ LastMI = BuildMI (MBB, I, DL, TII.get (Opcode), RI.getSubReg (DestReg, SubIdx))
667
+ .addReg (RI.getSubReg (SrcReg, SubIdx))
668
+ .addReg (SrcReg, RegState::Implicit);
669
+
670
+ if (!FirstMI)
671
+ FirstMI = LastMI;
672
+
673
+ if (!Forward)
674
+ I--;
675
+ }
676
+
677
+ assert (FirstMI && LastMI);
678
+ if (!Forward)
679
+ std::swap (FirstMI, LastMI);
680
+
681
+ FirstMI->addOperand (
682
+ MachineOperand::CreateReg (DestReg, true /* IsDef*/ , true /* IsImp*/ ));
683
+
684
+ if (KillSrc)
685
+ LastMI->addRegisterKilled (SrcReg, &RI);
686
+ }
687
+
640
688
void SIInstrInfo::copyPhysReg (MachineBasicBlock &MBB,
641
689
MachineBasicBlock::iterator MI,
642
690
const DebugLoc &DL, MCRegister DestReg,
@@ -842,41 +890,34 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
842
890
return ;
843
891
}
844
892
845
- unsigned EltSize = 4 ;
846
- unsigned Opcode = AMDGPU::V_MOV_B32_e32;
893
+ const bool Forward = RI.getHWRegIndex (DestReg) <= RI.getHWRegIndex (SrcReg);
847
894
if (RI.isSGPRClass (RC)) {
848
- // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32.
849
- if (!(RI.getRegSizeInBits (*RC) % 64 )) {
850
- Opcode = AMDGPU::S_MOV_B64;
851
- EltSize = 8 ;
852
- } else {
853
- Opcode = AMDGPU::S_MOV_B32;
854
- EltSize = 4 ;
855
- }
856
-
857
895
if (!RI.isSGPRClass (RI.getPhysRegClass (SrcReg))) {
858
896
reportIllegalCopy (this , MBB, MI, DL, DestReg, SrcReg, KillSrc);
859
897
return ;
860
898
}
861
- } else if (RI.hasAGPRs (RC)) {
899
+ expandSGPRCopy (*this , MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward);
900
+ return ;
901
+ }
902
+
903
+ unsigned Opcode = AMDGPU::V_MOV_B32_e32;
904
+ if (RI.hasAGPRs (RC)) {
862
905
Opcode = RI.hasVGPRs (RI.getPhysRegClass (SrcReg)) ?
863
906
AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::INSTRUCTION_LIST_END;
864
907
} else if (RI.hasVGPRs (RC) && RI.hasAGPRs (RI.getPhysRegClass (SrcReg))) {
865
908
Opcode = AMDGPU::V_ACCVGPR_READ_B32;
866
909
}
867
910
868
911
// For the cases where we need an intermediate instruction/temporary register
869
- // (the result is an SGPR, and the source is either an SGPR or AGPR), we need
870
- // a scavenger.
912
+ // (destination is an AGPR), we need a scavenger.
871
913
//
872
914
// FIXME: The pass should maintain this for us so we don't have to re-scan the
873
915
// whole block for every handled copy.
874
916
std::unique_ptr<RegScavenger> RS;
875
917
if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
876
918
RS.reset (new RegScavenger ());
877
919
878
- ArrayRef<int16_t > SubIndices = RI.getRegSplitParts (RC, EltSize);
879
- bool Forward = RI.getHWRegIndex (DestReg) <= RI.getHWRegIndex (SrcReg);
920
+ ArrayRef<int16_t > SubIndices = RI.getRegSplitParts (RC, 4 );
880
921
881
922
for (unsigned Idx = 0 ; Idx < SubIndices.size (); ++Idx) {
882
923
unsigned SubIdx;
@@ -885,7 +926,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
885
926
else
886
927
SubIdx = SubIndices[SubIndices.size () - Idx - 1 ];
887
928
888
-
889
929
bool UseKill = KillSrc && Idx == SubIndices.size () - 1 ;
890
930
891
931
if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
0 commit comments