Skip to content

Commit 758df22

Browse files
committed
[AMDGPU][True16] Support emitting copies between different register sizes.
Differential Revision: https://reviews.llvm.org/D156105
1 parent 431969e commit 758df22

File tree

2 files changed

+58
-21
lines changed

2 files changed

+58
-21
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 56 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -724,24 +724,39 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
724724
const DebugLoc &DL, MCRegister DestReg,
725725
MCRegister SrcReg, bool KillSrc) const {
726726
const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
727+
unsigned Size = RI.getRegSizeInBits(*RC);
728+
const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
729+
unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
730+
731+
// The rest of copyPhysReg assumes Src and Dst are the same size.
732+
// TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
733+
// we remove Fix16BitCopies and this code block?
734+
if (Fix16BitCopies) {
735+
if (((Size == 16) != (SrcSize == 16))) {
736+
if (ST.hasTrue16BitInsts()) {
737+
// Non-VGPR Src and Dst will later be expanded back to 32 bits.
738+
MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
739+
MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
740+
RegToFix = SubReg;
741+
} else {
742+
MCRegister &RegToFix = (Size == 16) ? DestReg : SrcReg;
743+
MCRegister Super = RI.get32BitRegister(RegToFix);
744+
assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix ||
745+
RI.getSubReg(Super, AMDGPU::hi16) == RegToFix);
746+
RegToFix = Super;
747+
}
727748

728-
// FIXME: This is a hack to resolve copies between 16 bit and 32 bit
729-
// registers until all patterns are fixed.
730-
if (Fix16BitCopies &&
731-
((RI.getRegSizeInBits(*RC) == 16) ^
732-
(RI.getRegSizeInBits(*RI.getPhysRegBaseClass(SrcReg)) == 16))) {
733-
MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
734-
MCRegister Super = RI.get32BitRegister(RegToFix);
735-
assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
736-
RegToFix = Super;
737-
738-
if (DestReg == SrcReg) {
739-
// Insert empty bundle since ExpandPostRA expects an instruction here.
740-
BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
741-
return;
749+
if (DestReg == SrcReg) {
750+
// Identity copy. Insert empty bundle since ExpandPostRA expects an
751+
// instruction here.
752+
BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
753+
return;
754+
}
755+
RC = RI.getPhysRegBaseClass(DestReg);
756+
Size = RI.getRegSizeInBits(*RC);
757+
SrcRC = RI.getPhysRegBaseClass(SrcReg);
758+
SrcSize = RI.getRegSizeInBits(*SrcRC);
742759
}
743-
744-
RC = RI.getPhysRegBaseClass(DestReg);
745760
}
746761

747762
if (RC == &AMDGPU::VGPR_32RegClass) {
@@ -865,10 +880,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
865880
return;
866881
}
867882

868-
const unsigned Size = RI.getRegSizeInBits(*RC);
869883
if (Size == 16) {
870-
assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
871-
AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
884+
assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
872885
AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
873886
AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
874887

@@ -906,6 +919,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
906919
return;
907920
}
908921

922+
if (ST.hasTrue16BitInsts()) {
923+
if (IsSGPRSrc) {
924+
assert(SrcLow);
925+
SrcReg = NewSrcReg;
926+
}
927+
// Use the smaller instruction encoding if possible.
928+
if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
929+
(IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
930+
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
931+
.addReg(SrcReg);
932+
} else {
933+
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
934+
.addImm(0) // src0_modifiers
935+
.addReg(SrcReg)
936+
.addImm(0); // op_sel
937+
}
938+
return;
939+
}
940+
909941
if (IsSGPRSrc && !ST.hasSDWAScalar()) {
910942
if (!DstLow || !SrcLow) {
911943
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
@@ -932,7 +964,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
932964
return;
933965
}
934966

935-
const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
936967
if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
937968
if (ST.hasMovB64()) {
938969
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
@@ -1288,7 +1319,11 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
12881319

12891320
if (RI.isAGPRClass(DstRC))
12901321
return AMDGPU::COPY;
1291-
if (RI.getRegSizeInBits(*DstRC) == 32) {
1322+
if (RI.getRegSizeInBits(*DstRC) == 16) {
1323+
// Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1324+
// before RA.
1325+
return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1326+
} else if (RI.getRegSizeInBits(*DstRC) == 32) {
12921327
return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
12931328
} else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
12941329
return AMDGPU::S_MOV_B64;

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -656,6 +656,7 @@ let SubtargetPredicate = isGFX11Plus in {
656656
getVOP1Pat64<int_amdgcn_permlane64,
657657
VOP_MOVRELS>.ret,
658658
/*VOP1Only=*/ 1>;
659+
defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
659660
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
660661
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
661662
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
@@ -804,6 +805,7 @@ defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11<0x03a,
804805
defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11<0x03b,
805806
"V_FFBH_I32", "v_cls_i32">;
806807
defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11<0x067>;
808+
defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11<0x01c, "v_mov_b16">;
807809
defm V_NOT_B16_t16 : VOP1_Real_FULL_t16_gfx11<0x069, "v_not_b16">;
808810
defm V_CVT_I32_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x06a, "v_cvt_i32_i16">;
809811
defm V_CVT_U32_U16_t16 : VOP1_Real_FULL_t16_gfx11<0x06b, "v_cvt_u32_u16">;

0 commit comments

Comments
 (0)