-
Notifications
You must be signed in to change notification settings - Fork 14.2k
[AMDGPU][True16][CodeGen] FLAT_load using D16 pseudo instruction #114500
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16][CodeGen] FLAT_load using D16 pseudo instruction #114500
Conversation
fecac70
to
ebed252
Compare
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) ChangesFLAT load/store supporting fake16 format Full diff: https://github.com/llvm/llvm-project/pull/114500.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index db74372e9db452..665d43fc8c93c7 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -14,6 +14,12 @@ def GlobalSAddr : ComplexPattern<iPTR, 3, "SelectGlobalSAddr", [], [SDNPWantRoot
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>;
def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [SDNPWantRoot], -10>;
+class True16D16Table <string hiOp, string loOp> {
+ Instruction T16Op = !cast<Instruction>(NAME);
+ Instruction HiOp = !cast<Instruction>(hiOp);
+ Instruction LoOp = !cast<Instruction>(loOp);
+}
+
//===----------------------------------------------------------------------===//
// FLAT classes
//===----------------------------------------------------------------------===//
@@ -225,6 +231,12 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}
+multiclass FLAT_Load_Pseudo_t16<string opName> {
+ def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>;
+ let True16Predicate = UseRealTrue16Insts in
+ def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
+}
+
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
@@ -242,6 +254,12 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
let enabled_saddr = EnableSaddr;
}
+multiclass FLAT_Store_Pseudo_t16<string opName> {
+ def "" : FLAT_Store_Pseudo<opName, VGPR_32>;
+ let True16Predicate = UseRealTrue16Insts in
+ def _t16 : FLAT_Store_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>;
+}
+
multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
let is_flat_global = 1 in {
def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>,
@@ -653,8 +671,6 @@ def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>;
def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>;
def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>;
-def FLAT_STORE_BYTE : FLAT_Store_Pseudo <"flat_store_byte", VGPR_32>;
-def FLAT_STORE_SHORT : FLAT_Store_Pseudo <"flat_store_short", VGPR_32>;
def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>;
def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
@@ -662,18 +678,21 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
let SubtargetPredicate = HasD16LoadStore in {
let TiedSourceNotRead = 1 in {
-def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32, 1>;
def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
-def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32, 1>;
+defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
-def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32, 1>;
+defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">;
}
def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>;
}
+defm FLAT_STORE_BYTE : FLAT_Store_Pseudo_t16 <"flat_store_byte">;
+defm FLAT_STORE_SHORT : FLAT_Store_Pseudo_t16 <"flat_store_short">;
+
defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap",
VGPR_32, i32, v2i32, VReg_64>;
@@ -1044,6 +1063,11 @@ class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> :
(inst $vaddr, $offset, 0, $in)
>;
+class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (vt (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset))),
+ (inst $vaddr, $offset, (i32 0))
+>;
+
class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
@@ -1366,16 +1390,22 @@ def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+ let True16Predicate = p in {
+ def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+ def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+ def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+ def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
+ def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+ def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
+}
+
def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
@@ -1454,9 +1484,6 @@ let SubtargetPredicate = isGFX12Plus in {
defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
}
-def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
-def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
-
let OtherPredicates = [HasD16LoadStore] in {
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index fce50b741bb63b..6f0b8c0d0f17d4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2419,8 +2419,15 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
// Return an AGPR+VGPR operand class for the given VGPR register class.
class getLdStRegisterOperand<RegisterClass RC> {
+ // This type of operands is only used in pseudo instructions helping
+ // code generation and thus doesn't need encoding and decoding methods.
+ // It also doesn't need to support AGPRs, because GFX908/A/40 do not
+ // support True16.
+ defvar VLdSt_16 = RegisterOperand<VGPR_16>;
+
RegisterOperand ret =
- !cond(!eq(RC.Size, 32) : AVLdSt_32,
+ !cond(!eq(RC.Size, 16) : VLdSt_16,
+ !eq(RC.Size, 32) : AVLdSt_32,
!eq(RC.Size, 64) : AVLdSt_64,
!eq(RC.Size, 96) : AVLdSt_96,
!eq(RC.Size, 128) : AVLdSt_128,
|
3537b4f
to
11c4a9d
Compare
11c4a9d
to
24013a1
Compare
24013a1
to
cf75667
Compare
Rebased and resolved conflicts |
@@ -132,6 +133,35 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { | |||
OutMI.addOperand(Dest); | |||
OutMI.addOperand(Src); | |||
return; | |||
} else if (const auto *Info = AMDGPU::getT16D16Helper(Opcode)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No else after return
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
moved to another function
MachineOperand MIVDstOrVData = MI->getOperand(VDstOrVDataIdx); | ||
bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI); | ||
Opcode = IsHi ? Info->HiOp : Info->LoOp; | ||
MIVDstOrVData.clearParent(); // Avoid use list error in setReg call | ||
MIVDstOrVData.setReg(TRI.get32BitRegister(MIVDstOrVData.getReg())); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't use MachineOperands by value. You shouldn't be fixing up MIR when you're just trying to pass this into an MCInst, just set the regitser directly
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
removed fixing up MIR
lowerOperand(MO, MCOp); | ||
OutMI.addOperand(MCOp); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand why this needs so much custom fixing up
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Matt. This is lowering D16 pseudo instruction(load/store) into D16_lo/D16_hi MCInst base on the .l/.h of the VDst/VData. Added a helper function and add more comments
cf75667
to
d8d193f
Compare
ping! Sorry for the delay. Just updated this patch and it's ready for another round of review |
@@ -113,9 +113,50 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO, | |||
llvm_unreachable("unknown operand type"); | |||
} | |||
|
|||
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { | |||
// Lower true16 D16 Opcode to d16_lo/d16_hi MCInst base on Dst/Data's .l/.h |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Lower true16 D16 Opcode to d16_lo/d16_hi MCInst base on Dst/Data's .l/.h | |
// Lower true16 D16 pseudo instruction to d16_lo/d16_hi MCInst based on Dst/Data's .l/.h |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated
// select Dst/Data | ||
int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata); | ||
int VDstOrVDataIdx = VDataIdx != -1 ? VDataIdx : 0; | ||
MachineOperand MIVDstOrVData = MI->getOperand(VDstOrVDataIdx); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
MachineOperand MIVDstOrVData = MI->getOperand(VDstOrVDataIdx); | |
MachineOperand& MIVDstOrVData = MI->getOperand(VDstOrVDataIdx); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated
d8d193f
to
05aba18
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
05aba18
to
e7ca457
Compare
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/203/builds/2220 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/159/builds/16253 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/162/builds/16415 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/15998 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/177/builds/13191 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/204/builds/1028 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/175/builds/13380 Here is the relevant piece of the build log for the reference
|
Would it be possible to revert this change? It broke quite a few build bots. Edit: already reverted, thank you Nikita! |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/200/builds/3828 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/185/builds/13321 Here is the relevant piece of the build log for the reference
|
Thanks Nikita and Chrisitian! I'll take a look in this |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/56/builds/18870 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/16/builds/13956 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/60/builds/19851 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/153/builds/23170 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/11530 Here is the relevant piece of the build log for the reference
|
…do instruction" (#127673) Previous patch is merged llvm/llvm-project#114500 and it hit a buildbot failure and thus reverted It seems the AMDGPU::OpName::OPERAND_LAST is removed at the meantime when previous patch is merged and that's causing the compile error. Fixed and reopen it here
…m#114500) Implement new pseudos with the suffix _t16 for FLAT_LOAD which have VGPR_16 as the load dst. Lower the pseudos to the existing real instructions with VGPR_32 src or dst (which makes them consistent with the hardware encoding). This patch reduces VGPR usage by making hi halves of VGPRs available for other values. There are more 8/16 bits ld/st instructions to be supported in the up-coming patches
…ion (llvm#114500)" This reverts commit f7a5f06. Fails to build with: llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp:126:37: error: no member named 'OPERAND_LAST' in 'llvm::AMDGPU::OpName' 126 | uint16_t OpName = AMDGPU::OpName::OPERAND_LAST;
…ion" (llvm#127673) Previous patch is merged llvm#114500 and it hit a buildbot failure and thus reverted It seems the AMDGPU::OpName::OPERAND_LAST is removed at the meantime when previous patch is merged and that's causing the compile error. Fixed and reopen it here
Implement new pseudos with the suffix _t16 for FLAT_LOAD which have VGPR_16 as the load dst. Lower the pseudos to the existing real instructions with VGPR_32 src or dst (which makes them consistent with the hardware encoding). This patch reduces VGPR usage by making hi halves of VGPRs available for other values.
There are more 8/16 bits ld/st instructions to be supported in the up-coming patches