-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU][True16] Support V_CEIL_F16. #73108
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-amdgpu Author: Ivan Kosarev (kosarev) Changes: As not all fake instructions have their real counterparts implemented Source DPP and destination VOPDstOperand_t16 operands are still not Patch is 51.44 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73108.diff 16 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index be74c627d213756..5f20cc7310fdc05 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -50,6 +50,8 @@ class AMDGPUAsmParser;
enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_AGPR, IS_TTMP, IS_SPECIAL };
+enum class RegisterSuffix { None, Lo, Hi };
+
//===----------------------------------------------------------------------===//
// Operand
//===----------------------------------------------------------------------===//
@@ -1333,10 +1335,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
unsigned ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
unsigned &RegWidth, SmallVectorImpl<AsmToken> &Tokens);
bool ParseRegRange(unsigned& Num, unsigned& Width);
- unsigned getRegularReg(RegisterKind RegKind,
- unsigned RegNum,
- unsigned RegWidth,
- SMLoc Loc);
+ unsigned getRegularReg(RegisterKind RegKind, unsigned RegNum,
+ unsigned RegWidth, RegisterSuffix Suffix, SMLoc Loc);
bool isRegister();
bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const;
@@ -2303,7 +2303,17 @@ bool AMDGPUOperand::isInlineValue() const {
// AsmParser
//===----------------------------------------------------------------------===//
-static int getRegClass(RegisterKind Is, unsigned RegWidth) {
+static int getRegClass(RegisterKind Is, unsigned RegWidth,
+ RegisterSuffix Suffix) {
+ if (Suffix != RegisterSuffix::None) {
+ if (Is == IS_VGPR && RegWidth == 16) {
+ if (Suffix == RegisterSuffix::Lo)
+ return AMDGPU::VGPR_LO16RegClassID;
+ assert(Suffix == RegisterSuffix::Hi);
+ return AMDGPU::VGPR_HI16RegClassID;
+ }
+ return -1;
+ }
if (Is == IS_VGPR) {
switch (RegWidth) {
default: return -1;
@@ -2590,8 +2600,10 @@ AMDGPUAsmParser::isRegister(const AsmToken &Token,
StringRef RegName = Reg->Name;
StringRef RegSuffix = Str.substr(RegName.size());
if (!RegSuffix.empty()) {
- unsigned Num;
+ RegSuffix.consume_back(".l");
+ RegSuffix.consume_back(".h");
// A single register with an index: rXX
+ unsigned Num;
if (getRegNum(RegSuffix, Num))
return true;
} else {
@@ -2610,12 +2622,9 @@ AMDGPUAsmParser::isRegister()
return isRegister(getToken(), peekToken());
}
-unsigned
-AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
- unsigned RegNum,
- unsigned RegWidth,
- SMLoc Loc) {
-
+unsigned AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum,
+ unsigned RegWidth,
+ RegisterSuffix Suffix, SMLoc Loc) {
assert(isRegularReg(RegKind));
unsigned AlignSize = 1;
@@ -2631,7 +2640,7 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
}
unsigned RegIdx = RegNum / AlignSize;
- int RCID = getRegClass(RegKind, RegWidth);
+ int RCID = getRegClass(RegKind, RegWidth, Suffix);
if (RCID == -1) {
Error(Loc, "invalid or unsupported register size");
return AMDGPU::NoRegister;
@@ -2722,20 +2731,30 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
RegKind = RI->Kind;
StringRef RegSuffix = RegName.substr(RI->Name.size());
+ RegisterSuffix SuffixKind = RegisterSuffix::None;
if (!RegSuffix.empty()) {
+ if (RegSuffix.consume_back(".l")) {
+ RegWidth = 16;
+ SuffixKind = RegisterSuffix::Lo;
+ } else if (RegSuffix.consume_back(".h")) {
+ RegWidth = 16;
+ SuffixKind = RegisterSuffix::Hi;
+ } else {
+ RegWidth = 32;
+ }
+
// Single 32-bit register: vXX.
if (!getRegNum(RegSuffix, RegNum)) {
Error(Loc, "invalid register index");
return AMDGPU::NoRegister;
}
- RegWidth = 32;
} else {
// Range of registers: v[XX:YY]. ":YY" is optional.
if (!ParseRegRange(RegNum, RegWidth))
return AMDGPU::NoRegister;
}
- return getRegularReg(RegKind, RegNum, RegWidth, Loc);
+ return getRegularReg(RegKind, RegNum, RegWidth, SuffixKind, Loc);
}
unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
@@ -2786,8 +2805,10 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
return AMDGPU::NoRegister;
}
- if (isRegularReg(RegKind))
- Reg = getRegularReg(RegKind, RegNum, RegWidth, ListLoc);
+ if (isRegularReg(RegKind)) {
+ Reg =
+ getRegularReg(RegKind, RegNum, RegWidth, RegisterSuffix::None, ListLoc);
+ }
return Reg;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c4baabcd9232b56..94fa9fa4cc2aa5e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5250,10 +5250,15 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
- case AMDGPU::S_CEIL_F16: return AMDGPU::V_CEIL_F16_t16_e64;
- case AMDGPU::S_FLOOR_F16: return AMDGPU::V_FLOOR_F16_t16_e64;
- case AMDGPU::S_TRUNC_F16: return AMDGPU::V_TRUNC_F16_t16_e64;
- case AMDGPU::S_RNDNE_F16: return AMDGPU::V_RNDNE_F16_t16_e64;
+ case AMDGPU::S_CEIL_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
+ : AMDGPU::V_CEIL_F16_fake16_e64;
+ case AMDGPU::S_FLOOR_F16:
+ return AMDGPU::V_FLOOR_F16_fake16_e64;
+ case AMDGPU::S_TRUNC_F16:
+ return AMDGPU::V_TRUNC_F16_fake16_e64;
+ case AMDGPU::S_RNDNE_F16:
+ return AMDGPU::V_RNDNE_F16_fake16_e64;
case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 53b0513c85d8864..3850f5ef477bd7d 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -74,6 +74,7 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
+ let OtherPredicates = ps.OtherPredicates;
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
let Constraints = ps.Constraints;
@@ -151,8 +152,11 @@ multiclass VOP1Inst_t16<string opName,
let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts] in {
defm NAME : VOP1Inst<opName, P, node>;
}
- let OtherPredicates = [HasTrue16BitInsts] in {
- defm _t16 : VOP1Inst<opName#"_t16", VOPProfile_Fake16<P>, node>;
+ let OtherPredicates = [UseRealTrue16Insts] in {
+ defm _t16 : VOP1Inst<opName#"_t16", VOPProfile_True16<P>, node>;
+ }
+ let OtherPredicates = [UseFakeTrue16Insts] in {
+ defm _fake16 : VOP1Inst<opName#"_fake16", VOPProfile_Fake16<P>, node>;
}
}
@@ -673,6 +677,7 @@ class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP1
let SchedRW = ps.SchedRW;
let Uses = ps.Uses;
let TRANS = ps.TRANS;
+ let OtherPredicates = ps.OtherPredicates;
bits<8> vdst;
let Inst{8-0} = 0xfa;
@@ -694,6 +699,7 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
let Defs = ps.Defs;
let SchedRW = ps.SchedRW;
let Uses = ps.Uses;
+ let OtherPredicates = ps.OtherPredicates;
bits<8> vdst;
let Inst{8-0} = fi;
@@ -706,15 +712,16 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
// GFX11.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
+let AssemblerPredicate = isGFX11Only in {
multiclass VOP1Only_Real_gfx11<bits<9> op> {
- let IsSingle = 1 in
+ let DecoderNamespace = "GFX11", IsSingle = 1 in
def _gfx11 :
VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.GFX11>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
}
multiclass VOP1_Real_e32_gfx11<bits<9> op, string opName = NAME> {
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ let DecoderNamespace = "GFX11" in
def _e32_gfx11 :
VOP1_Real<ps, SIEncodingFamily.GFX11>,
VOP1e<op{7-0}, ps.Pfl>;
@@ -722,11 +729,13 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
multiclass VOP1_Real_e32_with_name_gfx11<bits<9> op, string opName,
string asmName> {
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
- let AsmString = asmName # ps.AsmOperands in {
+ let AsmString = asmName # ps.AsmOperands,
+ DecoderNamespace = !if(ps.Pfl.IsRealTrue16, "GFX11", "GFX11_FAKE16") in {
defm NAME : VOP1_Real_e32_gfx11<op, opName>;
}
}
multiclass VOP1_Real_e64_gfx11<bits<9> op> {
+ let DecoderNamespace = "GFX11" in
def _e64_gfx11 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX11>,
VOP3e_gfx11<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
@@ -740,7 +749,9 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
multiclass VOP1_Real_dpp_with_name_gfx11<bits<9> op, string opName,
string asmName> {
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
- let AsmString = asmName # ps.Pfl.AsmDPP16, DecoderNamespace = "DPPGFX11" in {
+ let AsmString = asmName # ps.Pfl.AsmDPP16,
+ DecoderNamespace = !if(ps.Pfl.IsRealTrue16,
+ "DPPGFX11", "DPPGFX11_FAKE16") in {
defm NAME : VOP1_Real_dpp_gfx11<op, opName>;
}
}
@@ -753,11 +764,13 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
multiclass VOP1_Real_dpp8_with_name_gfx11<bits<9> op, string opName,
string asmName> {
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
- let AsmString = asmName # ps.Pfl.AsmDPP8, DecoderNamespace = "DPP8GFX11" in {
+ let AsmString = asmName # ps.Pfl.AsmDPP8,
+ DecoderNamespace = !if(ps.Pfl.IsRealTrue16,
+ "DPP8GFX11", "DPP8GFX11_FAKE16") in {
defm NAME : VOP1_Real_dpp8_gfx11<op, opName>;
}
}
-} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11"
+} // End AssemblerPredicate = isGFX11Only
multiclass VOP1_Realtriple_e64_gfx11<bits<9> op> {
defm NAME : VOP3_Realtriple_gfx11<{0, 1, 1, op{6-0}}, /*isSingle=*/ 0, NAME>;
@@ -806,29 +819,30 @@ defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11<0x03b,
"V_FFBH_I32", "v_cls_i32">;
defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11<0x067>;
defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11<0x01c, "v_mov_b16">;
-defm V_NOT_B16_t16 : VOP1_Real_FULL_t16_gfx11<0x069, "v_not_b16">;
-defm V_CVT_I32_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x06a, "v_cvt_i32_i16">;
-defm V_CVT_U32_U16_t16 : VOP1_Real_FULL_t16_gfx11<0x06b, "v_cvt_u32_u16">;
+defm V_NOT_B16_fake16 : VOP1_Real_FULL_t16_gfx11<0x069, "v_not_b16">;
+defm V_CVT_I32_I16_fake16 : VOP1_Real_FULL_t16_gfx11<0x06a, "v_cvt_i32_i16">;
+defm V_CVT_U32_U16_fake16 : VOP1_Real_FULL_t16_gfx11<0x06b, "v_cvt_u32_u16">;
defm V_CVT_F16_U16_t16 : VOP1_Real_FULL_t16_gfx11<0x050, "v_cvt_f16_u16">;
defm V_CVT_F16_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x051, "v_cvt_f16_i16">;
defm V_CVT_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x052, "v_cvt_u16_f16">;
defm V_CVT_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x053, "v_cvt_i16_f16">;
-defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x054, "v_rcp_f16">;
-defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x055, "v_sqrt_f16">;
-defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x056, "v_rsq_f16">;
-defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x057, "v_log_f16">;
-defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x058, "v_exp_f16">;
-defm V_FREXP_MANT_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x059, "v_frexp_mant_f16">;
+defm V_RCP_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x054, "v_rcp_f16">;
+defm V_SQRT_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x055, "v_sqrt_f16">;
+defm V_RSQ_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x056, "v_rsq_f16">;
+defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x057, "v_log_f16">;
+defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x058, "v_exp_f16">;
+defm V_FREXP_MANT_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x059, "v_frexp_mant_f16">;
defm V_FREXP_EXP_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05a, "v_frexp_exp_i16_f16">;
-defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05b, "v_floor_f16">;
+defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x05b, "v_floor_f16">;
defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05c, "v_ceil_f16">;
-defm V_TRUNC_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05d, "v_trunc_f16">;
-defm V_RNDNE_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05e, "v_rndne_f16">;
-defm V_FRACT_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05f, "v_fract_f16">;
-defm V_SIN_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x060, "v_sin_f16">;
-defm V_COS_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x061, "v_cos_f16">;
-defm V_SAT_PK_U8_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x062, "v_sat_pk_u8_i16">;
+defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x05c, "v_ceil_f16">;
+defm V_TRUNC_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x05d, "v_trunc_f16">;
+defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x05e, "v_rndne_f16">;
+defm V_FRACT_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x05f, "v_fract_f16">;
+defm V_SIN_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x060, "v_sin_f16">;
+defm V_COS_F16_fake16 : VOP1_Real_FULL_t16_gfx11<0x061, "v_cos_f16">;
+defm V_SAT_PK_U8_I16_fake16 : VOP1_Real_FULL_t16_gfx11<0x062, "v_sat_pk_u8_i16">;
defm V_CVT_NORM_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x063, "v_cvt_norm_i16_f16">;
defm V_CVT_NORM_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x064, "v_cvt_norm_u16_f16">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
index d9ba03f95a1cfba..a270dffe7b595f7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
@@ -1,5 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s
---
name: fceil_s16_ss
@@ -36,12 +38,29 @@ body: |
bb.0:
liveins: $vgpr0
- ; GCN-LABEL: name: fceil_s16_vv
- ; GCN: liveins: $vgpr0
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GCN-NEXT: %2:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr0 = COPY %2
+ ; GFX8-LABEL: name: fceil_s16_vv
+ ; GFX8: liveins: $vgpr0
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]]
+ ;
+ ; GFX11-LABEL: name: fceil_s16_vv
+ ; GFX11: liveins: $vgpr0
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr_16(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[TRUNC]](s16), 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s16) = COPY [[V_CEIL_F16_t16_e64_]]
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY [[COPY1]](s16)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ;
+ ; GFX11-FAKE16-LABEL: name: fceil_s16_vv
+ ; GFX11-FAKE16: liveins: $vgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CEIL_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CEIL_F16_fake16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_FCEIL %1
@@ -59,12 +78,27 @@ body: |
bb.0:
liveins: $sgpr0
- ; GCN-LABEL: name: fceil_s16_vs
- ; GCN: liveins: $sgpr0
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: %2:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr0 = COPY %2
+ ; GFX8-LABEL: name: fceil_s16_vs
+ ; GFX8: liveins: $sgpr0
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]]
+ ;
+ ; GFX11-LABEL: name: fceil_s16_vs
+ ; GFX11: liveins: $sgpr0
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
+ ;
+ ; GFX11-FAKE16-LABEL: name: fceil_s16_vs
+ ; GFX11-FAKE16: liveins: $sgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CEIL_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CEIL_F16_fake16_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_FCEIL %1
@@ -82,12 +116,29 @@ body: |
bb.0:
liveins: $vgpr0
- ; GCN-LABEL: name: fceil_fneg_s16_vv
- ; GCN: liveins: $vgpr0
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GCN-NEXT: %3:vgpr_32 = nofpexcept V_CEIL_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr0 = COPY %3
+ ; GFX8-LABEL: name: fceil_fneg_s16_vv
+ ; GFX8: liveins: $vgpr0
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]]
+ ;
+ ; GFX11-LABEL: name: fceil_fneg_s16_vv
+ ; GFX11: liveins: $vgpr0
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr_16(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[TRUNC]](s16), 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s16) = COPY [[V_CEIL_F16_t16_e64_]]
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY [[COPY1]](s16)
+ ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]](s32)
+ ;
+ ; GFX11-FAKE16-LABEL: name: fceil_fneg_s16_vv
+ ; GFX11-FAKE16: liveins: $vgpr0
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[V_CEIL_F16...
[truncated]
|
Parsing True16 registers here slightly differs from the downstream implementation where we use an Tests are pre-committed, but the commit is part of this very pull request. Would that be okay? For this pilot change I'm going to prepare a corresponding downstream patch; further upstream changes of the kind should be trivially mergeable, I expect. |
return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64 | ||
: AMDGPU::V_CEIL_F16_fake16_e64; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@mbrkusanin Mirko, we don't seem to have any coverage for this S_CEIL_F16
case. Should we add some?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not every instruction is covered, only one per group. "cmp_f16" test from llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir was supposed to cover s_*_f16 insts.
For the fake16 everything should work, but for t16 regclass is different, so this change is not enough. So, yeah we should have at least one test for the t16 ones.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. I've added a test and updated the S_* -> V_* translation to handle the case.
In my opinion yes, that's a good way to do it. It allows reviewers to see the test diffs in the second patch. |
Added some coverage for S_CEIL_F16 and updated asm/disasm tests to be more in line with downstream to reduce the merging pain. |
if (Suffix != RegisterSuffix::None) { | ||
if (Is == IS_VGPR && RegWidth == 16) { | ||
if (Suffix == RegisterSuffix::Lo) | ||
return AMDGPU::VGPR_LO16RegClassID; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
VGPR_LO16 and VGPR_HI16 are scarcely used and can be replaced with VGPR_16 at some point in the True16 support timeline. So is it necessary to use them here for the transition ? If so, please put a comment to replace the register class with VGPR_16RegClassID and remove the Suffix parameter. If not, please switch it to VGPR_16RegClassID now
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are we absolutely sure we want the register classes removed? If not yet, I would hesitate to complicate the code prematurely and suggest that we wait for the change removing the classes and then assess how much that looks a good idea.
How would the comment be useful? Once the classes are removed, there is no way this case could be missed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Regardless of the future, I think using VGPR_16 right now would be less complicated. Why use two separate register classes when one will do? We should consolidate functionality on VGPR_16
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think using VGPR_16 right now would be less complicated. Why use two separate register classes when one will do?
It can be seen downstream that using VGPR_16 involves special-casing that is not needed if we just follow the existing design. Looks neither simpler, nor clearer, nor more reliable to me:
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
const MCRegisterClass RC = TRI->getRegClass(RCID);
if (RCID == VGPR_16RegClassID || RCID == VGPR_16_Lo128RegClassID)
// RegIdx depends on register class definition in SIRegisterInfo.td
// Expected register order is alternating lo and hi :
// VGPR0_LO16, VGPR0_HI16, VGPR1_LO16, VGPR1_HI16, ...
RegIdx = RegIdx * 2 + (unsigned)IsHigh;
if (RegIdx >= RC.getNumRegs()) {
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This proposed design uses 2 new register classes (or is it 4 if you need to have VGPR_HI16_Lo128 and VGPR_Lo16_Lo128 as well) whereas the other needs 2 lines to compute RegIdx. I don't think that's warranted.
If you still think this way is better, please prepare a patch downstream to show the benefit. Or alternatively, you could try removing VGPR_HI16 and VGPR_LO16 right away.
It is extremely difficult to review changes that contain 1) porting of downstream code, 2) transition code necessary for the true16 and fake16 versions to coexist and 3) refactoring/alternative implementations. Please separate them as much as possible. Refactoring could be done either downstream in advance of upstreaming, or on upstream code after direct porting.
@@ -2303,7 +2303,17 @@ bool AMDGPUOperand::isInlineValue() const { | |||
// AsmParser | |||
//===----------------------------------------------------------------------===// | |||
|
|||
static int getRegClass(RegisterKind Is, unsigned RegWidth) { | |||
static int getRegClass(RegisterKind Is, unsigned RegWidth, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"Is" is weird naming. Kind?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree, but renaming it here would be a bunch of unrelated changes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this need an update?
Updated to match the downstream and rebased. |
✅ With the latest revision this PR passed the C/C++ code formatter. |
As not all fake instructions have their real counterparts implemented yet, we specify no AssemblerPredicate for UseFakeTrue16Insts to allow both fake and real True16 instructions in assembler and disassembler tests in the -mattr=+real-true16 mode during the transition period. Source DPP and destination VOPDstOperand_t16 operands are still not supported and will be addressed separately.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! Using subreg in the AsmParser looks clean. LGTM
As not all fake instructions have their real counterparts implemented yet, we specify no AssemblerPredicate for UseFakeTrue16Insts to allow both fake and real True16 instructions in assembler and disassembler tests in the -mattr=+real-true16 mode during the transition period. Source DPP and destination VOPDstOperand_t16 operands are still not supported and will be addressed separately.
As not all fake instructions have their real counterparts implemented
yet, we specify no AssemblerPredicate for UseFakeTrue16Insts to allow
both fake and real True16 instructions in assembler and disassembler
tests in the -mattr=+real-true16 mode during the transition period.
Source DPP and destination VOPDstOperand_t16 operands are still not
supported and will be addressed separately.