Skip to content

Commit e7632a7

Browse files
committed
[AMDGPU][True16] Support V_CEIL_F16.
As not all fake instructions have their real counterparts implemented yet, we specify no AssemblerPredicate for UseFakeTrue16Insts to allow both fake and real True16 instructions in assembler and disassembler tests in the -mattr=+real-true16 mode during the transition period. Source DPP and destination VOPDstOperand_t16 operands are still not supported and will be addressed separately.
1 parent a889cbf commit e7632a7

14 files changed

+289
-160
lines changed

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,10 +1342,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
13421342
unsigned ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
13431343
unsigned &RegWidth, SmallVectorImpl<AsmToken> &Tokens);
13441344
bool ParseRegRange(unsigned& Num, unsigned& Width);
1345-
unsigned getRegularReg(RegisterKind RegKind,
1346-
unsigned RegNum,
1347-
unsigned RegWidth,
1348-
SMLoc Loc);
1345+
unsigned getRegularReg(RegisterKind RegKind, unsigned RegNum, unsigned SubReg,
1346+
unsigned RegWidth, SMLoc Loc);
13491347

13501348
bool isRegister();
13511349
bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const;
@@ -2616,6 +2614,8 @@ AMDGPUAsmParser::isRegister(const AsmToken &Token,
26162614
StringRef RegName = Reg->Name;
26172615
StringRef RegSuffix = Str.substr(RegName.size());
26182616
if (!RegSuffix.empty()) {
2617+
RegSuffix.consume_back(".l");
2618+
RegSuffix.consume_back(".h");
26192619
unsigned Num;
26202620
// A single register with an index: rXX
26212621
if (getRegNum(RegSuffix, Num))
@@ -2636,12 +2636,9 @@ AMDGPUAsmParser::isRegister()
26362636
return isRegister(getToken(), peekToken());
26372637
}
26382638

2639-
unsigned
2640-
AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
2641-
unsigned RegNum,
2642-
unsigned RegWidth,
2643-
SMLoc Loc) {
2644-
2639+
unsigned AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum,
2640+
unsigned SubReg, unsigned RegWidth,
2641+
SMLoc Loc) {
26452642
assert(isRegularReg(RegKind));
26462643

26472644
unsigned AlignSize = 1;
@@ -2670,7 +2667,17 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
26702667
return AMDGPU::NoRegister;
26712668
}
26722669

2673-
return RC.getRegister(RegIdx);
2670+
unsigned Reg = RC.getRegister(RegIdx);
2671+
2672+
if (SubReg) {
2673+
Reg = TRI->getSubReg(Reg, SubReg);
2674+
2675+
// Currently all regular registers have their .l and .h subregisters, so
2676+
// we should never need to generate an error here.
2677+
assert(Reg && "Invalid subregister!");
2678+
}
2679+
2680+
return Reg;
26742681
}
26752682

26762683
bool AMDGPUAsmParser::ParseRegRange(unsigned &Num, unsigned &RegWidth) {
@@ -2748,7 +2755,17 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
27482755

27492756
RegKind = RI->Kind;
27502757
StringRef RegSuffix = RegName.substr(RI->Name.size());
2758+
unsigned SubReg = NoSubRegister;
27512759
if (!RegSuffix.empty()) {
2760+
// We don't know the opcode till we are done parsing, so we don't know if
2761+
// registers should be 16 or 32 bit. It is therefore mandatory to put .l or
2762+
// .h to correctly specify 16 bit registers. We also can't determine class
2763+
// VGPR_16_Lo128 or VGPR_16, so always parse them as VGPR_16.
2764+
if (RegSuffix.consume_back(".l"))
2765+
SubReg = AMDGPU::lo16;
2766+
else if (RegSuffix.consume_back(".h"))
2767+
SubReg = AMDGPU::hi16;
2768+
27522769
// Single 32-bit register: vXX.
27532770
if (!getRegNum(RegSuffix, RegNum)) {
27542771
Error(Loc, "invalid register index");
@@ -2761,7 +2778,7 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
27612778
return AMDGPU::NoRegister;
27622779
}
27632780

2764-
return getRegularReg(RegKind, RegNum, RegWidth, Loc);
2781+
return getRegularReg(RegKind, RegNum, SubReg, RegWidth, Loc);
27652782
}
27662783

27672784
unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
@@ -2813,7 +2830,7 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
28132830
}
28142831

28152832
if (isRegularReg(RegKind))
2816-
Reg = getRegularReg(RegKind, RegNum, RegWidth, ListLoc);
2833+
Reg = getRegularReg(RegKind, RegNum, NoSubRegister, RegWidth, ListLoc);
28172834

28182835
return Reg;
28192836
}

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5276,10 +5276,15 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
52765276
case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
52775277
case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
52785278
case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5279-
case AMDGPU::S_CEIL_F16: return AMDGPU::V_CEIL_F16_t16_e64;
5280-
case AMDGPU::S_FLOOR_F16: return AMDGPU::V_FLOOR_F16_t16_e64;
5281-
case AMDGPU::S_TRUNC_F16: return AMDGPU::V_TRUNC_F16_t16_e64;
5282-
case AMDGPU::S_RNDNE_F16: return AMDGPU::V_RNDNE_F16_t16_e64;
5279+
case AMDGPU::S_CEIL_F16:
5280+
return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5281+
: AMDGPU::V_CEIL_F16_fake16_e64;
5282+
case AMDGPU::S_FLOOR_F16:
5283+
return AMDGPU::V_FLOOR_F16_fake16_e64;
5284+
case AMDGPU::S_TRUNC_F16:
5285+
return AMDGPU::V_TRUNC_F16_fake16_e64;
5286+
case AMDGPU::S_RNDNE_F16:
5287+
return AMDGPU::V_RNDNE_F16_fake16_e64;
52835288
case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
52845289
case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
52855290
case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
@@ -5328,15 +5333,15 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
53285333
case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
53295334
case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
53305335
case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5331-
case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_t16_e64;
5336+
case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
53325337
case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5333-
case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_t16_e64;
5338+
case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
53345339
case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5335-
case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_t16_e64;
5340+
case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
53365341
case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5337-
case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_t16_e64;
5342+
case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
53385343
case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5339-
case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_t16_e64;
5344+
case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
53405345
}
53415346
llvm_unreachable(
53425347
"Unexpected scalar opcode without corresponding vector one!");
@@ -7266,8 +7271,14 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
72667271
if (AMDGPU::getNamedOperandIdx(NewOpcode,
72677272
AMDGPU::OpName::src0_modifiers) >= 0)
72687273
NewInstr.addImm(0);
7269-
if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0) >= 0)
7270-
NewInstr->addOperand(Inst.getOperand(1));
7274+
if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7275+
MachineOperand Src = Inst.getOperand(1);
7276+
if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7277+
Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7278+
NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7279+
else
7280+
NewInstr->addOperand(Src);
7281+
}
72717282

72727283
if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
72737284
// We are converting these to a BFE, so we need to add the missing

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
7474

7575
// copy relevant pseudo op flags
7676
let SubtargetPredicate = ps.SubtargetPredicate;
77+
let OtherPredicates = ps.OtherPredicates;
7778
let AsmMatchConverter = ps.AsmMatchConverter;
7879
let AsmVariantName = ps.AsmVariantName;
7980
let Constraints = ps.Constraints;
@@ -157,8 +158,11 @@ multiclass VOP1Inst_t16<string opName,
157158
let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts] in {
158159
defm NAME : VOP1Inst<opName, P, node>;
159160
}
160-
let OtherPredicates = [HasTrue16BitInsts] in {
161-
defm _t16 : VOP1Inst<opName#"_t16", VOPProfile_Fake16<P>, node>;
161+
let OtherPredicates = [UseRealTrue16Insts] in {
162+
defm _t16 : VOP1Inst<opName#"_t16", VOPProfile_True16<P>, node>;
163+
}
164+
let OtherPredicates = [UseFakeTrue16Insts] in {
165+
defm _fake16 : VOP1Inst<opName#"_fake16", VOPProfile_Fake16<P>, node>;
162166
}
163167
}
164168

@@ -679,6 +683,7 @@ class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP1
679683
let SchedRW = ps.SchedRW;
680684
let Uses = ps.Uses;
681685
let TRANS = ps.TRANS;
686+
let OtherPredicates = ps.OtherPredicates;
682687

683688
bits<8> vdst;
684689
let Inst{8-0} = 0xfa;
@@ -707,6 +712,7 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
707712
let Defs = ps.Defs;
708713
let SchedRW = ps.SchedRW;
709714
let Uses = ps.Uses;
715+
let OtherPredicates = ps.OtherPredicates;
710716

711717
bits<8> vdst;
712718
let Inst{8-0} = fi;
@@ -742,7 +748,9 @@ multiclass VOP1_Real_e32<GFXGen Gen, bits<9> op, string opName = NAME> {
742748
multiclass VOP1_Real_e32_with_name<GFXGen Gen, bits<9> op, string opName,
743749
string asmName> {
744750
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
745-
let AsmString = asmName # ps.AsmOperands in {
751+
let AsmString = asmName # ps.AsmOperands,
752+
DecoderNamespace = Gen.DecoderNamespace #
753+
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
746754
defm NAME : VOP1_Real_e32<Gen, op, opName>;
747755
}
748756
}
@@ -761,7 +769,9 @@ multiclass VOP1_Real_dpp<GFXGen Gen, bits<9> op, string opName = NAME> {
761769
multiclass VOP1_Real_dpp_with_name<GFXGen Gen, bits<9> op, string opName,
762770
string asmName> {
763771
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
764-
let AsmString = asmName # ps.Pfl.AsmDPP16 in {
772+
let AsmString = asmName # ps.Pfl.AsmDPP16,
773+
DecoderNamespace = "DPP" # Gen.DecoderNamespace #
774+
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
765775
defm NAME : VOP1_Real_dpp<Gen, op, opName>;
766776
}
767777
}
@@ -774,7 +784,9 @@ multiclass VOP1_Real_dpp8<GFXGen Gen, bits<9> op, string opName = NAME> {
774784
multiclass VOP1_Real_dpp8_with_name<GFXGen Gen, bits<9> op, string opName,
775785
string asmName> {
776786
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
777-
let AsmString = asmName # ps.Pfl.AsmDPP8 in {
787+
let AsmString = asmName # ps.Pfl.AsmDPP8,
788+
DecoderNamespace = "DPP8" # Gen.DecoderNamespace #
789+
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
778790
defm NAME : VOP1_Real_dpp8<Gen, op, opName>;
779791
}
780792
}
@@ -854,29 +866,30 @@ defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b,
854866
"V_FFBH_I32", "v_cls_i32">;
855867
defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>;
856868
defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">;
857-
defm V_NOT_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">;
858-
defm V_CVT_I32_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">;
859-
defm V_CVT_U32_U16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">;
869+
defm V_NOT_B16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">;
870+
defm V_CVT_I32_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">;
871+
defm V_CVT_U32_U16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">;
860872

861873
defm V_CVT_F16_U16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x050, "v_cvt_f16_u16">;
862874
defm V_CVT_F16_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x051, "v_cvt_f16_i16">;
863875
defm V_CVT_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x052, "v_cvt_u16_f16">;
864876
defm V_CVT_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x053, "v_cvt_i16_f16">;
865-
defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">;
866-
defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">;
867-
defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">;
868-
defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">;
869-
defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
870-
defm V_FREXP_MANT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">;
877+
defm V_RCP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">;
878+
defm V_SQRT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">;
879+
defm V_RSQ_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">;
880+
defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">;
881+
defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
882+
defm V_FREXP_MANT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">;
871883
defm V_FREXP_EXP_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">;
872-
defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
884+
defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
873885
defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
874-
defm V_TRUNC_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05d, "v_trunc_f16">;
875-
defm V_RNDNE_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">;
876-
defm V_FRACT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">;
877-
defm V_SIN_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16">;
878-
defm V_COS_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">;
879-
defm V_SAT_PK_U8_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">;
886+
defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
887+
defm V_TRUNC_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05d, "v_trunc_f16">;
888+
defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">;
889+
defm V_FRACT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">;
890+
defm V_SIN_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16">;
891+
defm V_COS_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">;
892+
defm V_SAT_PK_U8_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">;
880893
defm V_CVT_NORM_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">;
881894
defm V_CVT_NORM_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x064, "v_cvt_norm_u16_f16">;
882895

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,15 +49,17 @@ body: |
4949
; GFX11: liveins: $vgpr0
5050
; GFX11-NEXT: {{ $}}
5151
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
52-
; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
53-
; GFX11-NEXT: $vgpr0 = COPY [[V_CEIL_F16_t16_e64_]]
52+
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
53+
; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
54+
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
55+
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
5456
;
5557
; GFX11-FAKE16-LABEL: name: fceil_s16_vv
5658
; GFX11-FAKE16: liveins: $vgpr0
5759
; GFX11-FAKE16-NEXT: {{ $}}
5860
; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
59-
; GFX11-FAKE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
60-
; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CEIL_F16_t16_e64_]]
61+
; GFX11-FAKE16-NEXT: [[V_CEIL_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
62+
; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CEIL_F16_fake16_e64_]]
6163
%0:vgpr(s32) = COPY $vgpr0
6264
%1:vgpr(s16) = G_TRUNC %0
6365
%2:vgpr(s16) = G_FCEIL %1
@@ -86,15 +88,16 @@ body: |
8688
; GFX11: liveins: $sgpr0
8789
; GFX11-NEXT: {{ $}}
8890
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
89-
; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
90-
; GFX11-NEXT: $vgpr0 = COPY [[V_CEIL_F16_t16_e64_]]
91+
; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
92+
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
93+
; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
9194
;
9295
; GFX11-FAKE16-LABEL: name: fceil_s16_vs
9396
; GFX11-FAKE16: liveins: $sgpr0
9497
; GFX11-FAKE16-NEXT: {{ $}}
9598
; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
96-
; GFX11-FAKE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
97-
; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CEIL_F16_t16_e64_]]
99+
; GFX11-FAKE16-NEXT: [[V_CEIL_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
100+
; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CEIL_F16_fake16_e64_]]
98101
%0:sgpr(s32) = COPY $sgpr0
99102
%1:sgpr(s16) = G_TRUNC %0
100103
%2:vgpr(s16) = G_FCEIL %1
@@ -123,15 +126,17 @@ body: |
123126
; GFX11: liveins: $vgpr0
124127
; GFX11-NEXT: {{ $}}
125128
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
126-
; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
127-
; GFX11-NEXT: $vgpr0 = COPY [[V_CEIL_F16_t16_e64_]]
129+
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
130+
; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
131+
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
132+
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
128133
;
129134
; GFX11-FAKE16-LABEL: name: fceil_fneg_s16_vv
130135
; GFX11-FAKE16: liveins: $vgpr0
131136
; GFX11-FAKE16-NEXT: {{ $}}
132137
; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
133-
; GFX11-FAKE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
134-
; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CEIL_F16_t16_e64_]]
138+
; GFX11-FAKE16-NEXT: [[V_CEIL_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_fake16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
139+
; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CEIL_F16_fake16_e64_]]
135140
%0:vgpr(s32) = COPY $vgpr0
136141
%1:vgpr(s16) = G_TRUNC %0
137142
%2:vgpr(s16) = G_FNEG %1

llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
2-
# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
3-
# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
2+
# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
3+
# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s
44

55
---
66
name: cmp_f16
@@ -62,11 +62,17 @@ body: |
6262
name: ceil_f16
6363
body: |
6464
bb.0:
65-
; GCN-LABEL: name: ceil_f16
66-
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
67-
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
68-
; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
69-
; GCN-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
65+
; REAL16-LABEL: name: ceil_f16
66+
; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
67+
; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
68+
; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
69+
; REAL16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, implicit $mode, implicit $exec
70+
;
71+
; FAKE16-LABEL: name: ceil_f16
72+
; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
73+
; FAKE16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
74+
; FAKE16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
75+
; FAKE16-NEXT: [[V_CEIL_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_fake16_e64 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
7076
%0:vgpr_32 = IMPLICIT_DEF
7177
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
7278
%2:sreg_32 = COPY %1:vgpr_32

0 commit comments

Comments
 (0)