Skip to content

Commit 084f1c2

Browse files
authored
[AMDGPU][True16] Support V_CEIL_F16. (#73108)
As not all fake instructions have their real counterparts implemented yet, we specify no AssemblerPredicate for UseFakeTrue16Insts to allow both fake and real True16 instructions in assembler and disassembler tests in the -mattr=+real-true16 mode during the transition period. Source DPP and destination VOPDstOperand_t16 operands are still not supported and will be addressed separately.
1 parent f443fbc commit 084f1c2

19 files changed

+526
-162
lines changed

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,10 +1342,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
13421342
unsigned ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
13431343
unsigned &RegWidth, SmallVectorImpl<AsmToken> &Tokens);
13441344
bool ParseRegRange(unsigned& Num, unsigned& Width);
1345-
unsigned getRegularReg(RegisterKind RegKind,
1346-
unsigned RegNum,
1347-
unsigned RegWidth,
1348-
SMLoc Loc);
1345+
unsigned getRegularReg(RegisterKind RegKind, unsigned RegNum, unsigned SubReg,
1346+
unsigned RegWidth, SMLoc Loc);
13491347

13501348
bool isRegister();
13511349
bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const;
@@ -2616,6 +2614,8 @@ AMDGPUAsmParser::isRegister(const AsmToken &Token,
26162614
StringRef RegName = Reg->Name;
26172615
StringRef RegSuffix = Str.substr(RegName.size());
26182616
if (!RegSuffix.empty()) {
2617+
RegSuffix.consume_back(".l");
2618+
RegSuffix.consume_back(".h");
26192619
unsigned Num;
26202620
// A single register with an index: rXX
26212621
if (getRegNum(RegSuffix, Num))
@@ -2636,12 +2636,9 @@ AMDGPUAsmParser::isRegister()
26362636
return isRegister(getToken(), peekToken());
26372637
}
26382638

2639-
unsigned
2640-
AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
2641-
unsigned RegNum,
2642-
unsigned RegWidth,
2643-
SMLoc Loc) {
2644-
2639+
unsigned AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum,
2640+
unsigned SubReg, unsigned RegWidth,
2641+
SMLoc Loc) {
26452642
assert(isRegularReg(RegKind));
26462643

26472644
unsigned AlignSize = 1;
@@ -2670,7 +2667,17 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
26702667
return AMDGPU::NoRegister;
26712668
}
26722669

2673-
return RC.getRegister(RegIdx);
2670+
unsigned Reg = RC.getRegister(RegIdx);
2671+
2672+
if (SubReg) {
2673+
Reg = TRI->getSubReg(Reg, SubReg);
2674+
2675+
// Currently all regular registers have their .l and .h subregisters, so
2676+
// we should never need to generate an error here.
2677+
assert(Reg && "Invalid subregister!");
2678+
}
2679+
2680+
return Reg;
26742681
}
26752682

26762683
bool AMDGPUAsmParser::ParseRegRange(unsigned &Num, unsigned &RegWidth) {
@@ -2748,7 +2755,17 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
27482755

27492756
RegKind = RI->Kind;
27502757
StringRef RegSuffix = RegName.substr(RI->Name.size());
2758+
unsigned SubReg = NoSubRegister;
27512759
if (!RegSuffix.empty()) {
2760+
// We don't know the opcode till we are done parsing, so we don't know if
2761+
// registers should be 16 or 32 bit. It is therefore mandatory to put .l or
2762+
// .h to correctly specify 16 bit registers. We also can't determine class
2763+
// VGPR_16_Lo128 or VGPR_16, so always parse them as VGPR_16.
2764+
if (RegSuffix.consume_back(".l"))
2765+
SubReg = AMDGPU::lo16;
2766+
else if (RegSuffix.consume_back(".h"))
2767+
SubReg = AMDGPU::hi16;
2768+
27522769
// Single 32-bit register: vXX.
27532770
if (!getRegNum(RegSuffix, RegNum)) {
27542771
Error(Loc, "invalid register index");
@@ -2761,7 +2778,7 @@ unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
27612778
return AMDGPU::NoRegister;
27622779
}
27632780

2764-
return getRegularReg(RegKind, RegNum, RegWidth, Loc);
2781+
return getRegularReg(RegKind, RegNum, SubReg, RegWidth, Loc);
27652782
}
27662783

27672784
unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
@@ -2813,7 +2830,7 @@ unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
28132830
}
28142831

28152832
if (isRegularReg(RegKind))
2816-
Reg = getRegularReg(RegKind, RegNum, RegWidth, ListLoc);
2833+
Reg = getRegularReg(RegKind, RegNum, NoSubRegister, RegWidth, ListLoc);
28172834

28182835
return Reg;
28192836
}

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5276,10 +5276,15 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
52765276
case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
52775277
case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
52785278
case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5279-
case AMDGPU::S_CEIL_F16: return AMDGPU::V_CEIL_F16_t16_e64;
5280-
case AMDGPU::S_FLOOR_F16: return AMDGPU::V_FLOOR_F16_t16_e64;
5281-
case AMDGPU::S_TRUNC_F16: return AMDGPU::V_TRUNC_F16_t16_e64;
5282-
case AMDGPU::S_RNDNE_F16: return AMDGPU::V_RNDNE_F16_t16_e64;
5279+
case AMDGPU::S_CEIL_F16:
5280+
return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5281+
: AMDGPU::V_CEIL_F16_fake16_e64;
5282+
case AMDGPU::S_FLOOR_F16:
5283+
return AMDGPU::V_FLOOR_F16_fake16_e64;
5284+
case AMDGPU::S_TRUNC_F16:
5285+
return AMDGPU::V_TRUNC_F16_fake16_e64;
5286+
case AMDGPU::S_RNDNE_F16:
5287+
return AMDGPU::V_RNDNE_F16_fake16_e64;
52835288
case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
52845289
case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
52855290
case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
@@ -5328,15 +5333,15 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
53285333
case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
53295334
case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
53305335
case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5331-
case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_t16_e64;
5336+
case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
53325337
case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5333-
case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_t16_e64;
5338+
case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
53345339
case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5335-
case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_t16_e64;
5340+
case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
53365341
case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5337-
case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_t16_e64;
5342+
case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
53385343
case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5339-
case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_t16_e64;
5344+
case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
53405345
}
53415346
llvm_unreachable(
53425347
"Unexpected scalar opcode without corresponding vector one!");
@@ -7266,8 +7271,14 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
72667271
if (AMDGPU::getNamedOperandIdx(NewOpcode,
72677272
AMDGPU::OpName::src0_modifiers) >= 0)
72687273
NewInstr.addImm(0);
7269-
if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0) >= 0)
7270-
NewInstr->addOperand(Inst.getOperand(1));
7274+
if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7275+
MachineOperand Src = Inst.getOperand(1);
7276+
if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7277+
Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7278+
NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7279+
else
7280+
NewInstr->addOperand(Src);
7281+
}
72717282

72727283
if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
72737284
// We are converting these to a BFE, so we need to add the missing

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
7474

7575
// copy relevant pseudo op flags
7676
let SubtargetPredicate = ps.SubtargetPredicate;
77+
let OtherPredicates = ps.OtherPredicates;
7778
let AsmMatchConverter = ps.AsmMatchConverter;
7879
let AsmVariantName = ps.AsmVariantName;
7980
let Constraints = ps.Constraints;
@@ -157,8 +158,11 @@ multiclass VOP1Inst_t16<string opName,
157158
let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts] in {
158159
defm NAME : VOP1Inst<opName, P, node>;
159160
}
160-
let OtherPredicates = [HasTrue16BitInsts] in {
161-
defm _t16 : VOP1Inst<opName#"_t16", VOPProfile_Fake16<P>, node>;
161+
let OtherPredicates = [UseRealTrue16Insts] in {
162+
defm _t16 : VOP1Inst<opName#"_t16", VOPProfile_True16<P>, node>;
163+
}
164+
let OtherPredicates = [UseFakeTrue16Insts] in {
165+
defm _fake16 : VOP1Inst<opName#"_fake16", VOPProfile_Fake16<P>, node>;
162166
}
163167
}
164168

@@ -679,6 +683,7 @@ class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP1
679683
let SchedRW = ps.SchedRW;
680684
let Uses = ps.Uses;
681685
let TRANS = ps.TRANS;
686+
let OtherPredicates = ps.OtherPredicates;
682687

683688
bits<8> vdst;
684689
let Inst{8-0} = 0xfa;
@@ -707,6 +712,7 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
707712
let Defs = ps.Defs;
708713
let SchedRW = ps.SchedRW;
709714
let Uses = ps.Uses;
715+
let OtherPredicates = ps.OtherPredicates;
710716

711717
bits<8> vdst;
712718
let Inst{8-0} = fi;
@@ -742,7 +748,9 @@ multiclass VOP1_Real_e32<GFXGen Gen, bits<9> op, string opName = NAME> {
742748
multiclass VOP1_Real_e32_with_name<GFXGen Gen, bits<9> op, string opName,
743749
string asmName> {
744750
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
745-
let AsmString = asmName # ps.AsmOperands in {
751+
let AsmString = asmName # ps.AsmOperands,
752+
DecoderNamespace = Gen.DecoderNamespace #
753+
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
746754
defm NAME : VOP1_Real_e32<Gen, op, opName>;
747755
}
748756
}
@@ -761,7 +769,9 @@ multiclass VOP1_Real_dpp<GFXGen Gen, bits<9> op, string opName = NAME> {
761769
multiclass VOP1_Real_dpp_with_name<GFXGen Gen, bits<9> op, string opName,
762770
string asmName> {
763771
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
764-
let AsmString = asmName # ps.Pfl.AsmDPP16 in {
772+
let AsmString = asmName # ps.Pfl.AsmDPP16,
773+
DecoderNamespace = "DPP" # Gen.DecoderNamespace #
774+
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
765775
defm NAME : VOP1_Real_dpp<Gen, op, opName>;
766776
}
767777
}
@@ -774,7 +784,9 @@ multiclass VOP1_Real_dpp8<GFXGen Gen, bits<9> op, string opName = NAME> {
774784
multiclass VOP1_Real_dpp8_with_name<GFXGen Gen, bits<9> op, string opName,
775785
string asmName> {
776786
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
777-
let AsmString = asmName # ps.Pfl.AsmDPP8 in {
787+
let AsmString = asmName # ps.Pfl.AsmDPP8,
788+
DecoderNamespace = "DPP8" # Gen.DecoderNamespace #
789+
!if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in {
778790
defm NAME : VOP1_Real_dpp8<Gen, op, opName>;
779791
}
780792
}
@@ -854,29 +866,30 @@ defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b,
854866
"V_FFBH_I32", "v_cls_i32">;
855867
defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>;
856868
defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">;
857-
defm V_NOT_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">;
858-
defm V_CVT_I32_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">;
859-
defm V_CVT_U32_U16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">;
869+
defm V_NOT_B16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">;
870+
defm V_CVT_I32_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">;
871+
defm V_CVT_U32_U16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">;
860872

861873
defm V_CVT_F16_U16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x050, "v_cvt_f16_u16">;
862874
defm V_CVT_F16_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x051, "v_cvt_f16_i16">;
863875
defm V_CVT_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x052, "v_cvt_u16_f16">;
864876
defm V_CVT_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x053, "v_cvt_i16_f16">;
865-
defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">;
866-
defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">;
867-
defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">;
868-
defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">;
869-
defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
870-
defm V_FREXP_MANT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">;
877+
defm V_RCP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">;
878+
defm V_SQRT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">;
879+
defm V_RSQ_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">;
880+
defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">;
881+
defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
882+
defm V_FREXP_MANT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">;
871883
defm V_FREXP_EXP_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">;
872-
defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
884+
defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
873885
defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
874-
defm V_TRUNC_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05d, "v_trunc_f16">;
875-
defm V_RNDNE_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">;
876-
defm V_FRACT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">;
877-
defm V_SIN_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16">;
878-
defm V_COS_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">;
879-
defm V_SAT_PK_U8_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">;
886+
defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
887+
defm V_TRUNC_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05d, "v_trunc_f16">;
888+
defm V_RNDNE_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">;
889+
defm V_FRACT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">;
890+
defm V_SIN_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16">;
891+
defm V_COS_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">;
892+
defm V_SAT_PK_U8_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">;
880893
defm V_CVT_NORM_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">;
881894
defm V_CVT_NORM_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x064, "v_cvt_norm_u16_f16">;
882895

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir

Lines changed: 68 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2-
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
2+
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
3+
# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s
4+
# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s
35

46
---
57
name: fceil_s16_ss
@@ -36,12 +38,28 @@ body: |
3638
bb.0:
3739
liveins: $vgpr0
3840
39-
; GCN-LABEL: name: fceil_s16_vv
40-
; GCN: liveins: $vgpr0
41-
; GCN-NEXT: {{ $}}
42-
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
43-
; GCN-NEXT: %2:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
44-
; GCN-NEXT: $vgpr0 = COPY %2
41+
; GFX8-LABEL: name: fceil_s16_vv
42+
; GFX8: liveins: $vgpr0
43+
; GFX8-NEXT: {{ $}}
44+
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
45+
; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
46+
; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]]
47+
;
48+
; GFX11-LABEL: name: fceil_s16_vv
49+
; GFX11: liveins: $vgpr0
50+
; GFX11-NEXT: {{ $}}
51+
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
52+
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
53+
; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
54+
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
55+
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
56+
;
57+
; GFX11-FAKE16-LABEL: name: fceil_s16_vv
58+
; GFX11-FAKE16: liveins: $vgpr0
59+
; GFX11-FAKE16-NEXT: {{ $}}
60+
; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
61+
; GFX11-FAKE16-NEXT: [[V_CEIL_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
62+
; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CEIL_F16_fake16_e64_]]
4563
%0:vgpr(s32) = COPY $vgpr0
4664
%1:vgpr(s16) = G_TRUNC %0
4765
%2:vgpr(s16) = G_FCEIL %1
@@ -59,12 +77,27 @@ body: |
5977
bb.0:
6078
liveins: $sgpr0
6179
62-
; GCN-LABEL: name: fceil_s16_vs
63-
; GCN: liveins: $sgpr0
64-
; GCN-NEXT: {{ $}}
65-
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
66-
; GCN-NEXT: %2:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
67-
; GCN-NEXT: $vgpr0 = COPY %2
80+
; GFX8-LABEL: name: fceil_s16_vs
81+
; GFX8: liveins: $sgpr0
82+
; GFX8-NEXT: {{ $}}
83+
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
84+
; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
85+
; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]]
86+
;
87+
; GFX11-LABEL: name: fceil_s16_vs
88+
; GFX11: liveins: $sgpr0
89+
; GFX11-NEXT: {{ $}}
90+
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
91+
; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
92+
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
93+
; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
94+
;
95+
; GFX11-FAKE16-LABEL: name: fceil_s16_vs
96+
; GFX11-FAKE16: liveins: $sgpr0
97+
; GFX11-FAKE16-NEXT: {{ $}}
98+
; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
99+
; GFX11-FAKE16-NEXT: [[V_CEIL_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
100+
; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CEIL_F16_fake16_e64_]]
68101
%0:sgpr(s32) = COPY $sgpr0
69102
%1:sgpr(s16) = G_TRUNC %0
70103
%2:vgpr(s16) = G_FCEIL %1
@@ -82,12 +115,28 @@ body: |
82115
bb.0:
83116
liveins: $vgpr0
84117
85-
; GCN-LABEL: name: fceil_fneg_s16_vv
86-
; GCN: liveins: $vgpr0
87-
; GCN-NEXT: {{ $}}
88-
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
89-
; GCN-NEXT: %3:vgpr_32 = nofpexcept V_CEIL_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
90-
; GCN-NEXT: $vgpr0 = COPY %3
118+
; GFX8-LABEL: name: fceil_fneg_s16_vv
119+
; GFX8: liveins: $vgpr0
120+
; GFX8-NEXT: {{ $}}
121+
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
122+
; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
123+
; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]]
124+
;
125+
; GFX11-LABEL: name: fceil_fneg_s16_vv
126+
; GFX11: liveins: $vgpr0
127+
; GFX11-NEXT: {{ $}}
128+
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
129+
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
130+
; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
131+
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
132+
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
133+
;
134+
; GFX11-FAKE16-LABEL: name: fceil_fneg_s16_vv
135+
; GFX11-FAKE16: liveins: $vgpr0
136+
; GFX11-FAKE16-NEXT: {{ $}}
137+
; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
138+
; GFX11-FAKE16-NEXT: [[V_CEIL_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_fake16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec
139+
; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CEIL_F16_fake16_e64_]]
91140
%0:vgpr(s32) = COPY $vgpr0
92141
%1:vgpr(s16) = G_TRUNC %0
93142
%2:vgpr(s16) = G_FNEG %1

0 commit comments

Comments
 (0)