Skip to content

Commit c897926

Browse files
committed
tmp
1 parent ae059a1 commit c897926

File tree

5 files changed

+737
-277
lines changed

5 files changed

+737
-277
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -161,18 +161,31 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
161161

162162
// TODO: Skip masking high bits if def is known boolean.
163163

164-
bool IsSGPR = TRI.isSGPRClass(SrcRC);
165-
unsigned AndOpc =
166-
IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
167-
auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
168-
.addImm(1)
169-
.addReg(SrcReg);
170-
if (IsSGPR)
171-
And.setOperandDead(3); // Dead scc
172-
173-
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
174-
.addImm(0)
175-
.addReg(MaskedReg);
164+
if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
165+
assert(Subtarget->useRealTrue16Insts());
166+
const int64_t NoMods = 0;
167+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
168+
.addImm(NoMods).addImm(1)
169+
.addImm(NoMods).addReg(SrcReg)
170+
.addImm(NoMods);
171+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
172+
.addImm(NoMods).addImm(0)
173+
.addImm(NoMods).addReg(MaskedReg)
174+
.addImm(NoMods);
175+
} else {
176+
bool IsSGPR = TRI.isSGPRClass(SrcRC);
177+
unsigned AndOpc =
178+
IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
179+
auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
180+
.addImm(1)
181+
.addReg(SrcReg);
182+
if (IsSGPR)
183+
And.setOperandDead(3); // Dead scc
184+
185+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
186+
.addImm(0)
187+
.addReg(MaskedReg);
188+
}
176189
}
177190

178191
if (!MRI->getRegClassOrNull(SrcReg))

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2030,6 +2030,8 @@ def : GCNPat <
20302030
>;
20312031

20322032
foreach fp16vt = [f16, bf16] in {
2033+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
2034+
let SubtargetPredicate = p in {
20332035
def : GCNPat <
20342036
(fabs (fp16vt VGPR_32:$src)),
20352037
(V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
@@ -2044,6 +2046,24 @@ def : GCNPat <
20442046
(fneg (fabs (fp16vt VGPR_32:$src))),
20452047
(V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
20462048
>;
2049+
}
2050+
2051+
let SubtargetPredicate = UseRealTrue16Insts in {
2052+
def : GCNPat <
2053+
(fabs (fp16vt VGPR_16:$src)),
2054+
(V_AND_B16_t16_e64 (i32 0), (i16 0x7fff), (i32 0), VGPR_16:$src)
2055+
>;
2056+
2057+
def : GCNPat <
2058+
(fneg (fp16vt VGPR_16:$src)),
2059+
(V_XOR_B16_t16_e64 (i32 0), (i16 0x8000), (i32 0), VGPR_16:$src)
2060+
>;
2061+
2062+
def : GCNPat <
2063+
(fneg (fabs (fp16vt VGPR_16:$src))),
2064+
(V_OR_B16_t16_e64 (i32 0), (i16 0x8000), (i32 0), VGPR_16:$src) // Set sign bit
2065+
>;
2066+
} // End SubtargetPredicate = UseRealTrue16Insts
20472067
} // End foreach fp16vt = ...
20482068

20492069
def : GCNPat <

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -922,18 +922,25 @@ def LDEXP_F16_VOPProfile : VOPProfile <[f16, f16, f16, untyped]> {
922922
let HasSrc1FloatMods = 0;
923923
let Src1ModSDWA = Int16SDWAInputMods;
924924
}
925-
def LDEXP_F16_VOPProfile_True16 : VOPProfile_Fake16<VOP_F16_F16_F16> {
925+
def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_F16> {
926+
let Src1RC32 = RegisterOperand<VGPR_16_Lo128>;
927+
let Src1DPP = RegisterOperand<VGPR_16_Lo128>;
928+
let Src1ModDPP = IntT16VRegInputMods<0/*IsFake16*/>;
929+
}
930+
def LDEXP_F16_VOPProfile_Fake16 : VOPProfile_Fake16<VOP_F16_F16_F16> {
926931
let Src1RC32 = RegisterOperand<VGPR_32_Lo128>;
927932
let Src1DPP = RegisterOperand<VGPR_32_Lo128>;
928-
let Src1ModDPP = IntT16VRegInputMods</* IsFake16= */ 1>;
933+
let Src1ModDPP = IntT16VRegInputMods<1/*IsFake16*/>;
929934
}
930935

931936
let isReMaterializable = 1 in {
932937
let FPDPRounding = 1 in {
933938
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in
934939
defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", LDEXP_F16_VOPProfile>;
935-
let SubtargetPredicate = HasTrue16BitInsts in
940+
let SubtargetPredicate = UseRealTrue16Insts in
936941
defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16>;
942+
let SubtargetPredicate = UseFakeTrue16Insts in
943+
defm V_LDEXP_F16_fake16 : VOP2Inst <"v_ldexp_f16_fake16", LDEXP_F16_VOPProfile_Fake16, null_frag, "v_ldexp_f16_fake16">;
937944
} // End FPDPRounding = 1
938945
// FIXME VOP3 Only instructions. NFC using VOPProfile_True16 for these until a planned change to use a new register class for VOP3 encoded True16 instructions
939946
defm V_LSHLREV_B16 : VOP2Inst_e64_t16 <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>;
@@ -968,14 +975,27 @@ class LDEXP_F16_Pat <SDPatternOperator op, VOP_Pseudo inst, VOPProfile P = inst.
968975
let OtherPredicates = [NotHasTrue16BitInsts] in
969976
def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_e64>;
970977

971-
let OtherPredicates = [HasTrue16BitInsts] in
972-
def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_t16_e64>;
978+
class LDEXP_F16_t16_Pat <SDPatternOperator op, VOP_Pseudo inst, VOPProfile P = inst.Pfl> : GCNPat <
979+
(P.DstVT (op (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
980+
(i16 (VOP3Mods0 P.Src1VT:$src1, i32:$src1_modifiers)))),
981+
(inst $src0_modifiers, $src0,
982+
$src1_modifiers, $src1,
983+
$clamp, /* clamp */
984+
$omod, /* omod */
985+
0) /* op_sel */
986+
>;
987+
988+
let OtherPredicates = [UseRealTrue16Insts] in
989+
def : LDEXP_F16_t16_Pat<any_fldexp, V_LDEXP_F16_t16_e64>;
990+
991+
let OtherPredicates = [UseFakeTrue16Insts] in
992+
def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_fake16_e64>;
973993

974994
let SubtargetPredicate = isGFX11Plus in {
975995
let isCommutable = 1 in {
976-
defm V_AND_B16_t16 : VOP2Inst_e64 <"v_and_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, and>;
977-
defm V_OR_B16_t16 : VOP2Inst_e64 <"v_or_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, or>;
978-
defm V_XOR_B16_t16 : VOP2Inst_e64 <"v_xor_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, xor>;
996+
defm V_AND_B16_t16 : VOP2Inst_e64 <"v_and_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, and>;
997+
defm V_OR_B16_t16 : VOP2Inst_e64 <"v_or_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, or>;
998+
defm V_XOR_B16_t16 : VOP2Inst_e64 <"v_xor_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, xor>;
979999
} // End isCommutable = 1
9801000
} // End SubtargetPredicate = isGFX11Plus
9811001

@@ -1714,6 +1734,7 @@ defm V_MUL_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
17141734
defm V_MUL_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
17151735
defm V_FMAC_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">;
17161736
defm V_LDEXP_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
1737+
defm V_LDEXP_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
17171738
defm V_MAX_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
17181739
defm V_MAX_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
17191740
defm V_MIN_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 71 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17180,11 +17180,17 @@ define bfloat @v_fabs_bf16(bfloat %a) {
1718017180
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
1718117181
; GFX10-NEXT: s_setpc_b64 s[30:31]
1718217182
;
17183-
; GFX11-LABEL: v_fabs_bf16:
17184-
; GFX11: ; %bb.0:
17185-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17186-
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
17187-
; GFX11-NEXT: s_setpc_b64 s[30:31]
17183+
; GFX11TRUE16-LABEL: v_fabs_bf16:
17184+
; GFX11TRUE16: ; %bb.0:
17185+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17186+
; GFX11TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
17187+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
17188+
;
17189+
; GFX11FAKE16-LABEL: v_fabs_bf16:
17190+
; GFX11FAKE16: ; %bb.0:
17191+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17192+
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
17193+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
1718817194
%op = call bfloat @llvm.fabs.bf16(bfloat %a)
1718917195
ret bfloat %op
1719017196
}
@@ -17266,11 +17272,17 @@ define bfloat @v_fneg_bf16(bfloat %a) {
1726617272
; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1726717273
; GFX10-NEXT: s_setpc_b64 s[30:31]
1726817274
;
17269-
; GFX11-LABEL: v_fneg_bf16:
17270-
; GFX11: ; %bb.0:
17271-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17272-
; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
17273-
; GFX11-NEXT: s_setpc_b64 s[30:31]
17275+
; GFX11TRUE16-LABEL: v_fneg_bf16:
17276+
; GFX11TRUE16: ; %bb.0:
17277+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17278+
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
17279+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
17280+
;
17281+
; GFX11FAKE16-LABEL: v_fneg_bf16:
17282+
; GFX11FAKE16: ; %bb.0:
17283+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17284+
; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
17285+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
1727417286
%op = fneg bfloat %a
1727517287
ret bfloat %op
1727617288
}
@@ -17365,11 +17377,17 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
1736517377
; GFX10-NEXT: v_or_b32_e32 v0, 0x8000, v0
1736617378
; GFX10-NEXT: s_setpc_b64 s[30:31]
1736717379
;
17368-
; GFX11-LABEL: v_fneg_fabs_bf16:
17369-
; GFX11: ; %bb.0:
17370-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17371-
; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
17372-
; GFX11-NEXT: s_setpc_b64 s[30:31]
17380+
; GFX11TRUE16-LABEL: v_fneg_fabs_bf16:
17381+
; GFX11TRUE16: ; %bb.0:
17382+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17383+
; GFX11TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
17384+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
17385+
;
17386+
; GFX11FAKE16-LABEL: v_fneg_fabs_bf16:
17387+
; GFX11FAKE16: ; %bb.0:
17388+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17389+
; GFX11FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
17390+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
1737317391
%fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
1737417392
%op = fneg bfloat %fabs
1737517393
ret bfloat %op
@@ -34518,15 +34536,25 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
3451834536
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
3451934537
; GFX10-NEXT: s_setpc_b64 s[30:31]
3452034538
;
34521-
; GFX11-LABEL: v_select_fneg_lhs_bf16:
34522-
; GFX11: ; %bb.0:
34523-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34524-
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
34525-
; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
34526-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34527-
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34528-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34529-
; GFX11-NEXT: s_setpc_b64 s[30:31]
34539+
; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
34540+
; GFX11TRUE16: ; %bb.0:
34541+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34542+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
34543+
; GFX11TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v1.l
34544+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34545+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34546+
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34547+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34548+
;
34549+
; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
34550+
; GFX11FAKE16: ; %bb.0:
34551+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34552+
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
34553+
; GFX11FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
34554+
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34555+
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34556+
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34557+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
3453034558
%neg.a = fneg bfloat %a
3453134559
%op = select i1 %cond, bfloat %neg.a, bfloat %b
3453234560
ret bfloat %op
@@ -34582,15 +34610,25 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
3458234610
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
3458334611
; GFX10-NEXT: s_setpc_b64 s[30:31]
3458434612
;
34585-
; GFX11-LABEL: v_select_fneg_rhs_bf16:
34586-
; GFX11: ; %bb.0:
34587-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34588-
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
34589-
; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2
34590-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34591-
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34592-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34593-
; GFX11-NEXT: s_setpc_b64 s[30:31]
34613+
; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
34614+
; GFX11TRUE16: ; %bb.0:
34615+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34616+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
34617+
; GFX11TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v2.l
34618+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34619+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34620+
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34621+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
34622+
;
34623+
; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
34624+
; GFX11FAKE16: ; %bb.0:
34625+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34626+
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
34627+
; GFX11FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
34628+
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
34629+
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
34630+
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
34631+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
3459434632
%neg.b = fneg bfloat %b
3459534633
%op = select i1 %cond, bfloat %a, bfloat %neg.b
3459634634
ret bfloat %op

0 commit comments

Comments
 (0)