Skip to content

Commit ecfdc23

Browse files
authored
[AMDGPU] Select gfx1150 SALU Float instructions (llvm#66885)
1 parent 53a2923 commit ecfdc23

23 files changed

+4282
-157
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -663,6 +663,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
663663
case ISD::BRCOND:
664664
SelectBRCOND(N);
665665
return;
666+
case ISD::FP_EXTEND:
667+
SelectFP_EXTEND(N);
668+
return;
666669
case AMDGPUISD::CVT_PKRTZ_F16_F32:
667670
case AMDGPUISD::CVT_PKNORM_I16_F32:
668671
case AMDGPUISD::CVT_PKNORM_U16_F32:
@@ -2303,6 +2306,22 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
23032306
VCC.getValue(0));
23042307
}
23052308

2309+
// Custom selection hook for ISD::FP_EXTEND nodes.
//
// Selects N directly to S_CVT_HI_F32_F16 when all of the following hold:
//   - the subtarget reports hasSALUFloatInsts(),
//   - the result type of N is f32,
//   - N is not divergent,
//   - the operand is f16 and isExtractHiElt() accepts it.
// Every other case is handed to SelectCode(N).
void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2310+
if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2311+
!N->isDivergent()) {
2312+
SDValue Src = N->getOperand(0);
2313+
if (Src.getValueType() == MVT::f16) {
2314+
// Src is passed as both the input and the output argument, so on success
2315+
// it holds whatever value isExtractHiElt reports.
2316+
// NOTE(review): presumably that is the wider value whose high 16 bits
2317+
// were extracted -- confirm against isExtractHiElt's definition.
if (isExtractHiElt(Src, Src)) {
2315+
// Morph N in place into the machine node, keeping N's own VT list and
// using the rewritten Src as its only operand.
CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2316+
{Src});
2317+
return;
2318+
}
2319+
}
2320+
}
2321+
2322+
// No special case applied: use the default SelectCode path.
SelectCode(N);
2323+
}
2324+
23062325
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
23072326
// The address is assumed to be uniform, so if it ends up in a VGPR, it will
23082327
// be copied to an SGPR with readfirstlane.

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
273273
bool isCBranchSCC(const SDNode *N) const;
274274
void SelectBRCOND(SDNode *N);
275275
void SelectFMAD_FMA(SDNode *N);
276+
void SelectFP_EXTEND(SDNode *N);
276277
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
277278
void SelectDSBvhStackIntrinsic(SDNode *N);
278279
void SelectDS_GWS(SDNode *N, unsigned IntrID);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 139 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1211,36 +1211,104 @@ int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
12111211
}
12121212
}
12131213

1214-
if (Size != 32)
1215-
return -1;
1214+
if (Size == 32) {
1215+
switch (P) {
1216+
case CmpInst::ICMP_NE:
1217+
return AMDGPU::S_CMP_LG_U32;
1218+
case CmpInst::ICMP_EQ:
1219+
return AMDGPU::S_CMP_EQ_U32;
1220+
case CmpInst::ICMP_SGT:
1221+
return AMDGPU::S_CMP_GT_I32;
1222+
case CmpInst::ICMP_SGE:
1223+
return AMDGPU::S_CMP_GE_I32;
1224+
case CmpInst::ICMP_SLT:
1225+
return AMDGPU::S_CMP_LT_I32;
1226+
case CmpInst::ICMP_SLE:
1227+
return AMDGPU::S_CMP_LE_I32;
1228+
case CmpInst::ICMP_UGT:
1229+
return AMDGPU::S_CMP_GT_U32;
1230+
case CmpInst::ICMP_UGE:
1231+
return AMDGPU::S_CMP_GE_U32;
1232+
case CmpInst::ICMP_ULT:
1233+
return AMDGPU::S_CMP_LT_U32;
1234+
case CmpInst::ICMP_ULE:
1235+
return AMDGPU::S_CMP_LE_U32;
1236+
case CmpInst::FCMP_OEQ:
1237+
return AMDGPU::S_CMP_EQ_F32;
1238+
case CmpInst::FCMP_OGT:
1239+
return AMDGPU::S_CMP_GT_F32;
1240+
case CmpInst::FCMP_OGE:
1241+
return AMDGPU::S_CMP_GE_F32;
1242+
case CmpInst::FCMP_OLT:
1243+
return AMDGPU::S_CMP_LT_F32;
1244+
case CmpInst::FCMP_OLE:
1245+
return AMDGPU::S_CMP_LE_F32;
1246+
case CmpInst::FCMP_ONE:
1247+
return AMDGPU::S_CMP_LG_F32;
1248+
case CmpInst::FCMP_ORD:
1249+
return AMDGPU::S_CMP_O_F32;
1250+
case CmpInst::FCMP_UNO:
1251+
return AMDGPU::S_CMP_U_F32;
1252+
case CmpInst::FCMP_UEQ:
1253+
return AMDGPU::S_CMP_NLG_F32;
1254+
case CmpInst::FCMP_UGT:
1255+
return AMDGPU::S_CMP_NLE_F32;
1256+
case CmpInst::FCMP_UGE:
1257+
return AMDGPU::S_CMP_NLT_F32;
1258+
case CmpInst::FCMP_ULT:
1259+
return AMDGPU::S_CMP_NGE_F32;
1260+
case CmpInst::FCMP_ULE:
1261+
return AMDGPU::S_CMP_NGT_F32;
1262+
case CmpInst::FCMP_UNE:
1263+
return AMDGPU::S_CMP_NEQ_F32;
1264+
default:
1265+
llvm_unreachable("Unknown condition code!");
1266+
}
1267+
}
12161268

1217-
switch (P) {
1218-
case CmpInst::ICMP_NE:
1219-
return AMDGPU::S_CMP_LG_U32;
1220-
case CmpInst::ICMP_EQ:
1221-
return AMDGPU::S_CMP_EQ_U32;
1222-
case CmpInst::ICMP_SGT:
1223-
return AMDGPU::S_CMP_GT_I32;
1224-
case CmpInst::ICMP_SGE:
1225-
return AMDGPU::S_CMP_GE_I32;
1226-
case CmpInst::ICMP_SLT:
1227-
return AMDGPU::S_CMP_LT_I32;
1228-
case CmpInst::ICMP_SLE:
1229-
return AMDGPU::S_CMP_LE_I32;
1230-
case CmpInst::ICMP_UGT:
1231-
return AMDGPU::S_CMP_GT_U32;
1232-
case CmpInst::ICMP_UGE:
1233-
return AMDGPU::S_CMP_GE_U32;
1234-
case CmpInst::ICMP_ULT:
1235-
return AMDGPU::S_CMP_LT_U32;
1236-
case CmpInst::ICMP_ULE:
1237-
return AMDGPU::S_CMP_LE_U32;
1238-
default:
1239-
llvm_unreachable("Unknown condition code!");
1269+
if (Size == 16) {
1270+
if (!STI.hasSALUFloatInsts())
1271+
return -1;
1272+
1273+
switch (P) {
1274+
case CmpInst::FCMP_OEQ:
1275+
return AMDGPU::S_CMP_EQ_F16;
1276+
case CmpInst::FCMP_OGT:
1277+
return AMDGPU::S_CMP_GT_F16;
1278+
case CmpInst::FCMP_OGE:
1279+
return AMDGPU::S_CMP_GE_F16;
1280+
case CmpInst::FCMP_OLT:
1281+
return AMDGPU::S_CMP_LT_F16;
1282+
case CmpInst::FCMP_OLE:
1283+
return AMDGPU::S_CMP_LE_F16;
1284+
case CmpInst::FCMP_ONE:
1285+
return AMDGPU::S_CMP_LG_F16;
1286+
case CmpInst::FCMP_ORD:
1287+
return AMDGPU::S_CMP_O_F16;
1288+
case CmpInst::FCMP_UNO:
1289+
return AMDGPU::S_CMP_U_F16;
1290+
case CmpInst::FCMP_UEQ:
1291+
return AMDGPU::S_CMP_NLG_F16;
1292+
case CmpInst::FCMP_UGT:
1293+
return AMDGPU::S_CMP_NLE_F16;
1294+
case CmpInst::FCMP_UGE:
1295+
return AMDGPU::S_CMP_NLT_F16;
1296+
case CmpInst::FCMP_ULT:
1297+
return AMDGPU::S_CMP_NGE_F16;
1298+
case CmpInst::FCMP_ULE:
1299+
return AMDGPU::S_CMP_NGT_F16;
1300+
case CmpInst::FCMP_UNE:
1301+
return AMDGPU::S_CMP_NEQ_F16;
1302+
default:
1303+
llvm_unreachable("Unknown condition code!");
1304+
}
12401305
}
1306+
1307+
return -1;
12411308
}
12421309

1243-
bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1310+
bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1311+
12441312
MachineBasicBlock *BB = I.getParent();
12451313
const DebugLoc &DL = I.getDebugLoc();
12461314

@@ -1266,6 +1334,9 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
12661334
return Ret;
12671335
}
12681336

1337+
if (I.getOpcode() == AMDGPU::G_FCMP)
1338+
return false;
1339+
12691340
int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
12701341
if (Opcode == -1)
12711342
return false;
@@ -2439,6 +2510,42 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
24392510
return false;
24402511
}
24412512

2513+
// Returns true if register In matches the pattern
//   m_GTrunc(m_GLShr(X, 16))
// i.e. a truncate of a logical shift right by the constant 16.
//
// On a match, Out is set to X (the shift's source register). On a mismatch,
// Out is left untouched and false is returned. In and Out may refer to the
// same variable, since Out is only written after matching has finished.
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2514+
Register &Out) {
2515+
// Despite the "LShl" in its name, this binds the source operand of the
// m_GLShr (shift right) matcher below.
Register LShlSrc;
2516+
if (mi_match(In, MRI,
2517+
m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2518+
Out = LShlSrc;
2519+
return true;
2520+
}
2521+
return false;
2522+
}
2523+
2524+
// Manual selection for G_FPEXT.
//
// Handles only one case: a 32-bit scalar destination on the SGPR register
// bank, with a 16-bit scalar source that isExtractHiElt() recognizes, on a
// subtarget that reports hasSALUFloatInsts(). That case is selected to
// S_CVT_HI_F32_F16.
//
// Returns false when the case does not apply, leaving I unmodified. Otherwise
// returns the result of constraining Dst to SReg_32.
bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2525+
if (!Subtarget->hasSALUFloatInsts())
2526+
return false;
2527+
2528+
Register Dst = I.getOperand(0).getReg();
2529+
const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2530+
// Only destinations assigned to the SGPR bank are handled here.
if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2531+
return false;
2532+
2533+
Register Src = I.getOperand(1).getReg();
2534+
2535+
if (MRI->getType(Dst) == LLT::scalar(32) &&
2536+
MRI->getType(Src) == LLT::scalar(16)) {
2537+
// Src is both input and output: on a match it is replaced by the source
// register of the matched shift.
if (isExtractHiElt(*MRI, Src, Src)) {
2538+
MachineBasicBlock *BB = I.getParent();
2539+
// Insert the new instruction before I, defining Dst and using the
// rewritten Src.
BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2540+
.addUse(Src);
2541+
I.eraseFromParent();
2542+
// NOTE(review): only Dst is constrained in this function; Src's register
// class is not constrained here -- confirm that is handled elsewhere.
return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2543+
}
2544+
}
2545+
2546+
return false;
2547+
}
2548+
24422549
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
24432550
MachineBasicBlock *BB = I.getParent();
24442551
MachineOperand &ImmOp = I.getOperand(1);
@@ -3471,7 +3578,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
34713578
case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
34723579
return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
34733580
case TargetOpcode::G_ICMP:
3474-
if (selectG_ICMP(I))
3581+
case TargetOpcode::G_FCMP:
3582+
if (selectG_ICMP_or_FCMP(I))
34753583
return true;
34763584
return selectImpl(I, *CoverageInfo);
34773585
case TargetOpcode::G_LOAD:
@@ -3508,6 +3616,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
35083616
selectImpl(I, *CoverageInfo))
35093617
return true;
35103618
return selectG_SZA_EXT(I);
3619+
case TargetOpcode::G_FPEXT:
3620+
if (selectG_FPEXT(I))
3621+
return true;
3622+
return selectImpl(I, *CoverageInfo);
35113623
case TargetOpcode::G_BRCOND:
35123624
return selectG_BRCOND(I);
35133625
case TargetOpcode::G_GLOBAL_VALUE:

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
9090
bool selectPHI(MachineInstr &I) const;
9191
bool selectG_TRUNC(MachineInstr &I) const;
9292
bool selectG_SZA_EXT(MachineInstr &I) const;
93+
bool selectG_FPEXT(MachineInstr &I) const;
9394
bool selectG_CONSTANT(MachineInstr &I) const;
9495
bool selectG_FNEG(MachineInstr &I) const;
9596
bool selectG_FABS(MachineInstr &I) const;
@@ -129,7 +130,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
129130
const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
130131
bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const;
131132
int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const;
132-
bool selectG_ICMP(MachineInstr &I) const;
133+
bool selectG_ICMP_or_FCMP(MachineInstr &I) const;
133134
bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
134135
void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
135136
SmallVectorImpl<GEPInfo> &AddrInfo) const;

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1132,8 +1132,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
11321132
.scalarize(0)
11331133
.legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
11341134

1135-
getActionDefinitionsBuilder(G_FCMP)
1136-
.legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
1135+
auto &FCmpBuilder =
1136+
getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1137+
{S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1138+
1139+
if (ST.hasSALUFloatInsts())
1140+
FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1141+
1142+
FCmpBuilder
11371143
.widenScalarToNextPow2(1)
11381144
.clampScalar(1, S32, S64)
11391145
.scalarize(0);

0 commit comments

Comments
 (0)