Skip to content

Commit 85ecbf7

Browse files
committed
AMDGPU: Custom lower vector fptrunc of f32 -> f16
GFX950+ supports v_cvt_pk_f16_f32. However, the current implementation of vector fptrunc lowering fully scalarizes the vector, and the scalar conversions may not always be combined to generate the packed one. We made v2f32 -> v2f16 legal in llvm#139956. This work is an extension to handle wider vectors. Instead of full scalarization, we split the vector into packs (v2f32 -> v2f16) to ensure the packed conversion can always be generated.
1 parent 12600eb commit 85ecbf7

File tree

5 files changed

+172
-14
lines changed

5 files changed

+172
-14
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,9 +1061,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
10611061
}
10621062

10631063
auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1064-
if (ST.hasCvtPkF16F32Inst())
1065-
FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}});
1066-
else
1064+
if (ST.hasCvtPkF16F32Inst()) {
1065+
FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1066+
.customFor({{V4S16, V4S32}, {V8S16, V8S32}});
1067+
} else
10671068
FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
10681069
FPTruncActions.scalarize(0).lower();
10691070

@@ -2163,6 +2164,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
21632164
case TargetOpcode::G_FMINNUM_IEEE:
21642165
case TargetOpcode::G_FMAXNUM_IEEE:
21652166
return legalizeMinNumMaxNum(Helper, MI);
2167+
case TargetOpcode::G_FPTRUNC:
2168+
return legalizeFPTrunc(Helper, MI, MRI);
21662169
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
21672170
return legalizeExtractVectorElt(MI, MRI, B);
21682171
case TargetOpcode::G_INSERT_VECTOR_ELT:
@@ -2749,6 +2752,20 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
27492752
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
27502753
}
27512754

2755+
/// Custom legalization of a vector G_FPTRUNC.
///
/// The instruction's result (operand 0) must be a vector with more than two
/// elements whose element type is S16 (presumably the 16-bit scalar LLT used
/// throughout this file). The instruction is broken into pieces producing
/// two-element vectors so that each piece can be handled as a packed
/// conversion.
///
/// \returns true iff the helper reports the instruction as Legalized.
bool AMDGPULegalizerInfo::legalizeFPTrunc(LegalizerHelper &Helper,
                                          MachineInstr &MI,
                                          MachineRegisterInfo &MRI) const {
  // Only the result type matters here; look it up straight from operand 0.
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  assert(DstTy.isVector() && DstTy.getNumElements() > 2);

  LLT EltTy = DstTy.getElementType();
  assert(EltTy == S16 && "Only handle vectors of half");

  // Narrow type index 0 down to two-element packs of the same element type.
  const LLT PackTy = LLT::fixed_vector(2, EltTy);
  return Helper.fewerElementsVector(MI, 0, PackTy) ==
         LegalizerHelper::Legalized;
}
2768+
27522769
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
27532770
MachineInstr &MI, MachineRegisterInfo &MRI,
27542771
MachineIRBuilder &B) const {

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
5656
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI,
5757
MachineIRBuilder &B, bool Signed) const;
5858
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const;
59+
bool legalizeFPTrunc(LegalizerHelper &Helper, MachineInstr &MI,
60+
MachineRegisterInfo &MRI) const;
5961
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
6062
MachineIRBuilder &B) const;
6163
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -919,8 +919,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
919919
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
920920
}
921921

922-
if (Subtarget->hasCvtPkF16F32Inst())
923-
setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
922+
if (Subtarget->hasCvtPkF16F32Inst()) {
923+
setOperationAction(ISD::FP_ROUND, {MVT::v2f16, MVT::v4f16, MVT::v8f16},
924+
Custom);
925+
}
924926

925927
setTargetDAGCombine({ISD::ADD,
926928
ISD::UADDO_CARRY,
@@ -6900,14 +6902,44 @@ SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
69006902
DAG.getTargetConstant(0, DL, MVT::i32));
69016903
}
69026904

6905+
/// Split a vector rounding node into two-element ("packed") pieces.
///
/// \p Op is a two-operand node whose result is a vector with an even number
/// of elements; operand 0 is the source vector and operand 1 is forwarded
/// unchanged to every piece (for ISD::FP_ROUND this is the truncation flag).
/// The source is sliced into consecutive two-element subvectors, the same
/// opcode is applied to each slice, and the partial results are concatenated
/// back into a value of the original result type.
///
/// \returns \p Op itself when the result already has exactly two elements,
/// otherwise the CONCAT_VECTORS of the per-pack results.
SDValue SITargetLowering::SplitFP_ROUNDVectorToPacks(SDValue Op,
                                                     SelectionDAG &DAG) const {
  EVT DstVT = Op.getValueType();
  unsigned NumElts = DstVT.getVectorNumElements();
  assert(NumElts % 2 == 0 && "Only handle vectors of even number of elements");
  if (NumElts == 2) // Already packed.
    return Op;

  unsigned Opc = Op.getOpcode();
  SDValue Src = Op.getOperand(0);
  // Forward the original node's second operand instead of synthesizing a
  // fresh zero constant, so whatever the incoming node recorded there is
  // preserved on every pack.
  SDValue Flags = Op.getOperand(1);

  LLVMContext &Context = *DAG.getContext();
  EVT SrcPkVT =
      EVT::getVectorVT(Context, Src.getValueType().getScalarType(), 2);
  EVT DstPkVT = EVT::getVectorVT(Context, DstVT.getScalarType(), 2);

  SDLoc DL(Op);
  SmallVector<SDValue, 16> Packs;
  for (unsigned Index = 0; Index < NumElts; Index += 2) {
    // Use the target's vector index type rather than hard-coding i32.
    SDValue PkSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SrcPkVT, Src,
                                DAG.getVectorIdxConstant(Index, DL));
    Packs.push_back(DAG.getNode(Opc, DL, DstPkVT, PkSrc, Flags));
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Packs);
}
6932+
69036933
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
69046934
SDValue Src = Op.getOperand(0);
69056935
EVT SrcVT = Src.getValueType();
69066936
EVT DstVT = Op.getValueType();
69076937

6908-
if (DstVT == MVT::v2f16) {
6938+
if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
69096939
assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
6910-
return SrcVT == MVT::v2f32 ? Op : SDValue();
6940+
if (SrcVT.getScalarType() != MVT::f32)
6941+
return SDValue();
6942+
return SplitFP_ROUNDVectorToPacks(Op, DAG);
69116943
}
69126944

69136945
if (SrcVT.getScalarType() != MVT::f64)

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
145145

146146
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
147147
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
148+
SDValue SplitFP_ROUNDVectorToPacks(SDValue Op, SelectionDAG &DAG) const;
148149
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
149150
SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
150151
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;

llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll

Lines changed: 113 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,124 @@ define <2 x half> @v_test_cvt_v2f32_v2f16(<2 x float> %src) {
1212
ret <2 x half> %res
1313
}
1414

15-
define half @fptrunc_v2f32_v2f16_then_extract(<2 x float> %src) {
16-
; GFX950-LABEL: fptrunc_v2f32_v2f16_then_extract:
15+
define <4 x half> @v_test_cvt_v4f32_v4f16(<4 x float> %src) {
16+
; GFX950-LABEL: v_test_cvt_v4f32_v4f16:
1717
; GFX950: ; %bb.0:
1818
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1919
; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
20-
; GFX950-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
20+
; GFX950-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
21+
; GFX950-NEXT: s_setpc_b64 s[30:31]
22+
%res = fptrunc <4 x float> %src to <4 x half>
23+
ret <4 x half> %res
24+
}
25+
26+
define <8 x half> @v_test_cvt_v8f32_v2f16(<8 x float> %src) {
27+
; GFX950-LABEL: v_test_cvt_v8f32_v2f16:
28+
; GFX950: ; %bb.0:
29+
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30+
; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
31+
; GFX950-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
32+
; GFX950-NEXT: v_cvt_pk_f16_f32 v2, v4, v5
33+
; GFX950-NEXT: v_cvt_pk_f16_f32 v3, v6, v7
34+
; GFX950-NEXT: s_setpc_b64 s[30:31]
35+
%res = fptrunc <8 x float> %src to <8 x half>
36+
ret <8 x half> %res
37+
}
38+
39+
define half @fptrunc_v2f32_v2f16_extract_uses(<2 x float> %src) {
40+
; GFX950-LABEL: fptrunc_v2f32_v2f16_extract_uses:
41+
; GFX950: ; %bb.0:
42+
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
43+
; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
44+
; GFX950-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2145
; GFX950-NEXT: s_setpc_b64 s[30:31]
2246
%vec_half = fptrunc <2 x float> %src to <2 x half>
23-
%first = extractelement <2 x half> %vec_half, i64 1
24-
%second = extractelement <2 x half> %vec_half, i64 0
25-
%res = fadd half %first, %second
26-
ret half %res
47+
%f0 = extractelement <2 x half> %vec_half, i64 0
48+
%f1 = extractelement <2 x half> %vec_half, i64 1
49+
%rslt = fadd half %f0, %f1
50+
ret half %rslt
51+
}
52+
53+
define half @fptrunc_v4f32_v4f16_extract_uses(<4 x float> %vec_float) {
54+
; GFX950-SDAG-LABEL: fptrunc_v4f32_v4f16_extract_uses:
55+
; GFX950-SDAG: ; %bb.0:
56+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v2, v2, v3
58+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
59+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
60+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
61+
; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
62+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
63+
;
64+
; GFX950-GISEL-LABEL: fptrunc_v4f32_v4f16_extract_uses:
65+
; GFX950-GISEL: ; %bb.0:
66+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
68+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
69+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
70+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
71+
; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
72+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
73+
%vec_half = fptrunc <4 x float> %vec_float to <4 x half>
74+
%f0 = extractelement <4 x half> %vec_half, i64 0
75+
%f1 = extractelement <4 x half> %vec_half, i64 1
76+
%f2 = extractelement <4 x half> %vec_half, i64 2
77+
%f3 = extractelement <4 x half> %vec_half, i64 3
78+
%sum0 = fadd half %f0, %f1
79+
%sum1 = fadd half %f2, %f3
80+
%rslt = fadd half %sum0, %sum1
81+
ret half %rslt
82+
}
83+
84+
define half @fptrunc_v8f32_v8f16_extract_uses(<8 x float> %vec_float) {
85+
; GFX950-SDAG-LABEL: fptrunc_v8f32_v8f16_extract_uses:
86+
; GFX950-SDAG: ; %bb.0:
87+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
88+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v6, v6, v7
89+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v4, v4, v5
90+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v2, v2, v3
91+
; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
92+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
93+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
94+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
95+
; GFX950-SDAG-NEXT: v_add_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
96+
; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
97+
; GFX950-SDAG-NEXT: v_add_f16_e32 v1, v2, v3
98+
; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
99+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
100+
;
101+
; GFX950-GISEL-LABEL: fptrunc_v8f32_v8f16_extract_uses:
102+
; GFX950-GISEL: ; %bb.0:
103+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
104+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
105+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
106+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v2, v4, v5
107+
; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v3, v6, v7
108+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
109+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
110+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
111+
; GFX950-GISEL-NEXT: v_add_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
112+
; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
113+
; GFX950-GISEL-NEXT: v_add_f16_e32 v1, v2, v3
114+
; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
115+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
116+
%vec_half = fptrunc <8 x float> %vec_float to <8 x half>
117+
%f0 = extractelement <8 x half> %vec_half, i64 0
118+
%f1 = extractelement <8 x half> %vec_half, i64 1
119+
%f2 = extractelement <8 x half> %vec_half, i64 2
120+
%f3 = extractelement <8 x half> %vec_half, i64 3
121+
%f4 = extractelement <8 x half> %vec_half, i64 4
122+
%f5 = extractelement <8 x half> %vec_half, i64 5
123+
%f6 = extractelement <8 x half> %vec_half, i64 6
124+
%f7 = extractelement <8 x half> %vec_half, i64 7
125+
%sum0 = fadd half %f0, %f1
126+
%sum1 = fadd half %f2, %f3
127+
%sum2 = fadd half %f4, %f5
128+
%sum3 = fadd half %f6, %f7
129+
%sum4 = fadd half %sum0, %sum1
130+
%sum5 = fadd half %sum2, %sum3
131+
%rslt = fadd half %sum4, %sum5
132+
ret half %rslt
27133
}
28134

29135
define <2 x half> @v_test_cvt_v2f64_v2f16(<2 x double> %src) {

0 commit comments

Comments
 (0)