Skip to content

Commit c496176

Browse files
committed
[AMDGPU] Fix canonicalization of truncated values.
We were relying on roundings to implicitly canonicalize, which is generally safe, except with roundings that may be optimized away. Fixes #82937.
1 parent a3748d6 commit c496176

File tree

7 files changed

+103
-14
lines changed

7 files changed

+103
-14
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26258,6 +26258,24 @@ SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
2625826258
if (N0->getOpcode() == ISD::FP16_TO_FP)
2625926259
return N0->getOperand(0);
2626026260

26261+
// fold (fp_to_fp16 (freeze (fp16_to_fp (fp_to_fp16 op))))
26262+
// -> (fp_to_fp16 (freeze op))
26263+
if (N0->getOpcode() == ISD::FREEZE) {
26264+
if (auto fp16_to_fp = N0->getOperand(0);
26265+
fp16_to_fp->getOpcode() == ISD::FP16_TO_FP) {
26266+
if (auto new_fp16_to_fp = visitFP16_TO_FP(fp16_to_fp.getNode()))
26267+
if (new_fp16_to_fp->getOpcode() == ISD::FP16_TO_FP)
26268+
fp16_to_fp = new_fp16_to_fp;
26269+
if (auto fp_to_fp16 = fp16_to_fp->getOperand(0);
26270+
fp_to_fp16->getOpcode() == ISD::FP_TO_FP16) {
26271+
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
26272+
DAG.getNode(N0->getOpcode(), SDLoc(N0),
26273+
N0.getValueType(),
26274+
fp_to_fp16->getOperand(0)));
26275+
}
26276+
}
26277+
}
26278+
2626126279
return SDValue();
2626226280
}
2626326281

@@ -26286,6 +26304,24 @@ SDValue DAGCombiner::visitFP_TO_BF16(SDNode *N) {
2628626304
if (N0->getOpcode() == ISD::BF16_TO_FP)
2628726305
return N0->getOperand(0);
2628826306

26307+
// fold (fp_to_bf16 (freeze (bf16_to_fp (fp_to_bf16 op))))
26308+
// -> (fp_to_bf16 (freeze op))
26309+
if (N0->getOpcode() == ISD::FREEZE) {
26310+
if (auto bf16_to_fp = N0->getOperand(0);
26311+
bf16_to_fp->getOpcode() == ISD::BF16_TO_FP) {
26312+
if (auto new_bf16_to_fp = visitBF16_TO_FP(bf16_to_fp.getNode()))
26313+
if (new_bf16_to_fp->getOpcode() == ISD::BF16_TO_FP)
26314+
bf16_to_fp = new_bf16_to_fp;
26315+
if (auto fp_to_bf16 = bf16_to_fp->getOperand(0);
26316+
fp_to_bf16->getOpcode() == ISD::FP_TO_BF16) {
26317+
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
26318+
DAG.getNode(N0->getOpcode(), SDLoc(N0),
26319+
N0.getValueType(),
26320+
fp_to_bf16->getOperand(0)));
26321+
}
26322+
}
26323+
}
26324+
2628926325
return SDValue();
2629026326
}
2629126327

llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2398,6 +2398,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
23982398
case ISD::FCOPYSIGN: R = PromoteFloatRes_FCOPYSIGN(N); break;
23992399

24002400
// Unary FP Operations
2401+
case ISD::FREEZE:
24012402
case ISD::FABS:
24022403
case ISD::FCBRT:
24032404
case ISD::FCEIL:

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12416,7 +12416,7 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
1241612416
}
1241712417

1241812418
bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12419-
unsigned MaxDepth) const {
12419+
bool &Trunc, unsigned MaxDepth) const {
1242012420
unsigned Opcode = Op.getOpcode();
1242112421
if (Opcode == ISD::FCANONICALIZE)
1242212422
return true;
@@ -12450,7 +12450,6 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
1245012450
case ISD::FSQRT:
1245112451
case ISD::FDIV:
1245212452
case ISD::FREM:
12453-
case ISD::FP_ROUND:
1245412453
case ISD::FP_EXTEND:
1245512454
case ISD::FLDEXP:
1245612455
case AMDGPUISD::FMUL_LEGACY:
@@ -12473,12 +12472,22 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
1247312472
case AMDGPUISD::CVT_F32_UBYTE3:
1247412473
return true;
1247512474

12475+
case ISD::FP_ROUND:
12476+
if (Op.getConstantOperandVal(1))
12477+
Trunc = true;
12478+
return true;
12479+
12480+
case ISD::FREEZE:
12481+
// FREEZE is used as an optimization barrier; we can ignore any TRUNC in its
12482+
// input.
12483+
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12484+
1247612485
// It can/will be lowered or combined as a bit operation.
1247712486
// Need to check their input recursively to handle.
1247812487
case ISD::FNEG:
1247912488
case ISD::FABS:
1248012489
case ISD::FCOPYSIGN:
12481-
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12490+
return isCanonicalized(DAG, Op.getOperand(0), Trunc, MaxDepth - 1);
1248212491

1248312492
case ISD::FSIN:
1248412493
case ISD::FCOS:
@@ -12513,47 +12522,48 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
1251312522

1251412523
// FIXME: Does this apply with clamp? It's implemented with max.
1251512524
for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12516-
if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12525+
if (!isCanonicalized(DAG, Op.getOperand(I), Trunc, MaxDepth - 1))
1251712526
return false;
1251812527
}
1251912528

1252012529
return true;
1252112530
}
1252212531
case ISD::SELECT: {
12523-
return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12524-
isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12532+
return isCanonicalized(DAG, Op.getOperand(1), Trunc, MaxDepth - 1) &&
12533+
isCanonicalized(DAG, Op.getOperand(2), Trunc, MaxDepth - 1);
1252512534
}
1252612535
case ISD::BUILD_VECTOR: {
1252712536
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
1252812537
SDValue SrcOp = Op.getOperand(i);
12529-
if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12538+
if (!isCanonicalized(DAG, SrcOp, Trunc, MaxDepth - 1))
1253012539
return false;
1253112540
}
1253212541

1253312542
return true;
1253412543
}
1253512544
case ISD::EXTRACT_VECTOR_ELT:
1253612545
case ISD::EXTRACT_SUBVECTOR: {
12537-
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12546+
return isCanonicalized(DAG, Op.getOperand(0), Trunc, MaxDepth - 1);
1253812547
}
1253912548
case ISD::INSERT_VECTOR_ELT: {
12540-
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12541-
isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12549+
return isCanonicalized(DAG, Op.getOperand(0), Trunc, MaxDepth - 1) &&
12550+
isCanonicalized(DAG, Op.getOperand(1), Trunc, MaxDepth - 1);
1254212551
}
1254312552
case ISD::UNDEF:
1254412553
// Could be anything.
1254512554
return false;
1254612555

1254712556
case ISD::BITCAST:
12548-
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12557+
return isCanonicalized(DAG, Op.getOperand(0), Trunc, MaxDepth - 1);
1254912558
case ISD::TRUNCATE: {
1255012559
// Hack round the mess we make when legalizing extract_vector_elt
1255112560
if (Op.getValueType() == MVT::i16) {
1255212561
SDValue TruncSrc = Op.getOperand(0);
1255312562
if (TruncSrc.getValueType() == MVT::i32 &&
1255412563
TruncSrc.getOpcode() == ISD::BITCAST &&
1255512564
TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12556-
return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12565+
return isCanonicalized(DAG, TruncSrc.getOperand(0), Trunc,
12566+
MaxDepth - 1);
1255712567
}
1255812568
}
1255912569
return false;
@@ -12831,7 +12841,10 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
1283112841
}
1283212842
}
1283312843

12834-
return isCanonicalized(DAG, N0) ? N0 : SDValue();
12844+
bool Trunc = false;
12845+
return isCanonicalized(DAG, N0, Trunc)
12846+
? Trunc ? DAG.getNode(ISD::FREEZE, SDLoc(N), VT, N0) : N0
12847+
: SDValue();
1283512848
}
1283612849

1283712850
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
516516
Register N1) const override;
517517

518518
/// Convenience overload for callers that only need the yes/no answer.
/// Forwards to the overload taking `bool &Trunc` and discards the flag that
/// overload may set.
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
                     unsigned MaxDepth = 5) const {
  // Start from a defined value so the out-flag is never indeterminate, even
  // though its final value is ignored here.
  bool Trunc = false;
  return isCanonicalized(DAG, Op, Trunc, MaxDepth);
}
523+
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, bool &Trunc,
519524
unsigned MaxDepth = 5) const;
520525
bool isCanonicalized(Register Reg, MachineFunction &MF,
521526
unsigned MaxDepth = 5) const;

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26818,11 +26818,15 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
2681826818
; GCN-LABEL: v_canonicalize_bf16:
2681926819
; GCN: ; %bb.0:
2682026820
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26821+
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
26822+
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
2682126823
; GCN-NEXT: s_setpc_b64 s[30:31]
2682226824
;
2682326825
; GFX7-LABEL: v_canonicalize_bf16:
2682426826
; GFX7: ; %bb.0:
2682526827
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26828+
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
26829+
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
2682626830
; GFX7-NEXT: s_setpc_b64 s[30:31]
2682726831
;
2682826832
; GFX8-LABEL: v_canonicalize_bf16:

llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
170170
ret void
171171
}
172172

173+
define half @s_test_canonicalize_arg(half %x) #1 {
174+
; VI-LABEL: s_test_canonicalize_arg:
175+
; VI: ; %bb.0:
176+
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
177+
; VI-NEXT: v_max_f16_e32 v0, v0, v0
178+
; VI-NEXT: s_setpc_b64 s[30:31]
179+
;
180+
; GFX9-LABEL: s_test_canonicalize_arg:
181+
; GFX9: ; %bb.0:
182+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183+
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
184+
; GFX9-NEXT: s_setpc_b64 s[30:31]
185+
;
186+
; CI-LABEL: s_test_canonicalize_arg:
187+
; CI: ; %bb.0:
188+
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189+
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
190+
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
191+
; CI-NEXT: s_setpc_b64 s[30:31]
192+
;
193+
; GFX11-LABEL: s_test_canonicalize_arg:
194+
; GFX11: ; %bb.0:
195+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
196+
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
197+
; GFX11-NEXT: s_setpc_b64 s[30:31]
198+
%canonicalized = call half @llvm.canonicalize.f16(half %x)
199+
ret half %canonicalized
200+
}
201+
173202
define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
174203
; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
175204
; VI: ; %bb.0:

llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3980,7 +3980,8 @@ define half @v_fneg_canonicalize_f16(half %a) #0 {
39803980
; SI-LABEL: v_fneg_canonicalize_f16:
39813981
; SI: ; %bb.0:
39823982
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3983-
; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
3983+
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
3984+
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
39843985
; SI-NEXT: s_setpc_b64 s[30:31]
39853986
;
39863987
; VI-LABEL: v_fneg_canonicalize_f16:

0 commit comments

Comments
 (0)