Skip to content

Commit ddc0f1d

Browse files
committed
[TargetLowering] Actually add the adjustment to the significand
The logic was supposed to be choosing between {0, 1, -1} as an adjustment to the FP bit pattern. However, the adjustment itself was used as the bit pattern instead, which resulted in garbage results.
1 parent 1c81b4a commit ddc0f1d

File tree

2 files changed

+39
-32
lines changed

2 files changed

+39
-32
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10895,28 +10895,31 @@ SDValue TargetLowering::expandRoundInexactToOdd(EVT ResultVT, SDValue Op,
1089510895
EVT ResultIntVTCCVT = getSetCCResultType(
1089610896
DAG.getDataLayout(), *DAG.getContext(), And.getValueType());
1089710897
SDValue Zero = DAG.getConstant(0, dl, ResultIntVT);
10898+
// The result is already odd so we don't need to do anything.
1089810899
SDValue AlreadyOdd = DAG.getSetCC(dl, ResultIntVTCCVT, And, Zero, ISD::SETNE);
1089910900

1090010901
EVT WideSetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
1090110902
AbsWide.getValueType());
10903+
// We keep results which are exact, odd or NaN.
1090210904
SDValue KeepNarrow =
1090310905
DAG.getSetCC(dl, WideSetCCVT, AbsWide, AbsNarrowAsWide, ISD::SETUEQ);
1090410906
KeepNarrow = DAG.getNode(ISD::OR, dl, WideSetCCVT, KeepNarrow, AlreadyOdd);
10905-
// We morally performed a round-down if `abs_narrow` is smaller than
10906-
// `abs_wide`.
10907+
// We morally performed a round-down if AbsNarrow is smaller than
10908+
// AbsWide.
1090710909
SDValue NarrowIsRd =
1090810910
DAG.getSetCC(dl, WideSetCCVT, AbsWide, AbsNarrowAsWide, ISD::SETOGT);
1090910911
// If the narrow value is odd or exact, pick it.
1091010912
// Otherwise, narrow is even and corresponds to either the rounded-up
1091110913
// or rounded-down value. If narrow is the rounded-down value, we want
1091210914
// the rounded-up value as it will be odd.
1091310915
SDValue Adjust = DAG.getSelect(dl, ResultIntVT, NarrowIsRd, One, NegativeOne);
10914-
Adjust = DAG.getSelect(dl, ResultIntVT, KeepNarrow, Zero, Adjust);
10916+
SDValue Adjusted = DAG.getNode(ISD::ADD, dl, ResultIntVT, NarrowBits, Adjust);
10917+
Op = DAG.getSelect(dl, ResultIntVT, KeepNarrow, NarrowBits, Adjusted);
1091510918
int ShiftAmount = BitSize - ResultVT.getScalarSizeInBits();
1091610919
SDValue ShiftCnst = DAG.getShiftAmountConstant(ShiftAmount, WideIntVT, dl);
1091710920
SignBit = DAG.getNode(ISD::SRL, dl, WideIntVT, SignBit, ShiftCnst);
1091810921
SignBit = DAG.getNode(ISD::TRUNCATE, dl, ResultIntVT, SignBit);
10919-
Op = DAG.getNode(ISD::OR, dl, ResultIntVT, Adjust, SignBit);
10922+
Op = DAG.getNode(ISD::OR, dl, ResultIntVT, Op, SignBit);
1092010923
return DAG.getNode(ISD::BITCAST, dl, ResultVT, Op);
1092110924
}
1092210925

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 32 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2281,13 +2281,14 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
22812281
; GFX8-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
22822282
; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v1
22832283
; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2284-
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
2285-
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
2286-
; GFX8-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2287-
; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
2288-
; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], vcc
2289-
; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
2290-
; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
2284+
; GFX8-NEXT: v_and_b32_e32 v8, 1, v6
2285+
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
2286+
; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2287+
; GFX8-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[0:1]|, v[4:5]
2288+
; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[4:5]
2289+
; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v6, v4
2290+
; GFX8-NEXT: s_or_b64 vcc, s[6:7], vcc
2291+
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
22912292
; GFX8-NEXT: v_or_b32_e32 v5, v4, v7
22922293
; GFX8-NEXT: v_bfe_u32 v4, v4, 16, 1
22932294
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
@@ -2310,14 +2311,15 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
23102311
; GFX9-NEXT: s_waitcnt vmcnt(0)
23112312
; GFX9-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
23122313
; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2313-
; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
2314-
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
2315-
; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
2314+
; GFX9-NEXT: v_and_b32_e32 v7, 1, v6
2315+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
23162316
; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
2317-
; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
2318-
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
2317+
; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
23192318
; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
2320-
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
2319+
; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
2320+
; GFX9-NEXT: s_or_b64 vcc, s[4:5], vcc
2321+
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
2322+
; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
23212323
; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4
23222324
; GFX9-NEXT: v_bfe_u32 v4, v4, 16, 1
23232325
; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9
@@ -2335,15 +2337,16 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
23352337
; GFX10-NEXT: s_waitcnt vmcnt(0)
23362338
; GFX10-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
23372339
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2338-
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
2339-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
2340-
; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
2340+
; GFX10-NEXT: v_and_b32_e32 v7, 1, v6
2341+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
23412342
; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |v[0:1]|, v[4:5]
2342-
; GFX10-NEXT: s_or_b32 s4, s4, vcc_lo
2343-
; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2343+
; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
23442344
; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5
2345-
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s4
2345+
; GFX10-NEXT: s_or_b32 vcc_lo, s4, vcc_lo
23462346
; GFX10-NEXT: s_mov_b32 s4, 0x400000
2347+
; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4
2348+
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
2349+
; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
23472350
; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
23482351
; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1
23492352
; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
@@ -2360,23 +2363,24 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
23602363
; GFX11-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
23612364
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
23622365
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
2363-
; GFX11-NEXT: v_and_b32_e32 v6, 1, v6
2364-
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
2366+
; GFX11-NEXT: v_and_b32_e32 v7, 1, v6
2367+
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
23652368
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2366-
; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
23672369
; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
2368-
; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo
2369-
; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
2370-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2370+
; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
23712371
; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
2372-
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, 0, s0
2372+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2373+
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
23732374
; GFX11-NEXT: s_mov_b32 s0, 0x400000
2374-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2375+
; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
2376+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2377+
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
2378+
; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
23752379
; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
23762380
; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
2381+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
23772382
; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
23782383
; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v5, s0
2379-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
23802384
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
23812385
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
23822386
; GFX11-NEXT: s_setpc_b64 s[30:31]

0 commit comments

Comments
 (0)