Skip to content

Commit 04a8696

Browse files
committed
[FPEnv] Fix chain handling for fpexcept.strict nodes
We need to ensure that fpexcept.strict nodes are not optimized away even if the result is unused. To do that, we need to chain them into the block's terminator nodes, like already done for PendingExcepts. This patch adds two new lists of pending chains, PendingConstrainedFP and PendingConstrainedFPStrict to hold constrained FP intrinsic nodes without and with fpexcept.strict markers. This allows not only to solve the above problem, but also to relax chains a bit further by no longer flushing all FP nodes before a store or other memory access. (They are still flushed before nodes with other side effects.) Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D72341
1 parent d7d88b9 commit 04a8696

10 files changed

+542
-471
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1025,6 +1025,8 @@ void SelectionDAGBuilder::clear() {
10251025
UnusedArgNodeMap.clear();
10261026
PendingLoads.clear();
10271027
PendingExports.clear();
1028+
PendingConstrainedFP.clear();
1029+
PendingConstrainedFPStrict.clear();
10281030
CurInst = nullptr;
10291031
HasTailCall = false;
10301032
SDNodeOrder = LowestSDNodeOrder;
@@ -1035,7 +1037,7 @@ void SelectionDAGBuilder::clearDanglingDebugInfo() {
10351037
DanglingDebugInfoMap.clear();
10361038
}
10371039

1038-
SDValue SelectionDAGBuilder::getRoot() {
1040+
SDValue SelectionDAGBuilder::getMemoryRoot() {
10391041
if (PendingLoads.empty())
10401042
return DAG.getRoot();
10411043

@@ -1053,9 +1055,31 @@ SDValue SelectionDAGBuilder::getRoot() {
10531055
return Root;
10541056
}
10551057

1058+
SDValue SelectionDAGBuilder::getRoot() {
1059+
// Chain up all pending constrained intrinsics together with all
1060+
// pending loads, by simply appending them to PendingLoads and
1061+
// then calling getMemoryRoot().
1062+
PendingLoads.reserve(PendingLoads.size() +
1063+
PendingConstrainedFP.size() +
1064+
PendingConstrainedFPStrict.size());
1065+
PendingLoads.append(PendingConstrainedFP.begin(),
1066+
PendingConstrainedFP.end());
1067+
PendingLoads.append(PendingConstrainedFPStrict.begin(),
1068+
PendingConstrainedFPStrict.end());
1069+
PendingConstrainedFP.clear();
1070+
PendingConstrainedFPStrict.clear();
1071+
return getMemoryRoot();
1072+
}
1073+
10561074
SDValue SelectionDAGBuilder::getControlRoot() {
10571075
SDValue Root = DAG.getRoot();
10581076

1077+
// We need to emit pending fpexcept.strict constrained intrinsics,
1078+
// so append them to the PendingExports list.
1079+
PendingExports.append(PendingConstrainedFPStrict.begin(),
1080+
PendingConstrainedFPStrict.end());
1081+
PendingConstrainedFPStrict.clear();
1082+
10591083
if (PendingExports.empty())
10601084
return Root;
10611085

@@ -4060,9 +4084,11 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
40604084

40614085
SDValue Root;
40624086
bool ConstantMemory = false;
4063-
if (isVolatile || NumValues > MaxParallelChains)
4087+
if (isVolatile)
40644088
// Serialize volatile loads with other side effects.
40654089
Root = getRoot();
4090+
else if (NumValues > MaxParallelChains)
4091+
Root = getMemoryRoot();
40664092
else if (AA &&
40674093
AA->pointsToConstantMemory(MemoryLocation(
40684094
SV,
@@ -4237,7 +4263,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
42374263
SDValue Src = getValue(SrcV);
42384264
SDValue Ptr = getValue(PtrV);
42394265

4240-
SDValue Root = getRoot();
4266+
SDValue Root = I.isVolatile() ? getRoot() : getMemoryRoot();
42414267
SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
42424268
SDLoc dl = getCurSDLoc();
42434269
unsigned Alignment = I.getAlignment();
@@ -4329,7 +4355,7 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
43294355
VT.getStoreSize().getKnownMinSize(),
43304356
Alignment, AAInfo);
43314357
SDValue StoreNode =
4332-
DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO,
4358+
DAG.getMaskedStore(getMemoryRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO,
43334359
ISD::UNINDEXED, false /* Truncating */, IsCompressing);
43344360
DAG.setRoot(StoreNode);
43354361
setValue(&I, StoreNode);
@@ -4463,7 +4489,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
44634489
IndexType = ISD::SIGNED_SCALED;
44644490
Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
44654491
}
4466-
SDValue Ops[] = { getRoot(), Src0, Mask, Base, Index, Scale };
4492+
SDValue Ops[] = { getMemoryRoot(), Src0, Mask, Base, Index, Scale };
44674493
SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl,
44684494
Ops, MMO, IndexType);
44694495
DAG.setRoot(Scatter);
@@ -5850,7 +5876,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
58505876
bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
58515877
// FIXME: Support passing different dest/src alignments to the memcpy DAG
58525878
// node.
5853-
SDValue MC = DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
5879+
SDValue Root = isVol ? getRoot() : getMemoryRoot();
5880+
SDValue MC = DAG.getMemcpy(Root, sdl, Op1, Op2, Op3, Align, isVol,
58545881
false, isTC,
58555882
MachinePointerInfo(I.getArgOperand(0)),
58565883
MachinePointerInfo(I.getArgOperand(1)));
@@ -5866,7 +5893,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
58665893
unsigned Align = std::max<unsigned>(MSI.getDestAlignment(), 1);
58675894
bool isVol = MSI.isVolatile();
58685895
bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
5869-
SDValue MS = DAG.getMemset(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
5896+
SDValue Root = isVol ? getRoot() : getMemoryRoot();
5897+
SDValue MS = DAG.getMemset(Root, sdl, Op1, Op2, Op3, Align, isVol,
58705898
isTC, MachinePointerInfo(I.getArgOperand(0)));
58715899
updateDAGForMaybeTailCall(MS);
58725900
return;
@@ -5884,7 +5912,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
58845912
bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
58855913
// FIXME: Support passing different dest/src alignments to the memmove DAG
58865914
// node.
5887-
SDValue MM = DAG.getMemmove(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
5915+
SDValue Root = isVol ? getRoot() : getMemoryRoot();
5916+
SDValue MM = DAG.getMemmove(Root, sdl, Op1, Op2, Op3, Align, isVol,
58885917
isTC, MachinePointerInfo(I.getArgOperand(0)),
58895918
MachinePointerInfo(I.getArgOperand(1)));
58905919
updateDAGForMaybeTailCall(MM);
@@ -7039,9 +7068,29 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
70397068
SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers);
70407069

70417070
assert(Result.getNode()->getNumValues() == 2);
7042-
// See above -- chain is handled like for loads here.
7071+
7072+
// Push node to the appropriate list so that future instructions can be
7073+
// chained up correctly.
70437074
SDValue OutChain = Result.getValue(1);
7044-
PendingLoads.push_back(OutChain);
7075+
switch (FPI.getExceptionBehavior().getValue()) {
7076+
case fp::ExceptionBehavior::ebIgnore:
7077+
// The only reason why ebIgnore nodes still need to be chained is that
7078+
// they might depend on the current rounding mode, and therefore must
7079+
// not be moved across instruction that may change that mode.
7080+
LLVM_FALLTHROUGH;
7081+
case fp::ExceptionBehavior::ebMayTrap:
7082+
// These must not be moved across calls or instructions that may change
7083+
// floating-point exception masks.
7084+
PendingConstrainedFP.push_back(OutChain);
7085+
break;
7086+
case fp::ExceptionBehavior::ebStrict:
7087+
// These must not be moved across calls or instructions that may change
7088+
// floating-point exception masks or read floating-point exception flags.
7089+
// In addition, they cannot be optimized out even if unused.
7090+
PendingConstrainedFPStrict.push_back(OutChain);
7091+
break;
7092+
}
7093+
70457094
SDValue FPResult = Result.getValue(0);
70467095
setValue(&FPI, FPResult);
70477096
}
@@ -7424,7 +7473,8 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
74247473
// In the mempcpy context we need to pass in a false value for isTailCall
74257474
// because the return pointer needs to be adjusted by the size of
74267475
// the copied memory.
7427-
SDValue MC = DAG.getMemcpy(getRoot(), sdl, Dst, Src, Size, Align, isVol,
7476+
SDValue Root = isVol ? getRoot() : getMemoryRoot();
7477+
SDValue MC = DAG.getMemcpy(Root, sdl, Dst, Src, Size, Align, isVol,
74287478
false, /*isTailCall=*/false,
74297479
MachinePointerInfo(I.getArgOperand(0)),
74307480
MachinePointerInfo(I.getArgOperand(1)));

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,17 @@ class SelectionDAGBuilder {
143143
/// tokenfactor for them just before terminator instructions.
144144
SmallVector<SDValue, 8> PendingExports;
145145

146+
/// Similar to loads, nodes corresponding to constrained FP intrinsics are
147+
/// bunched up and emitted when necessary. These can be moved across each
148+
/// other and any (normal) memory operation (load or store), but not across
149+
/// calls or instructions having unspecified side effects. As a special
150+
/// case, constrained FP intrinsics using fpexcept.strict may not be deleted
151+
/// even if otherwise unused, so they need to be chained before any
152+
/// terminator instruction (like PendingExports). We track the latter
153+
/// set of nodes in a separate list.
154+
SmallVector<SDValue, 8> PendingConstrainedFP;
155+
SmallVector<SDValue, 8> PendingConstrainedFPStrict;
156+
146157
/// A unique monotonically increasing number used to order the SDNodes we
147158
/// create.
148159
unsigned SDNodeOrder;
@@ -447,12 +458,18 @@ class SelectionDAGBuilder {
447458

448459
/// Return the current virtual root of the Selection DAG, flushing any
449460
/// PendingLoad items. This must be done before emitting a store or any other
450-
/// node that may need to be ordered after any prior load instructions.
461+
/// memory node that may need to be ordered after any prior load instructions.
462+
SDValue getMemoryRoot();
463+
464+
/// Similar to getMemoryRoot, but also flushes PendingConstrainedFP(Strict)
465+
/// items. This must be done before emitting any call other any other node
466+
/// that may need to be ordered after FP instructions due to other side
467+
/// effects.
451468
SDValue getRoot();
452469

453470
/// Similar to getRoot, but instead of flushing all the PendingLoad items,
454-
/// flush all the PendingExports items. It is necessary to do this before
455-
/// emitting a terminator instruction.
471+
/// flush all the PendingExports (and PendingConstrainedFPStrict) items.
472+
/// It is necessary to do this before emitting a terminator instruction.
456473
SDValue getControlRoot();
457474

458475
SDLoc getCurSDLoc() const {

llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1358,21 +1358,21 @@ define void @test_constrained_libcall_multichain(float* %firstptr, ppc_fp128* %r
13581358
; PC64LE-NEXT: fmr 4, 2
13591359
; PC64LE-NEXT: fmr 30, 1
13601360
; PC64LE-NEXT: fmr 29, 2
1361-
; PC64LE-NEXT: stfd 1, 16(30)
13621361
; PC64LE-NEXT: stfd 2, 24(30)
1362+
; PC64LE-NEXT: stfd 1, 16(30)
13631363
; PC64LE-NEXT: bl __gcc_qmul
13641364
; PC64LE-NEXT: nop
13651365
; PC64LE-NEXT: fmr 1, 31
13661366
; PC64LE-NEXT: xxlxor 2, 2, 2
13671367
; PC64LE-NEXT: li 5, 2
1368-
; PC64LE-NEXT: stfd 30, 32(30)
13691368
; PC64LE-NEXT: stfd 29, 40(30)
1369+
; PC64LE-NEXT: stfd 30, 32(30)
13701370
; PC64LE-NEXT: bl __powitf2
13711371
; PC64LE-NEXT: nop
13721372
; PC64LE-NEXT: frsp 0, 1
13731373
; PC64LE-NEXT: stfsx 0, 0, 29
1374-
; PC64LE-NEXT: stfd 2, -8(30)
13751374
; PC64LE-NEXT: stfd 1, -16(30)
1375+
; PC64LE-NEXT: stfd 2, -8(30)
13761376
; PC64LE-NEXT: addi 1, 1, 80
13771377
; PC64LE-NEXT: ld 0, 16(1)
13781378
; PC64LE-NEXT: lfd 31, -8(1) # 8-byte Folded Reload
@@ -1409,21 +1409,21 @@ define void @test_constrained_libcall_multichain(float* %firstptr, ppc_fp128* %r
14091409
; PC64LE9-NEXT: fmr 4, 2
14101410
; PC64LE9-NEXT: fmr 30, 2
14111411
; PC64LE9-NEXT: fmr 29, 1
1412-
; PC64LE9-NEXT: stfd 1, 16(30)
14131412
; PC64LE9-NEXT: stfd 2, 24(30)
1413+
; PC64LE9-NEXT: stfd 1, 16(30)
14141414
; PC64LE9-NEXT: bl __gcc_qmul
14151415
; PC64LE9-NEXT: nop
14161416
; PC64LE9-NEXT: fmr 1, 31
14171417
; PC64LE9-NEXT: xxlxor 2, 2, 2
14181418
; PC64LE9-NEXT: li 5, 2
1419-
; PC64LE9-NEXT: stfd 29, 32(30)
14201419
; PC64LE9-NEXT: stfd 30, 40(30)
1420+
; PC64LE9-NEXT: stfd 29, 32(30)
14211421
; PC64LE9-NEXT: bl __powitf2
14221422
; PC64LE9-NEXT: nop
14231423
; PC64LE9-NEXT: frsp 0, 1
14241424
; PC64LE9-NEXT: stfs 0, 0(29)
1425-
; PC64LE9-NEXT: stfd 2, -8(30)
14261425
; PC64LE9-NEXT: stfd 1, -16(30)
1426+
; PC64LE9-NEXT: stfd 2, -8(30)
14271427
; PC64LE9-NEXT: addi 1, 1, 80
14281428
; PC64LE9-NEXT: ld 0, 16(1)
14291429
; PC64LE9-NEXT: lfd 31, -8(1) # 8-byte Folded Reload
@@ -1463,15 +1463,15 @@ define void @test_constrained_libcall_multichain(float* %firstptr, ppc_fp128* %r
14631463
; PC64-NEXT: fmr 4, 2
14641464
; PC64-NEXT: fmr 29, 1
14651465
; PC64-NEXT: fmr 28, 2
1466-
; PC64-NEXT: stfd 1, 16(30)
14671466
; PC64-NEXT: stfd 2, 24(30)
1467+
; PC64-NEXT: stfd 1, 16(30)
14681468
; PC64-NEXT: bl __gcc_qmul
14691469
; PC64-NEXT: nop
14701470
; PC64-NEXT: fmr 1, 31
14711471
; PC64-NEXT: fmr 2, 30
14721472
; PC64-NEXT: li 5, 2
1473-
; PC64-NEXT: stfd 29, 32(30)
14741473
; PC64-NEXT: stfd 28, 40(30)
1474+
; PC64-NEXT: stfd 29, 32(30)
14751475
; PC64-NEXT: bl __powitf2
14761476
; PC64-NEXT: nop
14771477
; PC64-NEXT: frsp 0, 1
@@ -1481,8 +1481,8 @@ define void @test_constrained_libcall_multichain(float* %firstptr, ppc_fp128* %r
14811481
; PC64-NEXT: lfd 29, 152(1) # 8-byte Folded Reload
14821482
; PC64-NEXT: lfd 28, 144(1) # 8-byte Folded Reload
14831483
; PC64-NEXT: ld 29, 120(1) # 8-byte Folded Reload
1484-
; PC64-NEXT: stfd 2, -8(30)
14851484
; PC64-NEXT: stfd 1, -16(30)
1485+
; PC64-NEXT: stfd 2, -8(30)
14861486
; PC64-NEXT: ld 30, 128(1) # 8-byte Folded Reload
14871487
; PC64-NEXT: addi 1, 1, 176
14881488
; PC64-NEXT: ld 0, 16(1)

llvm/test/CodeGen/SystemZ/fp-strict-alias.ll

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,4 +287,56 @@ define void @f12(float %f1, float %f2, float *%ptr1, float *%ptr2) #0 {
287287
ret void
288288
}
289289

290+
; If the result of any FP operation is unused, it can be removed
291+
; -- except for fpexcept.strict operations.
292+
293+
define void @f13(float %f1) {
294+
; CHECK-LABEL: f13:
295+
; CHECK-NOT: sqeb
296+
; CHECK: br %r14
297+
298+
%sqrt = call float @llvm.sqrt.f32(float %f1)
299+
300+
ret void
301+
}
302+
303+
define void @f14(float %f1) {
304+
; CHECK-LABEL: f14:
305+
; CHECK-NOT: sqeb
306+
; CHECK: br %r14
307+
308+
%sqrt = call float @llvm.experimental.constrained.sqrt.f32(
309+
float %f1,
310+
metadata !"round.dynamic",
311+
metadata !"fpexcept.ignore") #0
312+
313+
ret void
314+
}
315+
316+
define void @f15(float %f1) {
317+
; CHECK-LABEL: f15:
318+
; CHECK-NOT: sqeb
319+
; CHECK: br %r14
320+
321+
%sqrt = call float @llvm.experimental.constrained.sqrt.f32(
322+
float %f1,
323+
metadata !"round.dynamic",
324+
metadata !"fpexcept.maytrap") #0
325+
326+
ret void
327+
}
328+
329+
define void @f16(float %f1) {
330+
; CHECK-LABEL: f16:
331+
; CHECK: sqebr
332+
; CHECK: br %r14
333+
334+
%sqrt = call float @llvm.experimental.constrained.sqrt.f32(
335+
float %f1,
336+
metadata !"round.dynamic",
337+
metadata !"fpexcept.strict") #0
338+
339+
ret void
340+
}
341+
290342
attributes #0 = { strictfp }

0 commit comments

Comments
 (0)