Skip to content

Commit f138e36

Browse files
[SelectionDAG][RISCV] Avoid store merging across function calls (#130430)
This patch improves DAGCombiner's handling of potential store merges by detecting function calls between loads and stores. When a function call exists in the chain between a load and its corresponding store, we avoid merging these stores if the spilling is unprofitable. We had to implement a hook on TLI, since TTI is unavailable in DAGCombine. Currently, it's only enabled for RISC-V. This is the DAG equivalent of PR #129258
1 parent fbaf3b8 commit f138e36

File tree

4 files changed

+108
-64
lines changed

4 files changed

+108
-64
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3520,6 +3520,10 @@ class TargetLoweringBase {
35203520
/// The default implementation just freezes the set of reserved registers.
35213521
virtual void finalizeLowering(MachineFunction &MF) const;
35223522

3523+
/// Returns true if it's profitable to allow merging store of loads when there
3524+
/// are function calls between the load and the store.
3525+
virtual bool shouldMergeStoreOfLoadsOverCall(EVT, EVT) const { return true; }
3526+
35233527
//===----------------------------------------------------------------------===//
35243528
// GlobalISel Hooks
35253529
//===----------------------------------------------------------------------===//

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,10 @@ namespace {
792792
SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
793793
SDNode *RootNode);
794794

795+
/// Helper function for tryStoreMergeOfLoads. Checks if the load/store
796+
/// chain has a call in it. \return True if a call is found.
797+
bool hasCallInLdStChain(StoreSDNode *St, LoadSDNode *Ld);
798+
795799
/// This is a helper function for mergeConsecutiveStores. Given a list of
796800
/// store candidates, find the first N that are consecutive in memory.
797801
/// Returns 0 if there are not at least 2 consecutive stores to try merging.
@@ -21152,6 +21156,38 @@ bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
2115221156
return true;
2115321157
}
2115421158

21159+
bool DAGCombiner::hasCallInLdStChain(StoreSDNode *St, LoadSDNode *Ld) {
21160+
SmallPtrSet<const SDNode *, 32> Visited;
21161+
SmallVector<std::pair<const SDNode *, bool>, 8> Worklist;
21162+
Worklist.emplace_back(St->getChain().getNode(), false);
21163+
21164+
while (!Worklist.empty()) {
21165+
auto [Node, FoundCall] = Worklist.pop_back_val();
21166+
if (!Visited.insert(Node).second || Node->getNumOperands() == 0)
21167+
continue;
21168+
21169+
switch (Node->getOpcode()) {
21170+
case ISD::CALLSEQ_END:
21171+
Worklist.emplace_back(Node->getOperand(0).getNode(), true);
21172+
break;
21173+
case ISD::TokenFactor:
21174+
for (SDValue Op : Node->ops())
21175+
Worklist.emplace_back(Op.getNode(), FoundCall);
21176+
break;
21177+
case ISD::LOAD:
21178+
if (Node == Ld)
21179+
return FoundCall;
21180+
[[fallthrough]];
21181+
default:
21182+
assert(Node->getOperand(0).getValueType() == MVT::Other &&
21183+
"Invalid chain type");
21184+
Worklist.emplace_back(Node->getOperand(0).getNode(), FoundCall);
21185+
break;
21186+
}
21187+
}
21188+
return false;
21189+
}
21190+
2115521191
unsigned
2115621192
DAGCombiner::getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
2115721193
int64_t ElementSizeBytes) const {
@@ -21598,6 +21634,16 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
2159821634
JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
2159921635
}
2160021636

21637+
// Check if there is a call in the load/store chain.
21638+
if (!TLI.shouldMergeStoreOfLoadsOverCall(MemVT, JointMemOpVT) &&
21639+
hasCallInLdStChain(cast<StoreSDNode>(StoreNodes[0].MemNode),
21640+
cast<LoadSDNode>(LoadNodes[0].MemNode))) {
21641+
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
21642+
LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
21643+
NumConsecutiveStores -= NumElem;
21644+
continue;
21645+
}
21646+
2160121647
SDLoc LoadDL(LoadNodes[0].MemNode);
2160221648
SDLoc StoreDL(StoreNodes[0].MemNode);
2160321649

llvm/lib/Target/RISCV/RISCVISelLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,13 @@ class RISCVTargetLowering : public TargetLowering {
10701070
return false;
10711071
}
10721072

1073+
/// Disables storing and loading vectors by default when there are function
1074+
/// calls between the load and store, since these are more expensive than just
1075+
/// using scalars.
1076+
bool shouldMergeStoreOfLoadsOverCall(EVT SrcVT, EVT MergedVT) const override {
1077+
return !MergedVT.isVector() || SrcVT.isVector();
1078+
}
1079+
10731080
/// For available scheduling models FDIV + two independent FMULs are much
10741081
/// faster than two FDIVs.
10751082
unsigned combineRepeatedFPDivisors() const override;

llvm/test/CodeGen/RISCV/rvv/stores-of-loads-merging.ll

Lines changed: 51 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,8 @@
22
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+b | FileCheck %s --check-prefixes=CHECK,V
33
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+b,+zvfh | FileCheck %s --check-prefixes=CHECK,ZVFH
44

5-
declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata)
65
declare void @g()
76

8-
; TODO: Merging scalars into vectors is unprofitable because we have no
9-
; vector CSRs which creates additional spills around the call.
107
define void @f(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
118
; CHECK-LABEL: f:
129
; CHECK: # %bb.0:
@@ -16,40 +13,40 @@ define void @f(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
1613
; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill
1714
; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill
1815
; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill
16+
; CHECK-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
17+
; CHECK-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
1918
; CHECK-NEXT: .cfi_offset ra, -8
2019
; CHECK-NEXT: .cfi_offset s0, -16
2120
; CHECK-NEXT: .cfi_offset s1, -24
2221
; CHECK-NEXT: .cfi_offset s2, -32
23-
; CHECK-NEXT: csrr a6, vlenb
24-
; CHECK-NEXT: sub sp, sp, a6
25-
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb
22+
; CHECK-NEXT: .cfi_offset s3, -40
23+
; CHECK-NEXT: .cfi_offset s4, -48
2624
; CHECK-NEXT: mv s0, a5
2725
; CHECK-NEXT: mv s1, a4
2826
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
2927
; CHECK-NEXT: vle64.v v8, (a0)
3028
; CHECK-NEXT: vse64.v v8, (a1)
31-
; CHECK-NEXT: vle64.v v8, (a2)
32-
; CHECK-NEXT: addi a0, sp, 16
33-
; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
29+
; CHECK-NEXT: ld s3, 0(a2)
30+
; CHECK-NEXT: ld s4, 8(a2)
3431
; CHECK-NEXT: mv s2, a3
3532
; CHECK-NEXT: call g
36-
; CHECK-NEXT: addi a0, sp, 16
37-
; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
33+
; CHECK-NEXT: sd s3, 0(s2)
34+
; CHECK-NEXT: sd s4, 8(s2)
3835
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
39-
; CHECK-NEXT: vse64.v v8, (s2)
4036
; CHECK-NEXT: vle64.v v8, (s1)
4137
; CHECK-NEXT: vse64.v v8, (s0)
42-
; CHECK-NEXT: csrr a0, vlenb
43-
; CHECK-NEXT: add sp, sp, a0
44-
; CHECK-NEXT: .cfi_def_cfa sp, 48
4538
; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
4639
; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
4740
; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
4841
; CHECK-NEXT: ld s2, 16(sp) # 8-byte Folded Reload
42+
; CHECK-NEXT: ld s3, 8(sp) # 8-byte Folded Reload
43+
; CHECK-NEXT: ld s4, 0(sp) # 8-byte Folded Reload
4944
; CHECK-NEXT: .cfi_restore ra
5045
; CHECK-NEXT: .cfi_restore s0
5146
; CHECK-NEXT: .cfi_restore s1
5247
; CHECK-NEXT: .cfi_restore s2
48+
; CHECK-NEXT: .cfi_restore s3
49+
; CHECK-NEXT: .cfi_restore s4
5350
; CHECK-NEXT: addi sp, sp, 48
5451
; CHECK-NEXT: .cfi_def_cfa_offset 0
5552
; CHECK-NEXT: ret
@@ -78,13 +75,13 @@ define void @f(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
7875
ret void
7976
}
8077

81-
define void @f1(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
78+
define void @f1(ptr %p, ptr %q, double %t) {
8279
; CHECK-LABEL: f1:
8380
; CHECK: # %bb.0:
8481
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
85-
; CHECK-NEXT: vle64.v v8, (a2)
82+
; CHECK-NEXT: vle64.v v8, (a0)
8683
; CHECK-NEXT: fcvt.wu.d a0, fa0, rtz
87-
; CHECK-NEXT: vse64.v v8, (a3)
84+
; CHECK-NEXT: vse64.v v8, (a1)
8885
; CHECK-NEXT: ret
8986
%x0 = load i64, ptr %p
9087
%p.1 = getelementptr i64, ptr %p, i64 1
@@ -93,7 +90,6 @@ define void @f1(ptr %m, ptr %n, ptr %p, ptr %q, ptr %r, ptr %s, double %t) {
9390
store i64 %x0, ptr %q
9491
%q.1 = getelementptr i64, ptr %q, i64 1
9592
store i64 %x1, ptr %q.1
96-
9793
ret void
9894
}
9995

@@ -515,28 +511,26 @@ define void @two_half_unaligned(ptr %p, ptr %q) {
515511
; ZVFH-NEXT: .cfi_def_cfa_offset 32
516512
; ZVFH-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
517513
; ZVFH-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
514+
; ZVFH-NEXT: fsd fs0, 8(sp) # 8-byte Folded Spill
515+
; ZVFH-NEXT: fsd fs1, 0(sp) # 8-byte Folded Spill
518516
; ZVFH-NEXT: .cfi_offset ra, -8
519517
; ZVFH-NEXT: .cfi_offset s0, -16
520-
; ZVFH-NEXT: csrr a2, vlenb
521-
; ZVFH-NEXT: sub sp, sp, a2
522-
; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 1 * vlenb
523-
; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
524-
; ZVFH-NEXT: vle16.v v8, (a0)
525-
; ZVFH-NEXT: addi a0, sp, 16
526-
; ZVFH-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
518+
; ZVFH-NEXT: .cfi_offset fs0, -24
519+
; ZVFH-NEXT: .cfi_offset fs1, -32
520+
; ZVFH-NEXT: flh fs0, 0(a0)
521+
; ZVFH-NEXT: flh fs1, 2(a0)
527522
; ZVFH-NEXT: mv s0, a1
528523
; ZVFH-NEXT: call g
529-
; ZVFH-NEXT: addi a0, sp, 16
530-
; ZVFH-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
531-
; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
532-
; ZVFH-NEXT: vse16.v v8, (s0)
533-
; ZVFH-NEXT: csrr a0, vlenb
534-
; ZVFH-NEXT: add sp, sp, a0
535-
; ZVFH-NEXT: .cfi_def_cfa sp, 32
524+
; ZVFH-NEXT: fsh fs0, 0(s0)
525+
; ZVFH-NEXT: fsh fs1, 2(s0)
536526
; ZVFH-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
537527
; ZVFH-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
528+
; ZVFH-NEXT: fld fs0, 8(sp) # 8-byte Folded Reload
529+
; ZVFH-NEXT: fld fs1, 0(sp) # 8-byte Folded Reload
538530
; ZVFH-NEXT: .cfi_restore ra
539531
; ZVFH-NEXT: .cfi_restore s0
532+
; ZVFH-NEXT: .cfi_restore fs0
533+
; ZVFH-NEXT: .cfi_restore fs1
540534
; ZVFH-NEXT: addi sp, sp, 32
541535
; ZVFH-NEXT: .cfi_def_cfa_offset 0
542536
; ZVFH-NEXT: ret
@@ -552,9 +546,6 @@ define void @two_half_unaligned(ptr %p, ptr %q) {
552546
ret void
553547
}
554548

555-
556-
; TODO: This one is currently a vector which is unprofitable, we should
557-
; use i64 instead.
558549
define void @two_float(ptr %p, ptr %q) {
559550
; CHECK-LABEL: two_float:
560551
; CHECK: # %bb.0:
@@ -598,28 +589,26 @@ define void @two_float_unaligned(ptr %p, ptr %q) {
598589
; CHECK-NEXT: .cfi_def_cfa_offset 32
599590
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
600591
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
592+
; CHECK-NEXT: fsd fs0, 8(sp) # 8-byte Folded Spill
593+
; CHECK-NEXT: fsd fs1, 0(sp) # 8-byte Folded Spill
601594
; CHECK-NEXT: .cfi_offset ra, -8
602595
; CHECK-NEXT: .cfi_offset s0, -16
603-
; CHECK-NEXT: csrr a2, vlenb
604-
; CHECK-NEXT: sub sp, sp, a2
605-
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 1 * vlenb
606-
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
607-
; CHECK-NEXT: vle32.v v8, (a0)
608-
; CHECK-NEXT: addi a0, sp, 16
609-
; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
596+
; CHECK-NEXT: .cfi_offset fs0, -24
597+
; CHECK-NEXT: .cfi_offset fs1, -32
598+
; CHECK-NEXT: flw fs0, 0(a0)
599+
; CHECK-NEXT: flw fs1, 4(a0)
610600
; CHECK-NEXT: mv s0, a1
611601
; CHECK-NEXT: call g
612-
; CHECK-NEXT: addi a0, sp, 16
613-
; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
614-
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
615-
; CHECK-NEXT: vse32.v v8, (s0)
616-
; CHECK-NEXT: csrr a0, vlenb
617-
; CHECK-NEXT: add sp, sp, a0
618-
; CHECK-NEXT: .cfi_def_cfa sp, 32
602+
; CHECK-NEXT: fsw fs0, 0(s0)
603+
; CHECK-NEXT: fsw fs1, 4(s0)
619604
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
620605
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
606+
; CHECK-NEXT: fld fs0, 8(sp) # 8-byte Folded Reload
607+
; CHECK-NEXT: fld fs1, 0(sp) # 8-byte Folded Reload
621608
; CHECK-NEXT: .cfi_restore ra
622609
; CHECK-NEXT: .cfi_restore s0
610+
; CHECK-NEXT: .cfi_restore fs0
611+
; CHECK-NEXT: .cfi_restore fs1
623612
; CHECK-NEXT: addi sp, sp, 32
624613
; CHECK-NEXT: .cfi_def_cfa_offset 0
625614
; CHECK-NEXT: ret
@@ -679,28 +668,26 @@ define void @two_double(ptr %p, ptr %q) {
679668
; CHECK-NEXT: .cfi_def_cfa_offset 32
680669
; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
681670
; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
671+
; CHECK-NEXT: fsd fs0, 8(sp) # 8-byte Folded Spill
672+
; CHECK-NEXT: fsd fs1, 0(sp) # 8-byte Folded Spill
682673
; CHECK-NEXT: .cfi_offset ra, -8
683674
; CHECK-NEXT: .cfi_offset s0, -16
684-
; CHECK-NEXT: csrr a2, vlenb
685-
; CHECK-NEXT: sub sp, sp, a2
686-
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 1 * vlenb
687-
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
688-
; CHECK-NEXT: vle64.v v8, (a0)
689-
; CHECK-NEXT: addi a0, sp, 16
690-
; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
675+
; CHECK-NEXT: .cfi_offset fs0, -24
676+
; CHECK-NEXT: .cfi_offset fs1, -32
677+
; CHECK-NEXT: fld fs0, 0(a0)
678+
; CHECK-NEXT: fld fs1, 8(a0)
691679
; CHECK-NEXT: mv s0, a1
692680
; CHECK-NEXT: call g
693-
; CHECK-NEXT: addi a0, sp, 16
694-
; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
695-
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
696-
; CHECK-NEXT: vse64.v v8, (s0)
697-
; CHECK-NEXT: csrr a0, vlenb
698-
; CHECK-NEXT: add sp, sp, a0
699-
; CHECK-NEXT: .cfi_def_cfa sp, 32
681+
; CHECK-NEXT: fsd fs0, 0(s0)
682+
; CHECK-NEXT: fsd fs1, 8(s0)
700683
; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
701684
; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
685+
; CHECK-NEXT: fld fs0, 8(sp) # 8-byte Folded Reload
686+
; CHECK-NEXT: fld fs1, 0(sp) # 8-byte Folded Reload
702687
; CHECK-NEXT: .cfi_restore ra
703688
; CHECK-NEXT: .cfi_restore s0
689+
; CHECK-NEXT: .cfi_restore fs0
690+
; CHECK-NEXT: .cfi_restore fs1
704691
; CHECK-NEXT: addi sp, sp, 32
705692
; CHECK-NEXT: .cfi_def_cfa_offset 0
706693
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)