Skip to content

Commit 5578ec3

Browse files
committed
[MCA] Fixed a bug where loads and stores were sometimes incorrectly marked as depedent. Fixes PR45793.
This fixes a regression introduced by a very old commit 280ac1f (was llvm-svn 361950). Commit 280ac1f redesigned the logic in the LSUnit with the goal of speeding up isReady() queries, and stabilising the LSUnit API (while also making the load store unit more customisable). The concept of MemoryGroup (effectively an alias set) was added by that commit to better describe and track dependencies between memory operations. However, that concept was not just used for alias dependencies, but it was also used for describing memory "order" dependencies (enforced by the memory consistency model). Instructions of a same memory group were considered "equivalent" as in: independent operations that can potentially execute in parallel. The problem was that the cost of a dependency (in terms of number of cycles) should have been different for "order" dependency. Instructions in an order dependency simply have to have to wait until their predecessors are "issued" to an underlying pipeline (rather than having to wait until predecessors have beeng fully executed). For simple "order" dependencies, this was effectively introducing an artificial delay on the "issue" of independent loads and stores. This patch fixes the issue and adds a new test named 'independent-load-stores.s' to a bunch of x86 targets. That test contains the reproducible posted by Fabian Ritter on PR45793. I had to rerun the update-mca-tests script on several files. To avoid expected regressions on some Exynos tests, I have added a -noalias=false flag (to match the old strict behavior on latencies). Some tests for processor Barcelona are improved/fixed by this change and they now show better results. In a few tests we were incorrectly counting the time spent by instructions in a scheduler queue. In one case in particular we now correctly see a store executed out of order. That test was affected by the same underlying issue reported as PR45793. Reviewers: mattd Differential Revision: https://reviews.llvm.org/D79351
1 parent 08032e7 commit 5578ec3

File tree

18 files changed

+974
-339
lines changed

18 files changed

+974
-339
lines changed

llvm/include/llvm/MCA/HardwareUnits/LSUnit.h

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,10 @@ class MemoryGroup {
4040
unsigned NumInstructions;
4141
unsigned NumExecuting;
4242
unsigned NumExecuted;
43-
SmallVector<MemoryGroup *, 4> Succ;
43+
// Successors that are in a order dependency with this group.
44+
SmallVector<MemoryGroup *, 4> OrderSucc;
45+
// Successors that are in a data dependency with this group.
46+
SmallVector<MemoryGroup *, 4> DataSucc;
4447

4548
CriticalDependency CriticalPredecessor;
4649
InstRef CriticalMemoryInstruction;
@@ -55,8 +58,9 @@ class MemoryGroup {
5558
NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
5659
MemoryGroup(MemoryGroup &&) = default;
5760

58-
ArrayRef<MemoryGroup *> getSuccessors() const { return Succ; }
59-
unsigned getNumSuccessors() const { return Succ.size(); }
61+
size_t getNumSuccessors() const {
62+
return OrderSucc.size() + DataSucc.size();
63+
}
6064
unsigned getNumPredecessors() const { return NumPredecessors; }
6165
unsigned getNumExecutingPredecessors() const {
6266
return NumExecutingPredecessors;
@@ -75,12 +79,22 @@ class MemoryGroup {
7579
return CriticalPredecessor;
7680
}
7781

78-
void addSuccessor(MemoryGroup *Group) {
82+
void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
83+
// Do not need to add a dependency if there is no data
84+
// dependency and all instructions from this group have been
85+
// issued already.
86+
if (!IsDataDependent && isExecuting())
87+
return;
88+
7989
Group->NumPredecessors++;
8090
assert(!isExecuted() && "Should have been removed!");
8191
if (isExecuting())
82-
Group->onGroupIssued(CriticalMemoryInstruction);
83-
Succ.emplace_back(Group);
92+
Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);
93+
94+
if (IsDataDependent)
95+
DataSucc.emplace_back(Group);
96+
else
97+
OrderSucc.emplace_back(Group);
8498
}
8599

86100
bool isWaiting() const {
@@ -98,10 +112,13 @@ class MemoryGroup {
98112
}
99113
bool isExecuted() const { return NumInstructions == NumExecuted; }
100114

101-
void onGroupIssued(const InstRef &IR) {
115+
void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
102116
assert(!isReady() && "Unexpected group-start event!");
103117
NumExecutingPredecessors++;
104118

119+
if (!ShouldUpdateCriticalDep)
120+
return;
121+
105122
unsigned Cycles = IR.getInstruction()->getCyclesLeft();
106123
if (CriticalPredecessor.Cycles < Cycles) {
107124
CriticalPredecessor.IID = IR.getSourceIndex();
@@ -133,8 +150,14 @@ class MemoryGroup {
133150
return;
134151

135152
// Notify successors that this group started execution.
136-
for (MemoryGroup *MG : Succ)
137-
MG->onGroupIssued(CriticalMemoryInstruction);
153+
for (MemoryGroup *MG : OrderSucc) {
154+
MG->onGroupIssued(CriticalMemoryInstruction, false);
155+
// Release the order dependency with this group.
156+
MG->onGroupExecuted();
157+
}
158+
159+
for (MemoryGroup *MG : DataSucc)
160+
MG->onGroupIssued(CriticalMemoryInstruction, true);
138161
}
139162

140163
void onInstructionExecuted() {
@@ -145,8 +168,8 @@ class MemoryGroup {
145168
if (!isExecuted())
146169
return;
147170

148-
// Notify successors that this group has finished execution.
149-
for (MemoryGroup *MG : Succ)
171+
// Notify data dependent successors that this group has finished execution.
172+
for (MemoryGroup *MG : DataSucc)
150173
MG->onGroupExecuted();
151174
}
152175

@@ -412,6 +435,7 @@ class LSUnit : public LSUnitBase {
412435
unsigned CurrentLoadGroupID;
413436
unsigned CurrentLoadBarrierGroupID;
414437
unsigned CurrentStoreGroupID;
438+
unsigned CurrentStoreBarrierGroupID;
415439

416440
public:
417441
LSUnit(const MCSchedModel &SM)
@@ -420,7 +444,8 @@ class LSUnit : public LSUnitBase {
420444
: LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
421445
LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
422446
: LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
423-
CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0) {}
447+
CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
448+
CurrentStoreBarrierGroupID(0) {}
424449

425450
/// Returns LSU_AVAILABLE if there are enough load/store queue entries to
426451
/// accomodate instruction IR.

llvm/lib/MCA/HardwareUnits/LSUnit.cpp

Lines changed: 63 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,6 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
7777
acquireSQSlot();
7878

7979
if (Desc.MayStore) {
80-
// Always create a new group for store operations.
81-
82-
// A store may not pass a previous store or store barrier.
8380
unsigned NewGID = createMemoryGroup();
8481
MemoryGroup &NewGroup = getGroup(NewGID);
8582
NewGroup.addInstruction();
@@ -91,16 +88,32 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
9188
MemoryGroup &IDom = getGroup(ImmediateLoadDominator);
9289
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << ImmediateLoadDominator
9390
<< ") --> (" << NewGID << ")\n");
94-
IDom.addSuccessor(&NewGroup);
91+
IDom.addSuccessor(&NewGroup, !assumeNoAlias());
92+
}
93+
94+
// A store may not pass a previous store barrier.
95+
if (CurrentStoreBarrierGroupID) {
96+
MemoryGroup &StoreGroup = getGroup(CurrentStoreBarrierGroupID);
97+
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: ("
98+
<< CurrentStoreBarrierGroupID
99+
<< ") --> (" << NewGID << ")\n");
100+
StoreGroup.addSuccessor(&NewGroup, true);
95101
}
96-
if (CurrentStoreGroupID) {
102+
103+
// A store may not pass a previous store.
104+
if (CurrentStoreGroupID &&
105+
(CurrentStoreGroupID != CurrentStoreBarrierGroupID)) {
97106
MemoryGroup &StoreGroup = getGroup(CurrentStoreGroupID);
98107
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << CurrentStoreGroupID
99108
<< ") --> (" << NewGID << ")\n");
100-
StoreGroup.addSuccessor(&NewGroup);
109+
StoreGroup.addSuccessor(&NewGroup, !assumeNoAlias());
101110
}
102111

112+
103113
CurrentStoreGroupID = NewGID;
114+
if (IsMemBarrier)
115+
CurrentStoreBarrierGroupID = NewGID;
116+
104117
if (Desc.MayLoad) {
105118
CurrentLoadGroupID = NewGID;
106119
if (IsMemBarrier)
@@ -112,31 +125,59 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
112125

113126
assert(Desc.MayLoad && "Expected a load!");
114127

115-
// Always create a new memory group if this is the first load of the sequence.
128+
unsigned ImmediateLoadDominator =
129+
std::max(CurrentLoadGroupID, CurrentLoadBarrierGroupID);
130+
131+
// A new load group is created if we are in one of the following situations:
132+
// 1) This is a load barrier (by construction, a load barrier is always
133+
// assigned to a different memory group).
134+
// 2) There is no load in flight (by construction we always keep loads and
135+
// stores into separate memory groups).
136+
// 3) There is a load barrier in flight. This load depends on it.
137+
// 4) There is an intervening store between the last load dispatched to the
138+
// LSU and this load. We always create a new group even if this load
139+
// does not alias the last dispatched store.
140+
// 5) There is no intervening store and there is an active load group.
141+
// However that group has already started execution, so we cannot add
142+
// this load to it.
143+
bool ShouldCreateANewGroup =
144+
IsMemBarrier || !ImmediateLoadDominator ||
145+
CurrentLoadBarrierGroupID == ImmediateLoadDominator ||
146+
ImmediateLoadDominator <= CurrentStoreGroupID ||
147+
getGroup(ImmediateLoadDominator).isExecuting();
116148

117-
// A load may not pass a previous store unless flag 'NoAlias' is set.
118-
// A load may pass a previous load.
119-
// A younger load cannot pass a older load barrier.
120-
// A load barrier cannot pass a older load.
121-
bool ShouldCreateANewGroup = !CurrentLoadGroupID || IsMemBarrier ||
122-
CurrentLoadGroupID <= CurrentStoreGroupID ||
123-
CurrentLoadGroupID <= CurrentLoadBarrierGroupID;
124149
if (ShouldCreateANewGroup) {
125150
unsigned NewGID = createMemoryGroup();
126151
MemoryGroup &NewGroup = getGroup(NewGID);
127152
NewGroup.addInstruction();
128153

154+
// A load may not pass a previous store or store barrier
155+
// unless flag 'NoAlias' is set.
129156
if (!assumeNoAlias() && CurrentStoreGroupID) {
130-
MemoryGroup &StGroup = getGroup(CurrentStoreGroupID);
157+
MemoryGroup &StoreGroup = getGroup(CurrentStoreGroupID);
131158
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << CurrentStoreGroupID
132159
<< ") --> (" << NewGID << ")\n");
133-
StGroup.addSuccessor(&NewGroup);
160+
StoreGroup.addSuccessor(&NewGroup, true);
134161
}
135-
if (CurrentLoadBarrierGroupID) {
136-
MemoryGroup &LdGroup = getGroup(CurrentLoadBarrierGroupID);
137-
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << CurrentLoadBarrierGroupID
138-
<< ") --> (" << NewGID << ")\n");
139-
LdGroup.addSuccessor(&NewGroup);
162+
163+
// A load barrier may not pass a previous load or load barrier.
164+
if (IsMemBarrier) {
165+
if (ImmediateLoadDominator) {
166+
MemoryGroup &LoadGroup = getGroup(ImmediateLoadDominator);
167+
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: ("
168+
<< ImmediateLoadDominator
169+
<< ") --> (" << NewGID << ")\n");
170+
LoadGroup.addSuccessor(&NewGroup, true);
171+
}
172+
} else {
173+
// A younger load cannot pass a older load barrier.
174+
if (CurrentLoadBarrierGroupID) {
175+
MemoryGroup &LoadGroup = getGroup(CurrentLoadBarrierGroupID);
176+
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: ("
177+
<< CurrentLoadBarrierGroupID
178+
<< ") --> (" << NewGID << ")\n");
179+
LoadGroup.addSuccessor(&NewGroup, true);
180+
}
140181
}
141182

142183
CurrentLoadGroupID = NewGID;
@@ -145,6 +186,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
145186
return NewGID;
146187
}
147188

189+
// A load may pass a previous load.
148190
MemoryGroup &Group = getGroup(CurrentLoadGroupID);
149191
Group.addInstruction();
150192
return CurrentLoadGroupID;

llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st1.s

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2-
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
3-
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4
4-
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5
2+
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3
3+
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4
4+
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5
55

66
st1 {v0.s}[0], [sp]
77
st1 {v0.2s}, [sp]

llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st2.s

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2-
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
3-
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4
4-
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5
2+
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3
3+
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4
4+
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5
55

66
st2 {v0.s, v1.s}[0], [sp]
77
st2 {v0.2s, v1.2s}, [sp]

llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st3.s

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2-
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
3-
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4
4-
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5
2+
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3
3+
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4
4+
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5
55

66
st3 {v0.s, v1.s, v2.s}[0], [sp]
77
st3 {v0.2s, v1.2s, v2.2s}, [sp]

llvm/test/tools/llvm-mca/AArch64/Exynos/asimd-st4.s

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2-
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
3-
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4
4-
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5
2+
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3
3+
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4
4+
# RUN: llvm-mca -mtriple=aarch64-linux-gnu -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5
55

66
st4 {v0.s, v1.s, v2.s, v3.s}[0], [sp]
77
st4 {v0.2s, v1.2s, v2.2s, v3.2s}, [sp]

llvm/test/tools/llvm-mca/AArch64/Exynos/float-store.s

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2-
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
3-
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4
4-
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5
2+
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3
3+
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4
4+
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5
55

66
stur d0, [sp, #2]
77
stur q0, [sp, #16]

llvm/test/tools/llvm-mca/AArch64/Exynos/store.s

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2-
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M3
3-
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M4
4-
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false < %s | FileCheck %s -check-prefixes=ALL,M5
2+
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m3 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M3
3+
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m4 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M4
4+
# RUN: llvm-mca -march=aarch64 -mcpu=exynos-m5 -resource-pressure=false -noalias=false < %s | FileCheck %s -check-prefixes=ALL,M5
55

66
stur x0, [sp, #8]
77
strb w0, [sp], #1

0 commit comments

Comments
 (0)