
Commit 4142c3e

Rik van Riel authored and Ingo Molnar committed
sched/numa: Spread memory according to CPU and memory use
The pseudo-interleaving in NUMA placement has a fundamental problem: using hard usage thresholds to spread memory equally between nodes can prevent workloads from converging, or keep memory "trapped" on nodes where the workload is barely running any more.

In order for workloads to properly converge, the memory migration should not be stopped when nodes reach parity, but instead be distributed according to how heavily memory is used from each node. This way memory migration and task migration reinforce each other, instead of one putting the brakes on the other.

Remove the hard thresholds from the pseudo-interleaving code, and instead use a more gradual policy on memory placement. This also seems to improve convergence of workloads that do not run flat out, but sleep in between bursts of activity.

We still want to slow down NUMA scanning and migration once a workload has settled on a few actively used nodes, so keep the 3/4 hysteresis in place.

Keep track of whether a workload is actively running on multiple nodes, so task_numa_migrate does a full scan of the system for better task placement.

In the case of running 3 SPECjbb2005 instances on a 4 node system, this code seems to result in fairer distribution of memory between nodes, with more memory bandwidth for each instance.

Signed-off-by: Rik van Riel <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
[ Minor readability tweaks. ]
Signed-off-by: Ingo Molnar <[email protected]>
1 parent cb25176 commit 4142c3e
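To make the new policy concrete, here is a minimal stand-alone C sketch of the cross-multiplied 3/4 hysteresis check the patch introduces in should_numa_migrate_memory(). It is not kernel code: the helper name should_migrate() and the sample fault counts are made up for illustration; in the kernel the four inputs come from group_faults_cpu() and group_faults().

/*
 * Stand-alone sketch, not kernel code: the new migration test compares each
 * node's CPU-use/memory-use ratio, with a 3/4 hysteresis against the source.
 */
#include <stdbool.h>
#include <stdio.h>

static bool should_migrate(unsigned long cpu_dst, unsigned long mem_dst,
			   unsigned long cpu_src, unsigned long mem_src)
{
	/*
	 * faults_cpu(dst)   3   faults_cpu(src)
	 * --------------- * - > ---------------
	 * faults_mem(dst)   4   faults_mem(src)
	 * written cross-multiplied to stay in integer arithmetic.
	 */
	return cpu_dst * mem_src * 3 > cpu_src * mem_dst * 4;
}

int main(void)
{
	/* dst runs much of the workload but holds little of its memory: migrate */
	printf("%d\n", should_migrate(40, 20, 30, 40));	/* prints 1 */
	/* roughly balanced ratios stay put because of the 3/4 hysteresis */
	printf("%d\n", should_migrate(25, 25, 25, 25));	/* prints 0 */
	return 0;
}

With balanced CPU/memory ratios the check stays false, so pages are not bounced back and forth; only a clearly higher ratio on the destination node triggers migration.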

File tree

1 file changed: +47, -40 lines

kernel/sched/fair.c

Lines changed: 47 additions & 40 deletions
@@ -932,10 +932,11 @@ struct numa_group {
 	spinlock_t lock; /* nr_tasks, tasks */
 	int nr_tasks;
 	pid_t gid;
+	int active_nodes;
 
 	struct rcu_head rcu;
-	nodemask_t active_nodes;
 	unsigned long total_faults;
+	unsigned long max_faults_cpu;
 	/*
 	 * Faults_cpu is used to decide whether memory should move
 	 * towards the CPU. As a consequence, these stats are weighted
@@ -994,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 					int maxdist, bool task)
@@ -1143,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 		return true;
 
 	/*
-	 * Do not migrate if the destination is not a node that
-	 * is actively used by this numa group.
+	 * Destination node is much more heavily used than the source
+	 * node? Allow migration.
 	 */
-	if (!node_isset(dst_nid, ng->active_nodes))
-		return false;
-
-	/*
-	 * Source is a node that is not actively used by this
-	 * numa group, while the destination is. Migrate.
-	 */
-	if (!node_isset(src_nid, ng->active_nodes))
+	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+					ACTIVE_NODE_FRACTION)
 		return true;
 
 	/*
-	 * Both source and destination are nodes in active
-	 * use by this numa group. Maximize memory bandwidth
-	 * by migrating from more heavily used groups, to less
-	 * heavily used ones, spreading the load around.
-	 * Use a 1/4 hysteresis to avoid spurious page movement.
+	 * Distribute memory according to CPU & memory use on each node,
+	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
+	 *
+	 * faults_cpu(dst)   3   faults_cpu(src)
+	 * --------------- * - > ---------------
+	 * faults_mem(dst)   4   faults_mem(src)
 	 */
-	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
 static unsigned long weighted_cpuload(const int cpu);
@@ -1509,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
 
 		.best_task = NULL,
 		.best_imp = 0,
-		.best_cpu = -1
+		.best_cpu = -1,
 	};
 	struct sched_domain *sd;
 	unsigned long taskweight, groupweight;
@@ -1561,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 * multiple NUMA nodes; in order to better consolidate the group,
 	 * we need to check other locations.
 	 */
-	if (env.best_cpu == -1 || (p->numa_group &&
-			nodes_weight(p->numa_group->active_nodes) > 1)) {
+	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
 		for_each_online_node(nid) {
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
@@ -1597,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
 	 * trying for a better one later. Do not set the preferred node here.
 	 */
 	if (p->numa_group) {
+		struct numa_group *ng = p->numa_group;
+
 		if (env.best_cpu == -1)
 			nid = env.src_nid;
 		else
 			nid = env.dst_nid;
 
-		if (node_isset(nid, p->numa_group->active_nodes))
+		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
 			sched_setnuma(p, env.dst_nid);
 	}
 
@@ -1652,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes on the workload is actively running on. Do this by
  * tracking the nodes from which NUMA hinting faults are triggered. This can
  * be different from the set of nodes where the workload's memory is currently
  * located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
  */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
 {
 	unsigned long faults, max_faults = 0;
-	int nid;
+	int nid, active_nodes = 0;
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
@@ -1675,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
-		if (!node_isset(nid, numa_group->active_nodes)) {
-			if (faults > max_faults * 6 / 16)
-				node_set(nid, numa_group->active_nodes);
-		} else if (faults < max_faults * 3 / 16)
-			node_clear(nid, numa_group->active_nodes);
+		if (faults * ACTIVE_NODE_FRACTION > max_faults)
+			active_nodes++;
 	}
+
+	numa_group->max_faults_cpu = max_faults;
+	numa_group->active_nodes = active_nodes;
 }
 
 /*
@@ -1971,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
 	update_task_scan_period(p, fault_types[0], fault_types[1]);
 
 	if (p->numa_group) {
-		update_numa_active_node_mask(p->numa_group);
+		numa_group_count_active_nodes(p->numa_group);
 		spin_unlock_irq(group_lock);
 		max_nid = preferred_group_nid(p, max_group_nid);
 	}
@@ -2015,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 			return;
 
 		atomic_set(&grp->refcount, 1);
+		grp->active_nodes = 1;
+		grp->max_faults_cpu = 0;
 		spin_lock_init(&grp->lock);
 		grp->gid = p->pid;
 		/* Second half of the array tracks nids where faults happen */
 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
 						nr_node_ids;
 
-		node_set(task_node(current), grp->active_nodes);
-
 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 			grp->faults[i] = p->numa_faults[i];
 
@@ -2136,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	bool migrated = flags & TNF_MIGRATED;
 	int cpu_node = task_node(current);
 	int local = !!(flags & TNF_FAULT_LOCAL);
+	struct numa_group *ng;
 	int priv;
 
 	if (!static_branch_likely(&sched_numa_balancing))
@@ -2176,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	 * actively using should be counted as local. This allows the
 	 * scan rate to slow down when a workload has settled down.
 	 */
-	if (!priv && !local && p->numa_group &&
-	    node_isset(cpu_node, p->numa_group->active_nodes) &&
-	    node_isset(mem_node, p->numa_group->active_nodes))
+	ng = p->numa_group;
+	if (!priv && !local && ng && ng->active_nodes > 1 &&
+	    numa_is_active_node(cpu_node, ng) &&
+	    numa_is_active_node(mem_node, ng))
 		local = 1;
 
 	task_numa_placement(p);
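
To see how the replacement for the old 6/16 and 3/16 node-mask thresholds behaves, here is a small user-space sketch of the counting loop with assumed per-node fault numbers (the faults_cpu array, its values, and nr_nodes are illustrative; the kernel iterates online nodes and stores the results in the numa_group):

/*
 * Illustrative sketch, not kernel code: a node counts as active when it
 * triggers more than 1/3 of the maximum per-node CPU fault count.
 */
#include <stdio.h>

#define ACTIVE_NODE_FRACTION 3

int main(void)
{
	/* hypothetical NUMA hinting faults triggered from CPUs on each node */
	unsigned long faults_cpu[] = { 900, 400, 250, 10 };
	unsigned long max_faults = 0, faults;
	int nid, active_nodes = 0, nr_nodes = 4;

	for (nid = 0; nid < nr_nodes; nid++)
		if (faults_cpu[nid] > max_faults)
			max_faults = faults_cpu[nid];

	for (nid = 0; nid < nr_nodes; nid++) {
		faults = faults_cpu[nid];
		if (faults * ACTIVE_NODE_FRACTION > max_faults)
			active_nodes++;
	}

	/* nodes 0 and 1 exceed 900/3 = 300 faults; nodes 2 and 3 do not */
	printf("active_nodes = %d\n", active_nodes);	/* prints 2 */
	return 0;
}

Because only the count and the per-node maximum are kept, a node stops counting as active as soon as its share of CPU faults falls below 1/3 of the busiest node, rather than lingering until it drops under the old 3/16 removal threshold.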
