@@ -932,10 +932,11 @@ struct numa_group {
 	spinlock_t lock; /* nr_tasks, tasks */
 	int nr_tasks;
 	pid_t gid;
+	int active_nodes;
 
 	struct rcu_head rcu;
-	nodemask_t active_nodes;
 	unsigned long total_faults;
+	unsigned long max_faults_cpu;
 	/*
 	 * Faults_cpu is used to decide whether memory should move
 	 * towards the CPU. As a consequence, these stats are weighted
@@ -994,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 					int maxdist, bool task)
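The helper above replaces the per-group nodemask with a threshold test against the busiest node's CPU fault count. As a rough userspace illustration (not kernel code; a made-up faults_cpu array and max_faults_cpu value stand in for group_faults_cpu() and ng->max_faults_cpu), the 1/3 cutoff could be exercised like this:

#include <stdbool.h>
#include <stdio.h>

#define ACTIVE_NODE_FRACTION 3

/* Hypothetical per-node CPU fault counts for a four-node numa group. */
static const unsigned long faults_cpu[4] = { 900, 350, 250, 40 };
static const unsigned long max_faults_cpu = 900;

/* Same test as numa_is_active_node(), applied to the made-up array above. */
static bool is_active_node(int nid)
{
	/* Multiply instead of dividing: active means more than 1/3 of the busiest node. */
	return faults_cpu[nid] * ACTIVE_NODE_FRACTION > max_faults_cpu;
}

int main(void)
{
	for (int nid = 0; nid < 4; nid++)
		printf("node %d active: %d\n", nid, is_active_node(nid));
	/* Nodes 0 and 1 qualify; node 2 (250 * 3 = 750 <= 900) and node 3 do not. */
	return 0;
}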
@@ -1143,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 		return true;
 
 	/*
-	 * Do not migrate if the destination is not a node that
-	 * is actively used by this numa group.
+	 * Destination node is much more heavily used than the source
+	 * node? Allow migration.
 	 */
-	if (!node_isset(dst_nid, ng->active_nodes))
-		return false;
-
-	/*
-	 * Source is a node that is not actively used by this
-	 * numa group, while the destination is. Migrate.
-	 */
-	if (!node_isset(src_nid, ng->active_nodes))
+	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+					ACTIVE_NODE_FRACTION)
 		return true;
 
 	/*
-	 * Both source and destination are nodes in active
-	 * use by this numa group. Maximize memory bandwidth
-	 * by migrating from more heavily used groups, to less
-	 * heavily used ones, spreading the load around.
-	 * Use a 1/4 hysteresis to avoid spurious page movement.
+	 * Distribute memory according to CPU & memory use on each node,
+	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
+	 *
+	 * faults_cpu(dst)   3   faults_cpu(src)
+	 * --------------- * - > ---------------
+	 * faults_mem(dst)   4   faults_mem(src)
 	 */
-	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
 static unsigned long weighted_cpuload(const int cpu);
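The new return statement cross-multiplies the two ratios from the comment, so the check needs no division and a zero fault count cannot cause a divide error. A minimal standalone sketch, assuming invented per-node counters in place of group_faults_cpu() and group_faults(), shows the 3/4 hysteresis at work:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-node counters: group CPU faults and task memory faults. */
struct node_stats {
	unsigned long faults_cpu;	/* stands in for group_faults_cpu() */
	unsigned long faults_mem;	/* stands in for group_faults()     */
};

static bool prefer_dst(struct node_stats src, struct node_stats dst)
{
	/*
	 * Cross-multiplied form of
	 *   faults_cpu(dst) / faults_mem(dst) * 3/4 > faults_cpu(src) / faults_mem(src)
	 * so no division is performed.
	 */
	return dst.faults_cpu * src.faults_mem * 3 >
	       src.faults_cpu * dst.faults_mem * 4;
}

int main(void)
{
	struct node_stats src = { .faults_cpu = 400, .faults_mem = 800 };
	struct node_stats dst = { .faults_cpu = 600, .faults_mem = 400 };

	/* 600 * 800 * 3 = 1440000 > 400 * 400 * 4 = 640000, so migration is allowed. */
	printf("migrate to dst: %d\n", prefer_dst(src, dst));
	return 0;
}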
@@ -1509,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
 
 		.best_task = NULL,
 		.best_imp = 0,
-		.best_cpu = -1
+		.best_cpu = -1,
 	};
 	struct sched_domain *sd;
 	unsigned long taskweight, groupweight;
@@ -1561,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 * multiple NUMA nodes; in order to better consolidate the group,
 	 * we need to check other locations.
 	 */
-	if (env.best_cpu == -1 || (p->numa_group &&
-			nodes_weight(p->numa_group->active_nodes) > 1)) {
+	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
 		for_each_online_node(nid) {
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
@@ -1597,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
 	 * trying for a better one later. Do not set the preferred node here.
 	 */
 	if (p->numa_group) {
+		struct numa_group *ng = p->numa_group;
+
 		if (env.best_cpu == -1)
 			nid = env.src_nid;
 		else
 			nid = env.dst_nid;
 
-		if (node_isset(nid, p->numa_group->active_nodes))
+		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
 			sched_setnuma(p, env.dst_nid);
 	}
 
@@ -1652,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes the workload is actively running on. Do this by
  * tracking the nodes from which NUMA hinting faults are triggered. This can
  * be different from the set of nodes where the workload's memory is currently
  * located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations. To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
  */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
 {
 	unsigned long faults, max_faults = 0;
-	int nid;
+	int nid, active_nodes = 0;
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
@@ -1675,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
-		if (!node_isset(nid, numa_group->active_nodes)) {
-			if (faults > max_faults * 6 / 16)
-				node_set(nid, numa_group->active_nodes);
-		} else if (faults < max_faults * 3 / 16)
-			node_clear(nid, numa_group->active_nodes);
+		if (faults * ACTIVE_NODE_FRACTION > max_faults)
+			active_nodes++;
 	}
+
+	numa_group->max_faults_cpu = max_faults;
+	numa_group->active_nodes = active_nodes;
 }
 
 /*
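The single 1/3-of-maximum cutoff replaces the old 6/16 add and 3/16 remove hysteresis. A self-contained sketch with an invented four-node fault array (not the kernel's group_faults_cpu() data) walks through the same max-then-count passes:

#include <stdio.h>

#define ACTIVE_NODE_FRACTION 3
#define NR_NODES 4

int main(void)
{
	/* Hypothetical per-node CPU fault counts for one numa group. */
	unsigned long faults_cpu[NR_NODES] = { 120, 900, 400, 10 };
	unsigned long faults, max_faults = 0;
	int nid, active_nodes = 0;

	/* Pass 1: find the maximum fault count across nodes. */
	for (nid = 0; nid < NR_NODES; nid++) {
		faults = faults_cpu[nid];
		if (faults > max_faults)
			max_faults = faults;
	}

	/* Pass 2: count nodes with more than 1/3 of that maximum. */
	for (nid = 0; nid < NR_NODES; nid++) {
		faults = faults_cpu[nid];
		if (faults * ACTIVE_NODE_FRACTION > max_faults)
			active_nodes++;
	}

	/* Nodes 1 (900) and 2 (400 * 3 = 1200 > 900) count, so active_nodes == 2. */
	printf("max_faults=%lu active_nodes=%d\n", max_faults, active_nodes);
	return 0;
}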
@@ -1971,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
 	update_task_scan_period(p, fault_types[0], fault_types[1]);
 
 	if (p->numa_group) {
-		update_numa_active_node_mask(p->numa_group);
+		numa_group_count_active_nodes(p->numa_group);
 		spin_unlock_irq(group_lock);
 		max_nid = preferred_group_nid(p, max_group_nid);
 	}
@@ -2015,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 			return;
 
 		atomic_set(&grp->refcount, 1);
+		grp->active_nodes = 1;
+		grp->max_faults_cpu = 0;
 		spin_lock_init(&grp->lock);
 		grp->gid = p->pid;
 		/* Second half of the array tracks nids where faults happen */
 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
 						nr_node_ids;
 
-		node_set(task_node(current), grp->active_nodes);
-
 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 			grp->faults[i] = p->numa_faults[i];
 
@@ -2136,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	bool migrated = flags & TNF_MIGRATED;
 	int cpu_node = task_node(current);
 	int local = !!(flags & TNF_FAULT_LOCAL);
+	struct numa_group *ng;
 	int priv;
 
 	if (!static_branch_likely(&sched_numa_balancing))
@@ -2176,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	 * actively using should be counted as local. This allows the
 	 * scan rate to slow down when a workload has settled down.
 	 */
-	if (!priv && !local && p->numa_group &&
-			node_isset(cpu_node, p->numa_group->active_nodes) &&
-			node_isset(mem_node, p->numa_group->active_nodes))
+	ng = p->numa_group;
+	if (!priv && !local && ng && ng->active_nodes > 1 &&
+	    numa_is_active_node(cpu_node, ng) &&
+	    numa_is_active_node(mem_node, ng))
 		local = 1;
 
 	task_numa_placement(p);
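With the nodemask gone, a shared remote fault only counts as local when the group spans more than one active node and both the CPU node and the memory node are in the pseudo-interleaving set. The standalone sketch below restates that condition with a hypothetical group struct in place of struct numa_group:

#include <stdbool.h>
#include <stdio.h>

#define ACTIVE_NODE_FRACTION 3

/* Hypothetical group state: per-node CPU faults plus the cached summaries. */
struct group {
	unsigned long faults_cpu[4];
	unsigned long max_faults_cpu;
	int active_nodes;
};

static bool is_active_node(const struct group *ng, int nid)
{
	return ng->faults_cpu[nid] * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
}

/* A shared, remote fault is treated as local when the group interleaves over
 * several nodes and both the CPU node and the memory node are in that set. */
static bool count_as_local(const struct group *ng, int cpu_node, int mem_node)
{
	return ng->active_nodes > 1 &&
	       is_active_node(ng, cpu_node) && is_active_node(ng, mem_node);
}

int main(void)
{
	struct group ng = {
		.faults_cpu = { 800, 500, 60, 20 },
		.max_faults_cpu = 800,
		.active_nodes = 2,
	};

	printf("%d\n", count_as_local(&ng, 0, 1));	/* 1: both nodes active */
	printf("%d\n", count_as_local(&ng, 0, 2));	/* 0: node 2 inactive   */
	return 0;
}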