
Commit 06808b0

Lee Schermerhorn authored and torvalds committed
hugetlb: derive huge pages nodes allowed from task mempolicy
This patch derives a "nodes_allowed" node mask from the NUMA mempolicy of
the task modifying the number of persistent huge pages, to control the
allocation, freeing and adjusting of surplus huge pages when the pool page
count is modified via the new sysctl or sysfs attribute
"nr_hugepages_mempolicy".  The nodes_allowed mask is derived as follows:

* For "default" [NULL] task mempolicy, a NULL nodemask_t pointer is
  produced.  This causes the hugetlb subsystem to use node_online_map as
  the "nodes_allowed", preserving the behavior before this patch.
* For "preferred" mempolicy, including explicit local allocation, a
  nodemask with the single preferred node is produced.  "local" policy
  will NOT track any internode migrations of the task adjusting
  nr_hugepages.
* For "bind" and "interleave" policy, the mempolicy's nodemask is used.
* Other than to inform the construction of the nodes_allowed node mask,
  the actual mempolicy mode is ignored.  That is, all modes behave like
  interleave over the resulting nodes_allowed mask with no "fallback".

See the updated documentation [next patch] for more information about the
implications of this patch.

Examples:

Starting with:

	Node 0 HugePages_Total: 0
	Node 1 HugePages_Total: 0
	Node 2 HugePages_Total: 0
	Node 3 HugePages_Total: 0

Default behavior [with or without this patch] balances persistent
hugepage allocation across nodes [with sufficient contiguous memory]:

	sysctl vm.nr_hugepages[_mempolicy]=32

yields:

	Node 0 HugePages_Total: 8
	Node 1 HugePages_Total: 8
	Node 2 HugePages_Total: 8
	Node 3 HugePages_Total: 8

Of course, nr_hugepages_mempolicy exists only with the patch, but under
default mempolicy it behaves the same as nr_hugepages.

Applying mempolicy--e.g., with numactl [using '-m' a.k.a. '--membind'
because it allows multiple nodes to be specified and it's easy to
type]--we can allocate huge pages on individual nodes or sets of nodes.
So, starting from the condition above, with 8 huge pages per node, add 8
more to node 2 using:

	numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40

This yields:

	Node 0 HugePages_Total: 8
	Node 1 HugePages_Total: 8
	Node 2 HugePages_Total: 16
	Node 3 HugePages_Total: 8

The incremental 8 huge pages were restricted to node 2 by the specified
mempolicy.

Similarly, we can use mempolicy to free persistent huge pages from
specified nodes:

	numactl -m 0,1 sysctl vm.nr_hugepages_mempolicy=32

yields:

	Node 0 HugePages_Total: 4
	Node 1 HugePages_Total: 4
	Node 2 HugePages_Total: 16
	Node 3 HugePages_Total: 8

The 8 huge pages freed were balanced over nodes 0 and 1.

[[email protected]: accommodate reworked NODEMASK_ALLOC]
Signed-off-by: David Rientjes <[email protected]>
Signed-off-by: Lee Schermerhorn <[email protected]>
Acked-by: Mel Gorman <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Cc: KAMEZAWA Hiroyuki <[email protected]>
Cc: Randy Dunlap <[email protected]>
Cc: Nishanth Aravamudan <[email protected]>
Cc: Adam Litke <[email protected]>
Cc: Andy Whitcroft <[email protected]>
Cc: Eric Whitney <[email protected]>
Cc: Christoph Lameter <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
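For readers without numactl at hand, here is a hypothetical user-space
equivalent of the "numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40"
example above, sketched in C against libnuma's <numaif.h>.  The node
number (2) and target count (40) simply restate the scenario above;
nothing in this sketch is part of the patch.

	/* Sketch only: build with -lnuma; assumes node 2 exists and CONFIG_NUMA=y. */
	#include <numaif.h>	/* set_mempolicy(), MPOL_BIND */
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		unsigned long nodemask = 1UL << 2;	/* membind to node 2 only */
		FILE *f;

		/* same effect on this task as launching under "numactl -m 2" */
		if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8)) {
			perror("set_mempolicy");
			return EXIT_FAILURE;
		}

		/* grow the pool from 32 to 40; the 8 new pages land on node 2 */
		f = fopen("/proc/sys/vm/nr_hugepages_mempolicy", "w");
		if (!f) {
			perror("fopen");
			return EXIT_FAILURE;
		}
		fprintf(f, "40\n");
		return fclose(f) ? EXIT_FAILURE : EXIT_SUCCESS;
	}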
1 parent c1e6c8d commit 06808b0

File tree: 5 files changed (+153, -15 lines)


include/linux/hugetlb.h

Lines changed: 6 additions & 0 deletions
@@ -23,6 +23,12 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+
+#ifdef CONFIG_NUMA
+int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
+			void __user *, size_t *, loff_t *);
+#endif
+
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			struct page **, struct vm_area_struct **,

include/linux/mempolicy.h

Lines changed: 3 additions & 0 deletions
@@ -201,6 +201,7 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p);
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 				unsigned long addr, gfp_t gfp_flags,
 				struct mempolicy **mpol, nodemask_t **nodemask);
+extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
 extern unsigned slab_node(struct mempolicy *policy);

 extern enum zone_type policy_zone;

@@ -328,6 +329,8 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 	return node_zonelist(0, gfp_flags);
 }

+static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; }
+
 static inline int do_migrate_pages(struct mm_struct *mm,
 			const nodemask_t *from_nodes,
 			const nodemask_t *to_nodes, int flags)

kernel/sysctl.c

Lines changed: 13 additions & 2 deletions
@@ -1051,15 +1051,26 @@ static struct ctl_table vm_table[] = {
 		.extra2		= &one_hundred,
 	},
 #ifdef CONFIG_HUGETLB_PAGE
-	{
+	{
 		.procname	= "nr_hugepages",
 		.data		= NULL,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= hugetlb_sysctl_handler,
 		.extra1		= (void *)&hugetlb_zero,
 		.extra2		= (void *)&hugetlb_infinity,
-	},
+	},
+#ifdef CONFIG_NUMA
+	{
+		.procname	= "nr_hugepages_mempolicy",
+		.data		= NULL,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &hugetlb_mempolicy_sysctl_handler,
+		.extra1		= (void *)&hugetlb_zero,
+		.extra2		= (void *)&hugetlb_infinity,
+	},
+#endif
 	{
 		.procname	= "hugetlb_shm_group",
 		.data		= &sysctl_hugetlb_shm_group,

mm/hugetlb.c

Lines changed: 84 additions & 13 deletions
@@ -1330,29 +1330,71 @@ static struct hstate *kobj_to_hstate(struct kobject *kobj)
 	return NULL;
 }

-static ssize_t nr_hugepages_show(struct kobject *kobj,
+static ssize_t nr_hugepages_show_common(struct kobject *kobj,
 					struct kobj_attribute *attr, char *buf)
 {
 	struct hstate *h = kobj_to_hstate(kobj);
 	return sprintf(buf, "%lu\n", h->nr_huge_pages);
 }
-static ssize_t nr_hugepages_store(struct kobject *kobj,
-		struct kobj_attribute *attr, const char *buf, size_t count)
+static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
+			struct kobject *kobj, struct kobj_attribute *attr,
+			const char *buf, size_t len)
 {
 	int err;
-	unsigned long input;
+	unsigned long count;
 	struct hstate *h = kobj_to_hstate(kobj);
+	NODEMASK_ALLOC(nodemask_t, nodes_allowed);

-	err = strict_strtoul(buf, 10, &input);
+	err = strict_strtoul(buf, 10, &count);
 	if (err)
 		return 0;

-	h->max_huge_pages = set_max_huge_pages(h, input, &node_online_map);
+	if (!(obey_mempolicy && init_nodemask_of_mempolicy(nodes_allowed))) {
+		NODEMASK_FREE(nodes_allowed);
+		nodes_allowed = &node_online_map;
+	}
+	h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);

-	return count;
+	if (nodes_allowed != &node_online_map)
+		NODEMASK_FREE(nodes_allowed);
+
+	return len;
+}
+
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+				       struct kobj_attribute *attr, char *buf)
+{
+	return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+	       struct kobj_attribute *attr, const char *buf, size_t len)
+{
+	return nr_hugepages_store_common(false, kobj, attr, buf, len);
 }
 HSTATE_ATTR(nr_hugepages);

+#ifdef CONFIG_NUMA
+
+/*
+ * hstate attribute for optionally mempolicy-based constraint on persistent
+ * huge page alloc/free.
+ */
+static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
+				       struct kobj_attribute *attr, char *buf)
+{
+	return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
+	       struct kobj_attribute *attr, const char *buf, size_t len)
+{
+	return nr_hugepages_store_common(true, kobj, attr, buf, len);
+}
+HSTATE_ATTR(nr_hugepages_mempolicy);
+#endif
+
+
 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
 					struct kobj_attribute *attr, char *buf)
 {

@@ -1408,6 +1450,9 @@ static struct attribute *hstate_attrs[] = {
 	&free_hugepages_attr.attr,
 	&resv_hugepages_attr.attr,
 	&surplus_hugepages_attr.attr,
+#ifdef CONFIG_NUMA
+	&nr_hugepages_mempolicy_attr.attr,
+#endif
 	NULL,
 };

@@ -1574,9 +1619,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 }

 #ifdef CONFIG_SYSCTL
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
-			   void __user *buffer,
-			   size_t *length, loff_t *ppos)
+static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
+			struct ctl_table *table, int write,
+			void __user *buffer, size_t *length, loff_t *ppos)
 {
 	struct hstate *h = &default_hstate;
 	unsigned long tmp;

@@ -1588,13 +1633,39 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 	table->maxlen = sizeof(unsigned long);
 	proc_doulongvec_minmax(table, write, buffer, length, ppos);

-	if (write)
-		h->max_huge_pages = set_max_huge_pages(h, tmp,
-							&node_online_map);
+	if (write) {
+		NODEMASK_ALLOC(nodemask_t, nodes_allowed);
+		if (!(obey_mempolicy &&
+			       init_nodemask_of_mempolicy(nodes_allowed))) {
+			NODEMASK_FREE(nodes_allowed);
+			nodes_allowed = &node_states[N_HIGH_MEMORY];
+		}
+		h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
+
+		if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+			NODEMASK_FREE(nodes_allowed);
+	}

 	return 0;
 }

+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+			  void __user *buffer, size_t *length, loff_t *ppos)
+{
+
+	return hugetlb_sysctl_handler_common(false, table, write,
+							buffer, length, ppos);
+}
+
+#ifdef CONFIG_NUMA
+int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
+			  void __user *buffer, size_t *length, loff_t *ppos)
+{
+	return hugetlb_sysctl_handler_common(true, table, write,
+							buffer, length, ppos);
+}
+#endif /* CONFIG_NUMA */
+
 int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
 			void __user *buffer,
 			size_t *length, loff_t *ppos)
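Both the sysfs store and the sysctl handler above share one
allocate-or-fallback idiom around NODEMASK_ALLOC.  As a reading aid, here
is that idiom condensed into a single hypothetical helper; demo_set_pool
is an invented name, and the real patch inlines this logic in each
handler rather than factoring it out.

	/* Condensed sketch of the idiom shared by both handlers; not in the patch. */
	static unsigned long demo_set_pool(struct hstate *h, unsigned long count,
					   bool obey_mempolicy)
	{
		NODEMASK_ALLOC(nodemask_t, nodes_allowed);	/* may be heap-allocated */

		/* no task mempolicy, or !obey: drop the buffer, borrow the static map */
		if (!(obey_mempolicy && init_nodemask_of_mempolicy(nodes_allowed))) {
			NODEMASK_FREE(nodes_allowed);
			nodes_allowed = &node_online_map;
		}

		count = set_max_huge_pages(h, count, nodes_allowed);

		/* free only what NODEMASK_ALLOC gave us, never the static map */
		if (nodes_allowed != &node_online_map)
			NODEMASK_FREE(nodes_allowed);
		return count;
	}

The pointer comparison at the end is what lets one variable serve as
either a heap-allocated mask or a borrowed pointer to node_online_map
(the sysctl path borrows node_states[N_HIGH_MEMORY] instead).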

mm/mempolicy.c

Lines changed: 47 additions & 0 deletions
@@ -1568,6 +1568,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 	}
 	return zl;
 }
+
+/*
+ * init_nodemask_of_mempolicy
+ *
+ * If the current task's mempolicy is "default" [NULL], return 'false'
+ * to indicate default policy.  Otherwise, extract the policy nodemask
+ * for 'bind' or 'interleave' policy into the argument nodemask, or
+ * initialize the argument nodemask to contain the single node for
+ * 'preferred' or 'local' policy and return 'true' to indicate presence
+ * of non-default mempolicy.
+ *
+ * We don't bother with reference counting the mempolicy [mpol_get/put]
+ * because the current task is examining its own mempolicy and a task's
+ * mempolicy is only ever changed by the task itself.
+ *
+ * N.B., it is the caller's responsibility to free a returned nodemask.
+ */
+bool init_nodemask_of_mempolicy(nodemask_t *mask)
+{
+	struct mempolicy *mempolicy;
+	int nid;
+
+	if (!(mask && current->mempolicy))
+		return false;
+
+	mempolicy = current->mempolicy;
+	switch (mempolicy->mode) {
+	case MPOL_PREFERRED:
+		if (mempolicy->flags & MPOL_F_LOCAL)
+			nid = numa_node_id();
+		else
+			nid = mempolicy->v.preferred_node;
+		init_nodemask_of_node(mask, nid);
+		break;
+
+	case MPOL_BIND:
+		/* Fall through */
+	case MPOL_INTERLEAVE:
+		*mask = mempolicy->v.nodes;
+		break;
+
+	default:
+		BUG();
+	}
+
+	return true;
+}
 #endif

 /* Allocate a page in interleaved policy.
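To make the helper's contract concrete, a hypothetical caller follows;
demo_report is an invented name, not in the patch.  A 'true' return means
the argument mask now holds the policy's nodes; 'false' means default
policy, so the caller chooses its own fallback, exactly as the hugetlb
handlers above fall back to node_online_map or node_states[N_HIGH_MEMORY].

	/* Hypothetical consumer of init_nodemask_of_mempolicy(); sketch only. */
	static void demo_report(nodemask_t *mask)
	{
		if (init_nodemask_of_mempolicy(mask))
			printk(KERN_INFO "policy spans %d node(s), first is node %d\n",
				nodes_weight(*mask), first_node(*mask));
		else
			printk(KERN_INFO "default policy: caller picks its own fallback mask\n");
	}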
