Skip to content

Commit b538908

Browse files
ZhenguoYao1 authored and torvalds committed
hugetlbfs: extend the definition of hugepages parameter to support node allocation
We can specify the number of hugepages to allocate at boot. But the hugepages are balanced across all nodes at present. In some scenarios, we only need hugepages in one node. For example: DPDK needs hugepages which are in the same node as the NIC. If DPDK needs four hugepages of 1G size in node1 and the system has 16 NUMA nodes, we must reserve 64 hugepages on the kernel cmdline. But only four hugepages are used. The others should be free after boot. If the system memory is low (for example: 64G), it will be an impossible task. So extend the hugepages parameter to support specifying hugepages on a specific node. For example, add the following parameter: hugepagesz=1G hugepages=0:1,1:3 It will allocate 1 hugepage in node0 and 3 hugepages in node1. Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Zhenguo Yao <[email protected]> Reviewed-by: Mike Kravetz <[email protected]> Cc: Zhenguo Yao <[email protected]> Cc: Dan Carpenter <[email protected]> Cc: Nathan Chancellor <[email protected]> Cc: Michael Ellerman <[email protected]> Cc: Benjamin Herrenschmidt <[email protected]> Cc: Paul Mackerras <[email protected]> Cc: Jonathan Corbet <[email protected]> Cc: Mike Rapoport <[email protected]> Cc: Matthew Wilcox (Oracle) <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 3723929 commit b538908

File tree

5 files changed

+155
-33
lines changed

5 files changed

+155
-33
lines changed

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1601,9 +1601,11 @@
16011601
the number of pages of hugepagesz to be allocated.
16021602
If this is the first HugeTLB parameter on the command
16031603
line, it specifies the number of pages to allocate for
1604-
the default huge page size. See also
1605-
Documentation/admin-guide/mm/hugetlbpage.rst.
1606-
Format: <integer>
1604+
the default huge page size. If using node format, the
1605+
number of pages to allocate per-node can be specified.
1606+
See also Documentation/admin-guide/mm/hugetlbpage.rst.
1607+
Format: <integer> or (node format)
1608+
<node>:<integer>[,<node>:<integer>]
16071609

16081610
hugepagesz=
16091611
[HW] The size of the HugeTLB pages. This is used in

Documentation/admin-guide/mm/hugetlbpage.rst

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,9 @@ hugepages
128128
implicitly specifies the number of huge pages of default size to
129129
allocate. If the number of huge pages of default size is implicitly
130130
specified, it can not be overwritten by a hugepagesz,hugepages
131-
parameter pair for the default size.
131+
parameter pair for the default size. This parameter also has a
132+
node format. The node format specifies the number of huge pages
133+
to allocate on specific nodes.
132134

133135
For example, on an architecture with 2M default huge page size::
134136

@@ -138,6 +140,14 @@ hugepages
138140
indicating that the hugepages=512 parameter is ignored. If a hugepages
139141
parameter is preceded by an invalid hugepagesz parameter, it will
140142
be ignored.
143+
144+
Node format example::
145+
146+
hugepagesz=2M hugepages=0:1,1:2
147+
148+
It will allocate 1 2M hugepage on node0 and 2 2M hugepages on node1.
149+
If the node number is invalid, the parameter will be ignored.
150+
141151
default_hugepagesz
142152
Specify the default huge page size. This parameter can
143153
only be specified once on the command line. default_hugepagesz can

arch/powerpc/mm/hugetlbpage.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,17 +229,22 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
229229
m->hstate = hstate;
230230
return 1;
231231
}
232+
233+
bool __init hugetlb_node_alloc_supported(void)
234+
{
235+
return false;
236+
}
232237
#endif
233238

234239

235-
int __init alloc_bootmem_huge_page(struct hstate *h)
240+
int __init alloc_bootmem_huge_page(struct hstate *h, int nid)
236241
{
237242

238243
#ifdef CONFIG_PPC_BOOK3S_64
239244
if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
240245
return pseries_alloc_bootmem_huge_page(h);
241246
#endif
242-
return __alloc_bootmem_huge_page(h);
247+
return __alloc_bootmem_huge_page(h, nid);
243248
}
244249

245250
#ifndef CONFIG_PPC_BOOK3S_64

include/linux/hugetlb.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -615,6 +615,7 @@ struct hstate {
615615
unsigned long nr_overcommit_huge_pages;
616616
struct list_head hugepage_activelist;
617617
struct list_head hugepage_freelists[MAX_NUMNODES];
618+
unsigned int max_huge_pages_node[MAX_NUMNODES];
618619
unsigned int nr_huge_pages_node[MAX_NUMNODES];
619620
unsigned int free_huge_pages_node[MAX_NUMNODES];
620621
unsigned int surplus_huge_pages_node[MAX_NUMNODES];
@@ -647,8 +648,9 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
647648
unsigned long address, struct page *page);
648649

649650
/* arch callback */
650-
int __init __alloc_bootmem_huge_page(struct hstate *h);
651-
int __init alloc_bootmem_huge_page(struct hstate *h);
651+
int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
652+
int __init alloc_bootmem_huge_page(struct hstate *h, int nid);
653+
bool __init hugetlb_node_alloc_supported(void);
652654

653655
void __init hugetlb_add_hstate(unsigned order);
654656
bool __init arch_hugetlb_valid_size(unsigned long size);

mm/hugetlb.c

Lines changed: 128 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ static struct hstate * __initdata parsed_hstate;
7777
static unsigned long __initdata default_hstate_max_huge_pages;
7878
static bool __initdata parsed_valid_hugepagesz = true;
7979
static bool __initdata parsed_default_hugepagesz;
80+
static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
8081

8182
/*
8283
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -2963,33 +2964,39 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
29632964
return ERR_PTR(-ENOSPC);
29642965
}
29652966

2966-
int alloc_bootmem_huge_page(struct hstate *h)
2967+
int alloc_bootmem_huge_page(struct hstate *h, int nid)
29672968
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
2968-
int __alloc_bootmem_huge_page(struct hstate *h)
2969+
int __alloc_bootmem_huge_page(struct hstate *h, int nid)
29692970
{
2970-
struct huge_bootmem_page *m;
2971+
struct huge_bootmem_page *m = NULL; /* initialize for clang */
29712972
int nr_nodes, node;
29722973

2974+
if (nid >= nr_online_nodes)
2975+
return 0;
2976+
/* do node specific alloc */
2977+
if (nid != NUMA_NO_NODE) {
2978+
m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
2979+
0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
2980+
if (!m)
2981+
return 0;
2982+
goto found;
2983+
}
2984+
/* allocate from next node when distributing huge pages */
29732985
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
2974-
void *addr;
2975-
2976-
addr = memblock_alloc_try_nid_raw(
2986+
m = memblock_alloc_try_nid_raw(
29772987
huge_page_size(h), huge_page_size(h),
29782988
0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
2979-
if (addr) {
2980-
/*
2981-
* Use the beginning of the huge page to store the
2982-
* huge_bootmem_page struct (until gather_bootmem
2983-
* puts them into the mem_map).
2984-
*/
2985-
m = addr;
2986-
goto found;
2987-
}
2989+
/*
2990+
* Use the beginning of the huge page to store the
2991+
* huge_bootmem_page struct (until gather_bootmem
2992+
* puts them into the mem_map).
2993+
*/
2994+
if (!m)
2995+
return 0;
2996+
goto found;
29882997
}
2989-
return 0;
29902998

29912999
found:
2992-
BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
29933000
/* Put them into a private list first because mem_map is not up yet */
29943001
INIT_LIST_HEAD(&m->list);
29953002
list_add(&m->list, &huge_boot_pages);
@@ -3029,12 +3036,61 @@ static void __init gather_bootmem_prealloc(void)
30293036
cond_resched();
30303037
}
30313038
}
3039+
static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
3040+
{
3041+
unsigned long i;
3042+
char buf[32];
3043+
3044+
for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
3045+
if (hstate_is_gigantic(h)) {
3046+
if (!alloc_bootmem_huge_page(h, nid))
3047+
break;
3048+
} else {
3049+
struct page *page;
3050+
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
3051+
3052+
page = alloc_fresh_huge_page(h, gfp_mask, nid,
3053+
&node_states[N_MEMORY], NULL);
3054+
if (!page)
3055+
break;
3056+
put_page(page); /* free it into the hugepage allocator */
3057+
}
3058+
cond_resched();
3059+
}
3060+
if (i == h->max_huge_pages_node[nid])
3061+
return;
3062+
3063+
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3064+
pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n",
3065+
h->max_huge_pages_node[nid], buf, nid, i);
3066+
h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
3067+
h->max_huge_pages_node[nid] = i;
3068+
}
30323069

30333070
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
30343071
{
30353072
unsigned long i;
30363073
nodemask_t *node_alloc_noretry;
3074+
bool node_specific_alloc = false;
30373075

3076+
/* skip gigantic hugepages allocation if hugetlb_cma enabled */
3077+
if (hstate_is_gigantic(h) && hugetlb_cma_size) {
3078+
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
3079+
return;
3080+
}
3081+
3082+
/* do node specific alloc */
3083+
for (i = 0; i < nr_online_nodes; i++) {
3084+
if (h->max_huge_pages_node[i] > 0) {
3085+
hugetlb_hstate_alloc_pages_onenode(h, i);
3086+
node_specific_alloc = true;
3087+
}
3088+
}
3089+
3090+
if (node_specific_alloc)
3091+
return;
3092+
3093+
/* below will do all node balanced alloc */
30383094
if (!hstate_is_gigantic(h)) {
30393095
/*
30403096
* Bit mask controlling how hard we retry per-node allocations.
@@ -3055,11 +3111,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
30553111

30563112
for (i = 0; i < h->max_huge_pages; ++i) {
30573113
if (hstate_is_gigantic(h)) {
3058-
if (hugetlb_cma_size) {
3059-
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
3060-
goto free;
3061-
}
3062-
if (!alloc_bootmem_huge_page(h))
3114+
if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
30633115
break;
30643116
} else if (!alloc_pool_huge_page(h,
30653117
&node_states[N_MEMORY],
@@ -3075,7 +3127,6 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
30753127
h->max_huge_pages, buf, i);
30763128
h->max_huge_pages = i;
30773129
}
3078-
free:
30793130
kfree(node_alloc_noretry);
30803131
}
30813132

@@ -3990,6 +4041,10 @@ static int __init hugetlb_init(void)
39904041
}
39914042
default_hstate.max_huge_pages =
39924043
default_hstate_max_huge_pages;
4044+
4045+
for (i = 0; i < nr_online_nodes; i++)
4046+
default_hstate.max_huge_pages_node[i] =
4047+
default_hugepages_in_node[i];
39934048
}
39944049
}
39954050

@@ -4050,6 +4105,10 @@ void __init hugetlb_add_hstate(unsigned int order)
40504105
parsed_hstate = h;
40514106
}
40524107

4108+
bool __init __weak hugetlb_node_alloc_supported(void)
4109+
{
4110+
return true;
4111+
}
40534112
/*
40544113
* hugepages command line processing
40554114
* hugepages normally follows a valid hugepagsz or default_hugepagsz
@@ -4061,6 +4120,10 @@ static int __init hugepages_setup(char *s)
40614120
{
40624121
unsigned long *mhp;
40634122
static unsigned long *last_mhp;
4123+
int node = NUMA_NO_NODE;
4124+
int count;
4125+
unsigned long tmp;
4126+
char *p = s;
40644127

40654128
if (!parsed_valid_hugepagesz) {
40664129
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
@@ -4084,8 +4147,40 @@ static int __init hugepages_setup(char *s)
40844147
return 0;
40854148
}
40864149

4087-
if (sscanf(s, "%lu", mhp) <= 0)
4088-
*mhp = 0;
4150+
while (*p) {
4151+
count = 0;
4152+
if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4153+
goto invalid;
4154+
/* Parameter is node format */
4155+
if (p[count] == ':') {
4156+
if (!hugetlb_node_alloc_supported()) {
4157+
pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
4158+
return 0;
4159+
}
4160+
node = tmp;
4161+
p += count + 1;
4162+
if (node < 0 || node >= nr_online_nodes)
4163+
goto invalid;
4164+
/* Parse hugepages */
4165+
if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4166+
goto invalid;
4167+
if (!hugetlb_max_hstate)
4168+
default_hugepages_in_node[node] = tmp;
4169+
else
4170+
parsed_hstate->max_huge_pages_node[node] = tmp;
4171+
*mhp += tmp;
4172+
/* Go to parse next node*/
4173+
if (p[count] == ',')
4174+
p += count + 1;
4175+
else
4176+
break;
4177+
} else {
4178+
if (p != s)
4179+
goto invalid;
4180+
*mhp = tmp;
4181+
break;
4182+
}
4183+
}
40894184

40904185
/*
40914186
* Global state is always initialized later in hugetlb_init.
@@ -4098,6 +4193,10 @@ static int __init hugepages_setup(char *s)
40984193
last_mhp = mhp;
40994194

41004195
return 1;
4196+
4197+
invalid:
4198+
pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
4199+
return 0;
41014200
}
41024201
__setup("hugepages=", hugepages_setup);
41034202

@@ -4159,6 +4258,7 @@ __setup("hugepagesz=", hugepagesz_setup);
41594258
static int __init default_hugepagesz_setup(char *s)
41604259
{
41614260
unsigned long size;
4261+
int i;
41624262

41634263
parsed_valid_hugepagesz = false;
41644264
if (parsed_default_hugepagesz) {
@@ -4187,6 +4287,9 @@ static int __init default_hugepagesz_setup(char *s)
41874287
*/
41884288
if (default_hstate_max_huge_pages) {
41894289
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
4290+
for (i = 0; i < nr_online_nodes; i++)
4291+
default_hstate.max_huge_pages_node[i] =
4292+
default_hugepages_in_node[i];
41904293
if (hstate_is_gigantic(&default_hstate))
41914294
hugetlb_hstate_alloc_pages(&default_hstate);
41924295
default_hstate_max_huge_pages = 0;

0 commit comments

Comments
 (0)