Skip to content

Commit f41f2ed

Browse files
Muchun Songtorvalds
authored andcommitted
mm: hugetlb: free the vmemmap pages associated with each HugeTLB page
Every HugeTLB has more than one struct page structure. We __know__ that we only use the first 4 (__NR_USED_SUBPAGE) struct page structures to store metadata associated with each HugeTLB. There are a lot of struct page structures associated with each HugeTLB page. For tail pages, the value of compound_head is the same. So we can reuse first page of tail page structures. We map the virtual addresses of the remaining pages of tail page structures to the first tail page struct, and then free these page frames. Therefore, we need to reserve two pages as vmemmap areas. When we allocate a HugeTLB page from the buddy, we can free some vmemmap pages associated with each HugeTLB page. It is more appropriate to do it in the prep_new_huge_page(). The free_vmemmap_pages_per_hpage(), which indicates how many vmemmap pages associated with a HugeTLB page can be freed, returns zero for now, which means the feature is disabled. We will enable it once all the infrastructure is there. [[email protected]: fix documentation warning] Link: https://lkml.kernel.org/r/[email protected] Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Muchun Song <[email protected]> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]> Reviewed-by: Oscar Salvador <[email protected]> Tested-by: Chen Huang <[email protected]> Tested-by: Bodeddula Balasubramaniam <[email protected]> Acked-by: Michal Hocko <[email protected]> Reviewed-by: Mike Kravetz <[email protected]> Cc: Alexander Viro <[email protected]> Cc: Andy Lutomirski <[email protected]> Cc: Anshuman Khandual <[email protected]> Cc: Balbir Singh <[email protected]> Cc: Barry Song <[email protected]> Cc: Borislav Petkov <[email protected]> Cc: Dave Hansen <[email protected]> Cc: David Hildenbrand <[email protected]> Cc: David Rientjes <[email protected]> Cc: HORIGUCHI NAOYA <[email protected]> Cc: "H. 
Peter Anvin" <[email protected]> Cc: Ingo Molnar <[email protected]> Cc: Joao Martins <[email protected]> Cc: Joerg Roedel <[email protected]> Cc: Jonathan Corbet <[email protected]> Cc: Matthew Wilcox <[email protected]> Cc: Miaohe Lin <[email protected]> Cc: Mina Almasry <[email protected]> Cc: Oliver Neukum <[email protected]> Cc: Paul E. McKenney <[email protected]> Cc: Pawan Gupta <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Randy Dunlap <[email protected]> Cc: Thomas Gleixner <[email protected]> Cc: Xiongchun Duan <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent cd39d4e commit f41f2ed

File tree

7 files changed

+473
-13
lines changed

7 files changed

+473
-13
lines changed

include/linux/bootmem_info.h

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#ifndef __LINUX_BOOTMEM_INFO_H
33
#define __LINUX_BOOTMEM_INFO_H
44

5-
#include <linux/mmzone.h>
5+
#include <linux/mm.h>
66

77
/*
88
* Types for free bootmem stored in page->lru.next. These have to be in
@@ -22,6 +22,27 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
2222
void get_page_bootmem(unsigned long info, struct page *page,
2323
unsigned long type);
2424
void put_page_bootmem(struct page *page);
25+
26+
/*
27+
* Any memory allocated via the memblock allocator and not via the
28+
* buddy will be marked reserved already in the memmap. For those
29+
* pages, we can call this function to free them to the buddy allocator.
30+
*/
31+
static inline void free_bootmem_page(struct page *page)
32+
{
33+
unsigned long magic = (unsigned long)page->freelist;
34+
35+
/*
36+
* The reserve_bootmem_region sets the reserved flag on bootmem
37+
* pages. The page must hold exactly two references at this point.
* NOTE(review): presumably the initial reference plus the one taken
* by get_page_bootmem() - confirm against the bootmem info code.
38+
*/
39+
VM_BUG_ON_PAGE(page_ref_count(page) != 2, page);
40+
41+
/*
* Only pages whose type (read from page->freelist above) is
* SECTION_INFO or MIX_SECTION_INFO are released here; any other
* value is treated as a bug.
*/
if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)
42+
put_page_bootmem(page);
43+
else
44+
VM_BUG_ON_PAGE(1, page);
45+
}
2546
#else
2647
static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
2748
{
@@ -35,6 +56,11 @@ static inline void get_page_bootmem(unsigned long info, struct page *page,
3556
unsigned long type)
3657
{
3758
}
59+
60+
/* Variant for the #else branch: frees the page via free_reserved_page(). */
static inline void free_bootmem_page(struct page *page)
61+
{
62+
free_reserved_page(page);
63+
}
3864
#endif
3965

4066
#endif /* __LINUX_BOOTMEM_INFO_H */

include/linux/mm.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3076,6 +3076,9 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
30763076
}
30773077
#endif
30783078

3079+
void vmemmap_remap_free(unsigned long start, unsigned long end,
3080+
unsigned long reuse);
3081+
30793082
void *sparse_buffer_alloc(unsigned long size);
30803083
struct page * __populate_section_memmap(unsigned long pfn,
30813084
unsigned long nr_pages, int nid, struct vmem_altmap *altmap);

mm/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o
7575
obj-$(CONFIG_ZSWAP) += zswap.o
7676
obj-$(CONFIG_HAS_DMA) += dmapool.o
7777
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
78+
obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o
7879
obj-$(CONFIG_NUMA) += mempolicy.o
7980
obj-$(CONFIG_SPARSEMEM) += sparse.o
8081
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o

mm/hugetlb.c

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include <linux/node.h>
4242
#include <linux/page_owner.h>
4343
#include "internal.h"
44+
#include "hugetlb_vmemmap.h"
4445

4546
int hugetlb_max_hstate __read_mostly;
4647
unsigned int default_hstate_idx;
@@ -1493,8 +1494,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
14931494
h->nr_huge_pages_node[nid]++;
14941495
}
14951496

1496-
static void __prep_new_huge_page(struct page *page)
1497+
static void __prep_new_huge_page(struct hstate *h, struct page *page)
14971498
{
1499+
free_huge_page_vmemmap(h, page);
14981500
INIT_LIST_HEAD(&page->lru);
14991501
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
15001502
hugetlb_set_page_subpool(page, NULL);
@@ -1504,7 +1506,7 @@ static void __prep_new_huge_page(struct page *page)
15041506

15051507
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
15061508
{
1507-
__prep_new_huge_page(page);
1509+
__prep_new_huge_page(h, page);
15081510
spin_lock_irq(&hugetlb_lock);
15091511
__prep_account_new_huge_page(h, nid);
15101512
spin_unlock_irq(&hugetlb_lock);
@@ -2351,14 +2353,15 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
23512353

23522354
/*
23532355
* Before dissolving the page, we need to allocate a new one for the
2354-
* pool to remain stable. Using alloc_buddy_huge_page() allows us to
2355-
* not having to deal with prep_new_huge_page() and avoids dealing of any
2356-
* counters. This simplifies and let us do the whole thing under the
2357-
* lock.
2356+
* pool to remain stable. Here, we allocate the page and 'prep' it
2357+
* by doing everything but actually updating counters and adding to
2358+
* the pool. This simplifies and lets us do most of the processing
2359+
* under the lock.
23582360
*/
23592361
new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
23602362
if (!new_page)
23612363
return -ENOMEM;
2364+
__prep_new_huge_page(h, new_page);
23622365

23632366
retry:
23642367
spin_lock_irq(&hugetlb_lock);
@@ -2397,14 +2400,9 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
23972400
remove_hugetlb_page(h, old_page, false);
23982401

23992402
/*
2400-
* new_page needs to be initialized with the standard hugetlb
2401-
* state. This is normally done by prep_new_huge_page() but
2402-
* that takes hugetlb_lock which is already held so we need to
2403-
* open code it here.
24042403
* Reference count trick is needed because allocator gives us
24052404
* referenced page but the pool requires pages with 0 refcount.
24062405
*/
2407-
__prep_new_huge_page(new_page);
24082406
__prep_account_new_huge_page(h, nid);
24092407
page_ref_dec(new_page);
24102408
enqueue_huge_page(h, new_page);
@@ -2420,7 +2418,7 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
24202418

24212419
free_new:
24222420
spin_unlock_irq(&hugetlb_lock);
2423-
__free_pages(new_page, huge_page_order(h));
2421+
update_and_free_page(h, new_page);
24242422

24252423
return ret;
24262424
}

mm/hugetlb_vmemmap.c

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/*
3+
* Free some vmemmap pages of HugeTLB
4+
*
5+
* Copyright (c) 2020, Bytedance. All rights reserved.
6+
*
7+
* Author: Muchun Song <[email protected]>
8+
*
9+
* The struct page structures (page structs) are used to describe a physical
10+
* page frame. By default, there is a one-to-one mapping from a page frame to
11+
* its corresponding page struct.
12+
*
13+
* HugeTLB pages consist of multiple base page size pages and are supported by
14+
* many architectures. See hugetlbpage.rst in the Documentation directory for
15+
* more details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB
16+
* are currently supported. Since the base page size on x86 is 4KB, a 2MB
17+
* HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of
18+
* 262144 base pages. For each base page, there is a corresponding page struct.
19+
*
20+
* Within the HugeTLB subsystem, only the first 4 page structs are used to
21+
* contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
22+
* this upper limit. The only 'useful' information in the remaining page structs
23+
* is the compound_head field, and this field is the same for all tail pages.
24+
*
25+
* By removing redundant page structs for HugeTLB pages, memory can be returned
26+
* to the buddy allocator for other uses.
27+
*
28+
* Different architectures support different HugeTLB pages. For example, the
29+
* following table is the HugeTLB page size supported by x86 and arm64
30+
* architectures. Because arm64 supports 4k, 16k, and 64k base pages and
31+
* supports contiguous entries, it supports many kinds of sizes of HugeTLB
32+
* page.
33+
*
34+
* +--------------+-----------+-----------------------------------------------+
35+
* | Architecture | Page Size | HugeTLB Page Size |
36+
* +--------------+-----------+-----------+-----------+-----------+-----------+
37+
* | x86-64 | 4KB | 2MB | 1GB | | |
38+
* +--------------+-----------+-----------+-----------+-----------+-----------+
39+
* | | 4KB | 64KB | 2MB | 32MB | 1GB |
40+
* | +-----------+-----------+-----------+-----------+-----------+
41+
* | arm64 | 16KB | 2MB | 32MB | 1GB | |
42+
* | +-----------+-----------+-----------+-----------+-----------+
43+
* | | 64KB | 2MB | 512MB | 16GB | |
44+
* +--------------+-----------+-----------+-----------+-----------+-----------+
45+
*
46+
* When the system boots up, every HugeTLB page has more than one struct page
47+
* struct, whose total size is (unit: pages):
48+
*
49+
* struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
50+
*
51+
* Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
52+
* of the HugeTLB page is always n times PAGE_SIZE. So we can get the following
53+
* relationship.
54+
*
55+
* HugeTLB_Size = n * PAGE_SIZE
56+
*
57+
* Then,
58+
*
59+
* struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
60+
* = n * sizeof(struct page) / PAGE_SIZE
61+
*
62+
* We can use huge mapping at the pud/pmd level for the HugeTLB page.
63+
*
64+
* For the HugeTLB page of the pmd level mapping, then
65+
*
66+
* struct_size = n * sizeof(struct page) / PAGE_SIZE
67+
* = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
68+
* = sizeof(struct page) / sizeof(pte_t)
69+
* = 64 / 8
70+
* = 8 (pages)
71+
*
72+
* Where n is how many pte entries one page can contain. So the value of
73+
* n is (PAGE_SIZE / sizeof(pte_t)).
74+
*
75+
* This optimization only supports 64-bit systems, so the value of sizeof(pte_t)
76+
* is 8. This optimization is also applicable only when the size of struct page
77+
* is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
78+
* x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
79+
* size of struct page structs of it is 8 page frames which size depends on the
80+
* size of the base page.
81+
*
82+
* For the HugeTLB page of the pud level mapping, then
83+
*
84+
* struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
85+
* = PAGE_SIZE / 8 * 8 (pages)
86+
* = PAGE_SIZE (pages)
87+
*
88+
* Where the struct_size(pmd) is the size of the struct page structs of a
89+
* HugeTLB page of the pmd level mapping.
90+
*
91+
* E.g.: The page structs of a 2MB HugeTLB page on x86_64 occupy 8 page frames,
92+
* while those of a 1GB HugeTLB page occupy 4096.
93+
*
94+
* Next, we take the pmd level mapping of the HugeTLB page as an example to
95+
* show the internal implementation of this optimization. There are 8 pages
96+
* of struct page structs associated with a HugeTLB page which is pmd mapped.
97+
*
98+
* Here is how things look before optimization.
99+
*
100+
* HugeTLB struct pages(8 pages) page frame(8 pages)
101+
* +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
102+
* | | | 0 | -------------> | 0 |
103+
* | | +-----------+ +-----------+
104+
* | | | 1 | -------------> | 1 |
105+
* | | +-----------+ +-----------+
106+
* | | | 2 | -------------> | 2 |
107+
* | | +-----------+ +-----------+
108+
* | | | 3 | -------------> | 3 |
109+
* | | +-----------+ +-----------+
110+
* | | | 4 | -------------> | 4 |
111+
* | PMD | +-----------+ +-----------+
112+
* | level | | 5 | -------------> | 5 |
113+
* | mapping | +-----------+ +-----------+
114+
* | | | 6 | -------------> | 6 |
115+
* | | +-----------+ +-----------+
116+
* | | | 7 | -------------> | 7 |
117+
* | | +-----------+ +-----------+
118+
* | |
119+
* | |
120+
* | |
121+
* +-----------+
122+
*
123+
* The value of page->compound_head is the same for all tail pages. The first
124+
* page of page structs (page 0) associated with the HugeTLB page contains the 4
125+
* page structs necessary to describe the HugeTLB. The only use of the remaining
126+
* pages of page structs (page 1 to page 7) is to point to page->compound_head.
127+
* Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs
128+
* will be used for each HugeTLB page. This will allow us to free the remaining
129+
* 6 pages to the buddy allocator.
130+
*
131+
* Here is how things look after remapping.
132+
*
133+
* HugeTLB struct pages(8 pages) page frame(8 pages)
134+
* +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
135+
* | | | 0 | -------------> | 0 |
136+
* | | +-----------+ +-----------+
137+
* | | | 1 | -------------> | 1 |
138+
* | | +-----------+ +-----------+
139+
* | | | 2 | ----------------^ ^ ^ ^ ^ ^
140+
* | | +-----------+ | | | | |
141+
* | | | 3 | ------------------+ | | | |
142+
* | | +-----------+ | | | |
143+
* | | | 4 | --------------------+ | | |
144+
* | PMD | +-----------+ | | |
145+
* | level | | 5 | ----------------------+ | |
146+
* | mapping | +-----------+ | |
147+
* | | | 6 | ------------------------+ |
148+
* | | +-----------+ |
149+
* | | | 7 | --------------------------+
150+
* | | +-----------+
151+
* | |
152+
* | |
153+
* | |
154+
* +-----------+
155+
*
156+
* When a HugeTLB is freed to the buddy system, we should allocate 6 pages for
157+
* vmemmap pages and restore the previous mapping relationship.
158+
*
159+
* For the HugeTLB page of the pud level mapping. It is similar to the former.
160+
* We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages.
161+
*
162+
* Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
163+
* (e.g. aarch64) provides a contiguous bit in the translation table entries
164+
* that hints to the MMU to indicate that it is one of a contiguous set of
165+
* entries that can be cached in a single TLB entry.
166+
*
167+
* The contiguous bit is used to increase the mapping size at the pmd and pte
168+
* (last) level. So this type of HugeTLB page can be optimized only when its
169+
* size of the struct page structs is greater than 2 pages.
170+
*/
171+
#include "hugetlb_vmemmap.h"
172+
173+
/*
174+
* There are a lot of struct page structures associated with each HugeTLB page.
175+
* For tail pages, the value of compound_head is the same. So we can reuse first
176+
* page of tail page structures. We map the virtual addresses of the remaining
177+
* pages of tail page structures to the first tail page struct, and then free
178+
* these page frames. Therefore, we need to reserve two pages as vmemmap areas.
179+
*/
180+
#define RESERVE_VMEMMAP_NR 2U
181+
#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
182+
183+
/*
184+
* Returns how many vmemmap pages associated with a HugeTLB page can be freed
185+
* to the buddy allocator. The @h argument is currently unused.
186+
*
187+
* Todo: Returns zero for now, which means the feature is disabled. We will
188+
* enable it once all the infrastructure is there.
189+
*/
190+
static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
191+
{
192+
return 0;
193+
}
194+
195+
/*
* Size of the freeable vmemmap pages of a HugeTLB page: the page count from
* free_vmemmap_pages_per_hpage() shifted left by PAGE_SHIFT. The count is
* widened to unsigned long before the shift.
*/
static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
196+
{
197+
return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
198+
}
199+
200+
/*
* free_huge_page_vmemmap - remap and free the vmemmap pages of a HugeTLB page
* that lie beyond the reserved ones.
* @h: hstate used to look up how many vmemmap pages can be freed.
* @head: head struct page; its address is the start of the vmemmap range.
*
* Does nothing when free_vmemmap_pages_per_hpage() reports zero pages.
*/
void free_huge_page_vmemmap(struct hstate *h, struct page *head)
201+
{
202+
unsigned long vmemmap_addr = (unsigned long)head;
203+
unsigned long vmemmap_end, vmemmap_reuse;
204+
205+
if (!free_vmemmap_pages_per_hpage(h))
206+
return;
207+
208+
/*
* Skip the first RESERVE_VMEMMAP_SIZE bytes, which stay mapped. The
* reuse target is the page immediately before the range to be freed.
*/
vmemmap_addr += RESERVE_VMEMMAP_SIZE;
209+
vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
210+
vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
211+
212+
/*
213+
* Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
214+
* to the page which @vmemmap_reuse is mapped to, then free the pages
215+
* which the range [@vmemmap_addr, @vmemmap_end) is mapped to.
216+
*/
217+
vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse);
218+
}

mm/hugetlb_vmemmap.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
/*
3+
* Free some vmemmap pages of HugeTLB
4+
*
5+
* Copyright (c) 2020, Bytedance. All rights reserved.
6+
*
7+
* Author: Muchun Song <[email protected]>
8+
*/
9+
#ifndef _LINUX_HUGETLB_VMEMMAP_H
10+
#define _LINUX_HUGETLB_VMEMMAP_H
11+
#include <linux/hugetlb.h>
12+
13+
#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
14+
void free_huge_page_vmemmap(struct hstate *h, struct page *head);
15+
#else
16+
/* Empty stub used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is not defined. */
static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
17+
{
18+
}
19+
#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
20+
#endif /* _LINUX_HUGETLB_VMEMMAP_H */

0 commit comments

Comments
 (0)