
Commit 9a98225

Kirill A. Shutemov authored and Linus Torvalds committed
thp: introduce deferred_split_huge_page()
Currently we don't split a huge page on partial unmap. This is not ideal: it can lead to memory overhead.

Fortunately, we can detect partial unmap in page_remove_rmap(), but we cannot call split_huge_page() from there due to the locking context. It is also counterproductive to split directly from the munmap() codepath: in many cases we get there from exit(2), and splitting the huge page just to free it up as small pages is not what we really want.

This patch introduces deferred_split_huge_page(), which puts the huge page onto a queue for splitting. The splitting itself happens later, under memory pressure, via the shrinker interface. The page is dropped from the list on freeing, through the compound page destructor.

Signed-off-by: Kirill A. Shutemov <[email protected]>
Tested-by: Sasha Levin <[email protected]>
Tested-by: Aneesh Kumar K.V <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
Acked-by: Jerome Marchand <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Naoya Horiguchi <[email protected]>
Cc: Steve Capper <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: David Rientjes <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
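The mechanism described above is a two-phase pattern: a cheap enqueue on a locked list at unmap time, and the expensive split done later from a reclaim-driven callback. Below is a minimal userspace C sketch of that pattern, for orientation only. It is not kernel code; every name in it (struct hpage, deferred_split_enqueue(), reclaim_scan(), the toy list helpers) is invented for the sketch and only loosely mirrors split_queue, deferred_split_huge_page() and deferred_split_scan() in the diff below.

/*
 * Minimal userspace analogue of deferred splitting (illustrative only).
 * Build e.g.: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }
static bool list_empty(const struct list_head *h) { return h->next == h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
}

static void list_del_init(struct list_head *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        list_init(n);
}

struct hpage {                          /* stand-in for a THP head page */
        int id;
        struct list_head deferred;      /* analogue of page_deferred_list() */
};

static pthread_mutex_t split_queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct list_head split_queue = { &split_queue, &split_queue };
static unsigned long split_queue_len;

/* Cheap: called where partial unmap is detected (cf. page_remove_rmap()). */
static void deferred_split_enqueue(struct hpage *p)
{
        pthread_mutex_lock(&split_queue_lock);
        if (list_empty(&p->deferred)) {
                list_add_tail(&p->deferred, &split_queue);
                split_queue_len++;
        }
        pthread_mutex_unlock(&split_queue_lock);
}

/* Expensive: called under "memory pressure" (cf. the shrinker's scan). */
static void reclaim_scan(void)
{
        pthread_mutex_lock(&split_queue_lock);
        while (!list_empty(&split_queue)) {
                struct hpage *p = (struct hpage *)((char *)split_queue.next -
                                offsetof(struct hpage, deferred));

                list_del_init(&p->deferred);
                split_queue_len--;
                pthread_mutex_unlock(&split_queue_lock);

                printf("splitting huge page %d\n", p->id);  /* split_huge_page() */

                pthread_mutex_lock(&split_queue_lock);
        }
        pthread_mutex_unlock(&split_queue_lock);
}

int main(void)
{
        struct hpage a = { .id = 1 }, b = { .id = 2 };

        list_init(&a.deferred);
        list_init(&b.deferred);

        deferred_split_enqueue(&a);     /* e.g. first partial unmap of a */
        deferred_split_enqueue(&a);     /* re-queueing is a no-op */
        deferred_split_enqueue(&b);

        reclaim_scan();                 /* e.g. shrinker invoked by reclaim */
        return 0;
}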
1 parent 248db92 commit 9a98225

7 files changed (+174 lines, -12 lines)


include/linux/huge_mm.h

Lines changed: 5 additions & 0 deletions
@@ -90,11 +90,15 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 
 extern unsigned long transparent_hugepage_flags;
 
+extern void prep_transhuge_page(struct page *page);
+extern void free_transhuge_page(struct page *page);
+
 int split_huge_page_to_list(struct page *page, struct list_head *list);
 static inline int split_huge_page(struct page *page)
 {
         return split_huge_page_to_list(page, NULL);
 }
+void deferred_split_huge_page(struct page *page);
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                 unsigned long address);
@@ -170,6 +174,7 @@ static inline int split_huge_page(struct page *page)
 {
         return 0;
 }
+static inline void deferred_split_huge_page(struct page *page) {}
 #define split_huge_pmd(__vma, __pmd, __address) \
         do { } while (0)
 static inline int hugepage_madvise(struct vm_area_struct *vma,

include/linux/mm.h

Lines changed: 5 additions & 0 deletions
@@ -507,6 +507,9 @@ enum compound_dtor_id {
         COMPOUND_PAGE_DTOR,
 #ifdef CONFIG_HUGETLB_PAGE
         HUGETLB_PAGE_DTOR,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        TRANSHUGE_PAGE_DTOR,
 #endif
         NR_COMPOUND_DTORS,
 };
@@ -537,6 +540,8 @@ static inline void set_compound_order(struct page *page, unsigned int order)
         page[1].compound_order = order;
 }
 
+void free_compound_page(struct page *page);
+
 #ifdef CONFIG_MMU
 /*
  * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when

include/linux/mm_types.h

Lines changed: 2 additions & 0 deletions
@@ -55,13 +55,15 @@ struct page {
                                          */
                 void *s_mem;                    /* slab first object */
                 atomic_t compound_mapcount;     /* first tail page */
+                /* page_deferred_list().next -- second tail page */
         };
 
         /* Second double word */
         struct {
                 union {
                         pgoff_t index;          /* Our offset within mapping. */
                         void *freelist;         /* sl[aou]b first free object */
+                        /* page_deferred_list().prev -- second tail page */
                 };
 
                 union {
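The two added comments above record the layout trick that page_deferred_list() in mm/huge_memory.c relies on: in the second tail page, ->mapping and the following ->index/->freelist word are adjacent pointer-sized fields, so the pair can be reinterpreted as a struct list_head (->mapping acts as .next, the next word as .prev). Below is a hedged userspace sketch of that overlay using a simplified stand-in struct, not the real struct page; the kernel can rely on this kind of punning because struct page's layout is fixed and the kernel is built with -fno-strict-aliasing.

/* Build e.g.: cc -fno-strict-aliasing overlay.c */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

/* Simplified stand-in for the two relevant words of a tail struct page. */
struct fake_tail_page {
        void *mapping;          /* reused as page_deferred_list()->next */
        unsigned long index;    /* reused as page_deferred_list()->prev */
};

static struct list_head *page_deferred_list(struct fake_tail_page *p)
{
        /* Same idea as the kernel helper: treat the two adjacent words
         * starting at ->mapping as a struct list_head. */
        return (struct list_head *)&p->mapping;
}

int main(void)
{
        struct fake_tail_page second_tail;
        struct list_head *lh = page_deferred_list(&second_tail);

        /* The overlay only works if the fields are adjacent and word-sized. */
        assert(offsetof(struct fake_tail_page, index) ==
               offsetof(struct fake_tail_page, mapping) + sizeof(void *));

        lh->next = lh;          /* equivalent of INIT_LIST_HEAD() */
        lh->prev = lh;
        printf("deferred list empty: %d\n", lh->next == lh);
        return 0;
}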

mm/huge_memory.c

Lines changed: 135 additions & 4 deletions
@@ -135,6 +135,10 @@ static struct khugepaged_scan khugepaged_scan = {
         .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
+static DEFINE_SPINLOCK(split_queue_lock);
+static LIST_HEAD(split_queue);
+static unsigned long split_queue_len;
+static struct shrinker deferred_split_shrinker;
 
 static void set_recommended_min_free_kbytes(void)
 {
@@ -667,6 +671,9 @@ static int __init hugepage_init(void)
         err = register_shrinker(&huge_zero_page_shrinker);
         if (err)
                 goto err_hzp_shrinker;
+        err = register_shrinker(&deferred_split_shrinker);
+        if (err)
+                goto err_split_shrinker;
 
         /*
          * By default disable transparent hugepages on smaller systems,
@@ -684,6 +691,8 @@
 
         return 0;
 err_khugepaged:
+        unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
         unregister_shrinker(&huge_zero_page_shrinker);
 err_hzp_shrinker:
         khugepaged_slab_exit();
@@ -740,6 +749,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
         return entry;
 }
 
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+        /*
+         * ->lru in the tail pages is occupied by compound_head.
+         * Let's use ->mapping + ->index in the second tail page as list_head.
+         */
+        return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+        /*
+         * we use page->mapping and page->indexlru in second tail page
+         * as list_head: assuming THP order >= 2
+         */
+        BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+        INIT_LIST_HEAD(page_deferred_list(page));
+        set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
+
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                         struct vm_area_struct *vma,
                                         unsigned long address, pmd_t *pmd,
@@ -896,6 +926,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 count_vm_event(THP_FAULT_FALLBACK);
                 return VM_FAULT_FALLBACK;
         }
+        prep_transhuge_page(page);
         return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
                                             flags);
 }
@@ -1192,7 +1223,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         } else
                 new_page = NULL;
 
-        if (unlikely(!new_page)) {
+        if (likely(new_page)) {
+                prep_transhuge_page(new_page);
+        } else {
                 if (!page) {
                         split_huge_pmd(vma, pmd, address);
                         ret |= VM_FAULT_FALLBACK;
@@ -2109,6 +2142,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
                 return NULL;
         }
 
+        prep_transhuge_page(*hpage);
         count_vm_event(THP_COLLAPSE_ALLOC);
         return *hpage;
 }
@@ -2120,8 +2154,12 @@ static int khugepaged_find_target_node(void)
 
 static inline struct page *alloc_hugepage(int defrag)
 {
-        return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-                           HPAGE_PMD_ORDER);
+        struct page *page;
+
+        page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+        if (page)
+                prep_transhuge_page(page);
+        return page;
 }
 
 static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -3098,7 +3136,7 @@ static int __split_huge_page_tail(struct page *head, int tail,
         set_page_idle(page_tail);
 
         /* ->mapping in first tail page is compound_mapcount */
-        VM_BUG_ON_PAGE(tail != 1 && page_tail->mapping != TAIL_MAPPING,
+        VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
                         page_tail);
         page_tail->mapping = head->mapping;
 
@@ -3207,19 +3245,28 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         freeze_page(anon_vma, head);
         VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
+        /* Prevent deferred_split_scan() touching ->_count */
+        spin_lock(&split_queue_lock);
         count = page_count(head);
         mapcount = total_mapcount(head);
         if (mapcount == count - 1) {
+                if (!list_empty(page_deferred_list(head))) {
+                        split_queue_len--;
+                        list_del(page_deferred_list(head));
+                }
+                spin_unlock(&split_queue_lock);
                 __split_huge_page(page, list);
                 ret = 0;
         } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+                spin_unlock(&split_queue_lock);
                 pr_alert("total_mapcount: %u, page_count(): %u\n",
                                 mapcount, count);
                 if (PageTail(page))
                         dump_page(head, NULL);
                 dump_page(page, "total_mapcount(head) > page_count(head) - 1");
                 BUG();
         } else {
+                spin_unlock(&split_queue_lock);
                 unfreeze_page(anon_vma, head);
                 ret = -EBUSY;
         }
@@ -3231,3 +3278,87 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
         return ret;
 }
+
+void free_transhuge_page(struct page *page)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave(&split_queue_lock, flags);
+        if (!list_empty(page_deferred_list(page))) {
+                split_queue_len--;
+                list_del(page_deferred_list(page));
+        }
+        spin_unlock_irqrestore(&split_queue_lock, flags);
+        free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+        unsigned long flags;
+
+        VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+        spin_lock_irqsave(&split_queue_lock, flags);
+        if (list_empty(page_deferred_list(page))) {
+                list_add_tail(page_deferred_list(page), &split_queue);
+                split_queue_len++;
+        }
+        spin_unlock_irqrestore(&split_queue_lock, flags);
+}
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+                struct shrink_control *sc)
+{
+        /*
+         * Split a page from split_queue will free up at least one page,
+         * at most HPAGE_PMD_NR - 1. We don't track exact number.
+         * Let's use HPAGE_PMD_NR / 2 as ballpark.
+         */
+        return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+}
+
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+                struct shrink_control *sc)
+{
+        unsigned long flags;
+        LIST_HEAD(list), *pos, *next;
+        struct page *page;
+        int split = 0;
+
+        spin_lock_irqsave(&split_queue_lock, flags);
+        list_splice_init(&split_queue, &list);
+
+        /* Take pin on all head pages to avoid freeing them under us */
+        list_for_each_safe(pos, next, &list) {
+                page = list_entry((void *)pos, struct page, mapping);
+                page = compound_head(page);
+                /* race with put_compound_page() */
+                if (!get_page_unless_zero(page)) {
+                        list_del_init(page_deferred_list(page));
+                        split_queue_len--;
+                }
+        }
+        spin_unlock_irqrestore(&split_queue_lock, flags);
+
+        list_for_each_safe(pos, next, &list) {
+                page = list_entry((void *)pos, struct page, mapping);
+                lock_page(page);
+                /* split_huge_page() removes page from list on success */
+                if (!split_huge_page(page))
+                        split++;
+                unlock_page(page);
+                put_page(page);
+        }
+
+        spin_lock_irqsave(&split_queue_lock, flags);
+        list_splice_tail(&list, &split_queue);
+        spin_unlock_irqrestore(&split_queue_lock, flags);
+
+        return split * HPAGE_PMD_NR / 2;
+}
+
+static struct shrinker deferred_split_shrinker = {
+        .count_objects = deferred_split_count,
+        .scan_objects = deferred_split_scan,
+        .seeks = DEFAULT_SEEKS,
+};
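A quick worked example of the accounting above (my arithmetic, not part of the commit): deferred_split_count() advertises split_queue_len * HPAGE_PMD_NR / 2 reclaimable objects because splitting one queued THP frees at least 1 and at most HPAGE_PMD_NR - 1 base pages, and HPAGE_PMD_NR / 2 is the stated ballpark. On x86-64 with 4 KiB base pages and 2 MiB huge pages, HPAGE_PMD_NR is 512, so a queue of 10 deferred huge pages is reported to the shrinker core as 10 * 512 / 2 = 2560 objects, and a scan that successfully splits 3 of them returns 3 * 512 / 2 = 768.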

mm/migrate.c

Lines changed: 1 addition & 0 deletions
@@ -1760,6 +1760,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                                           HPAGE_PMD_ORDER);
         if (!new_page)
                 goto out_fail;
+        prep_transhuge_page(new_page);
 
         isolated = numamigrate_isolate_page(pgdat, page);
         if (!isolated) {

mm/page_alloc.c

Lines changed: 20 additions & 7 deletions
@@ -222,13 +222,15 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 };
 
-static void free_compound_page(struct page *page);
 compound_page_dtor * const compound_page_dtors[] = {
         NULL,
         free_compound_page,
 #ifdef CONFIG_HUGETLB_PAGE
         free_huge_page,
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        free_transhuge_page,
+#endif
 };
 
 int min_free_kbytes = 1024;
@@ -450,7 +452,7 @@ static void bad_page(struct page *page, const char *reason,
  * This usage means that zero-order pages may not be compound.
  */
 
-static void free_compound_page(struct page *page)
+void free_compound_page(struct page *page)
 {
         __free_pages_ok(page, compound_order(page));
 }
@@ -858,15 +860,26 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
                 ret = 0;
                 goto out;
         }
-        /* mapping in first tail page is used for compound_mapcount() */
-        if (page - head_page == 1) {
+        switch (page - head_page) {
+        case 1:
+                /* the first tail page: ->mapping is compound_mapcount() */
                 if (unlikely(compound_mapcount(page))) {
                         bad_page(page, "nonzero compound_mapcount", 0);
                         goto out;
                 }
-        } else if (page->mapping != TAIL_MAPPING) {
-                bad_page(page, "corrupted mapping in tail page", 0);
-                goto out;
+                break;
+        case 2:
+                /*
+                 * the second tail page: ->mapping is
+                 * page_deferred_list().next -- ignore value.
+                 */
+                break;
+        default:
+                if (page->mapping != TAIL_MAPPING) {
+                        bad_page(page, "corrupted mapping in tail page", 0);
+                        goto out;
+                }
+                break;
         }
         if (unlikely(!PageTail(page))) {
                 bad_page(page, "PageTail not set", 0);

mm/rmap.c

Lines changed: 6 additions & 1 deletion
@@ -1282,8 +1282,10 @@ static void page_remove_anon_compound_rmap(struct page *page)
                 nr = HPAGE_PMD_NR;
         }
 
-        if (nr)
+        if (nr) {
                 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+                deferred_split_huge_page(page);
+        }
 }
 
 /**
@@ -1318,6 +1320,9 @@ void page_remove_rmap(struct page *page, bool compound)
         if (unlikely(PageMlocked(page)))
                 clear_page_mlock(page);
 
+        if (PageTransCompound(page))
+                deferred_split_huge_page(compound_head(page));
+
         /*
         * It would be tidy to reset the PageAnon mapping here,
         * but that might overwrite a racing page_add_anon_rmap
