Skip to content

Commit dafff3f

Browse files
uarif1 authored and hnaz committed
mm: split underused THPs
This is an attempt to mitigate the issue of running out of memory when THP is always enabled. During runtime whenever a THP is being faulted in (__do_huge_pmd_anonymous_page) or collapsed by khugepaged (collapse_huge_page), the THP is added to _deferred_list. Whenever memory reclaim happens in linux, the kernel runs the deferred_split shrinker which goes through the _deferred_list. If the folio was partially mapped, the shrinker attempts to split it. If the folio is not partially mapped, the shrinker checks if the THP was underused, i.e. how many of the base 4K pages of the entire THP were zero-filled. If this number goes above a certain threshold (decided by /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none), the shrinker will attempt to split that THP. Then at remap time, the pages that were zero-filled are mapped to the shared zeropage, hence saving memory. Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Usama Arif <[email protected]> Suggested-by: Rik van Riel <[email protected]> Co-authored-by: Johannes Weiner <[email protected]> Cc: Alexander Zhu <[email protected]> Cc: Barry Song <[email protected]> Cc: David Hildenbrand <[email protected]> Cc: Domenico Cerasuolo <[email protected]> Cc: Jonathan Corbet <[email protected]> Cc: Kairui Song <[email protected]> Cc: Matthew Wilcox <[email protected]> Cc: Mike Rapoport <[email protected]> Cc: Nico Pache <[email protected]> Cc: Roman Gushchin <[email protected]> Cc: Ryan Roberts <[email protected]> Cc: Shakeel Butt <[email protected]> Cc: Shuang Zhai <[email protected]> Cc: Yu Zhao <[email protected]> Cc: Shuang Zhai <[email protected]> Cc: Hugh Dickins <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent 8422acd commit dafff3f

File tree

6 files changed

+69
-3
lines changed

6 files changed

+69
-3
lines changed

Documentation/admin-guide/mm/transhuge.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,12 @@ thp_deferred_split_page
471471
splitting it would free up some memory. Pages on split queue are
472472
going to be split under memory pressure.
473473

474+
thp_underused_split_page
475+
is incremented when a huge page on the split queue was split
476+
because it was underused. A THP is underused if the number of
477+
zero pages in the THP is above a certain threshold
478+
(/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none).
479+
474480
thp_split_pmd
475481
is incremented every time a PMD split into table of PTEs.
476482
This can happen, for instance, when application calls mprotect() or

include/linux/khugepaged.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
66

7+
extern unsigned int khugepaged_max_ptes_none __read_mostly;
78
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
89
extern struct attribute_group khugepaged_attr_group;
910

include/linux/vm_event_item.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
105105
THP_SPLIT_PAGE,
106106
THP_SPLIT_PAGE_FAILED,
107107
THP_DEFERRED_SPLIT_PAGE,
108+
THP_UNDERUSED_SPLIT_PAGE,
108109
THP_SPLIT_PMD,
109110
THP_SCAN_EXCEED_NONE_PTE,
110111
THP_SCAN_EXCEED_SWAP_PTE,

mm/huge_memory.c

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1187,6 +1187,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
11871187
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
11881188
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
11891189
mm_inc_nr_ptes(vma->vm_mm);
1190+
deferred_split_folio(folio, false);
11901191
spin_unlock(vmf->ptl);
11911192
count_vm_event(THP_FAULT_ALLOC);
11921193
count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
@@ -3608,6 +3609,39 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
36083609
return READ_ONCE(ds_queue->split_queue_len);
36093610
}
36103611

3612+
static bool thp_underused(struct folio *folio)
3613+
{
3614+
int num_zero_pages = 0, num_filled_pages = 0;
3615+
void *kaddr;
3616+
int i;
3617+
3618+
if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
3619+
return false;
3620+
3621+
for (i = 0; i < folio_nr_pages(folio); i++) {
3622+
kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
3623+
if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
3624+
num_zero_pages++;
3625+
if (num_zero_pages > khugepaged_max_ptes_none) {
3626+
kunmap_local(kaddr);
3627+
return true;
3628+
}
3629+
} else {
3630+
/*
3631+
* Another path for early exit once the number
3632+
* of non-zero filled pages exceeds threshold.
3633+
*/
3634+
num_filled_pages++;
3635+
if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
3636+
kunmap_local(kaddr);
3637+
return false;
3638+
}
3639+
}
3640+
kunmap_local(kaddr);
3641+
}
3642+
return false;
3643+
}
3644+
36113645
static unsigned long deferred_split_scan(struct shrinker *shrink,
36123646
struct shrink_control *sc)
36133647
{
@@ -3645,13 +3679,35 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
36453679
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
36463680

36473681
list_for_each_entry_safe(folio, next, &list, _deferred_list) {
3682+
bool did_split = false;
3683+
bool underused = false;
3684+
3685+
if (!folio_test_partially_mapped(folio)) {
3686+
underused = thp_underused(folio);
3687+
if (!underused)
3688+
goto next;
3689+
}
36483690
if (!folio_trylock(folio))
36493691
goto next;
3650-
/* split_huge_page() removes page from list on success */
3651-
if (!split_folio(folio))
3692+
if (!split_folio(folio)) {
3693+
did_split = true;
3694+
if (underused)
3695+
count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
36523696
split++;
3697+
}
36533698
folio_unlock(folio);
36543699
next:
3700+
/*
3701+
* split_folio() removes folio from list on success.
3702+
* Only add back to the queue if folio is partially mapped.
3703+
* If thp_underused returns false, or if split_folio fails
3704+
* in the case it was underused, then consider it used and
3705+
* don't add it back to split_queue.
3706+
*/
3707+
if (!did_split && !folio_test_partially_mapped(folio)) {
3708+
list_del_init(&folio->_deferred_list);
3709+
ds_queue->split_queue_len--;
3710+
}
36553711
folio_put(folio);
36563712
}
36573713

mm/khugepaged.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
8585
*
8686
* Note that these are only respected if collapse was initiated by khugepaged.
8787
*/
88-
static unsigned int khugepaged_max_ptes_none __read_mostly;
88+
unsigned int khugepaged_max_ptes_none __read_mostly;
8989
static unsigned int khugepaged_max_ptes_swap __read_mostly;
9090
static unsigned int khugepaged_max_ptes_shared __read_mostly;
9191

@@ -1237,6 +1237,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
12371237
pgtable_trans_huge_deposit(mm, pmd, pgtable);
12381238
set_pmd_at(mm, address, pmd, _pmd);
12391239
update_mmu_cache_pmd(vma, address, pmd);
1240+
deferred_split_folio(folio, false);
12401241
spin_unlock(pmd_ptl);
12411242

12421243
folio = NULL;

mm/vmstat.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1385,6 +1385,7 @@ const char * const vmstat_text[] = {
13851385
"thp_split_page",
13861386
"thp_split_page_failed",
13871387
"thp_deferred_split_page",
1388+
"thp_underused_split_page",
13881389
"thp_split_pmd",
13891390
"thp_scan_exceed_none_pte",
13901391
"thp_scan_exceed_swap_pte",

0 commit comments

Comments
 (0)