Commit 87eaceb

Yang Shi authored and torvalds committed
mm: thp: make deferred split shrinker memcg aware
Currently the THP deferred split shrinker is not memcg aware, which may cause premature OOM with some configurations. For example, the test below easily runs into premature OOM:

$ cgcreate -g memory:thp
$ echo 4G > /sys/fs/cgroup/memory/thp/memory/limit_in_bytes
$ cgexec -g memory:thp transhuge-stress 4000

transhuge-stress comes from the kernel selftests. It is easy to hit OOM, yet a lot of THPs remain on the deferred split queue; memcg direct reclaim can't touch them since the deferred split shrinker is not memcg aware.

Make the deferred split shrinker memcg aware by introducing a per-memcg deferred split queue. A THP sits on either the per-node or the per-memcg deferred split queue, depending on whether it belongs to a memcg. When a page is migrated to another memcg, it is moved to the target memcg's deferred split queue as well.

Reuse the second tail page's deferred_list for the per-memcg list, since the same THP can't be on multiple deferred split queues at the same time.

[[email protected]: simplify deferred split queue dereference per Kirill Tkhai]
Link: http://lkml.kernel.org/r/[email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Yang Shi <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
Reviewed-by: Kirill Tkhai <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: "Kirill A . Shutemov" <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Shakeel Butt <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Qian Cai <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
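For orientation before the diffs: the queue that becomes per-memcg is struct deferred_split, which an earlier patch in this series extracted from the per-node split_queue_* fields. A minimal sketch of the resulting layout, using only the field names that appear in the hunks below (the struct definition itself is assumed from that earlier patch and is not part of this diff):

    /* Sketch only; field names match the hunks below. */
    struct deferred_split {
    	spinlock_t split_queue_lock;	/* protects the two fields below */
    	struct list_head split_queue;	/* THPs queued for deferred split */
    	unsigned long split_queue_len;	/* queue length, read by the shrinker */
    };

Before this commit there is one such queue per NUMA node (embedded in struct pglist_data); this commit embeds a second instance in struct mem_cgroup and picks between them at queueing time via get_deferred_split_queue().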
1 parent 0a432dc commit 87eaceb

File tree

5 files changed: +91, -9 lines changed


include/linux/huge_mm.h

Lines changed: 9 additions & 0 deletions
@@ -267,6 +267,15 @@ static inline bool thp_migration_supported(void)
 	return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
 }
 
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+	/*
+	 * Global or memcg deferred list in the second tail pages is
+	 * occupied by compound_head.
+	 */
+	return &page[2].deferred_list;
+}
+
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })

include/linux/memcontrol.h

Lines changed: 4 additions & 0 deletions
@@ -330,6 +330,10 @@ struct mem_cgroup {
 	struct list_head event_list;
 	spinlock_t event_list_lock;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	struct deferred_split deferred_split_queue;
+#endif
+
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
 };

include/linux/mm_types.h

Lines changed: 1 addition & 0 deletions
@@ -138,6 +138,7 @@ struct page {
 		struct {	/* Second tail page of compound page */
 			unsigned long _compound_pad_1;	/* compound_head */
 			unsigned long _compound_pad_2;
+			/* For both global and memcg */
 			struct list_head deferred_list;
 		};
 		struct {	/* Page table pages */

mm/huge_memory.c

Lines changed: 53 additions & 9 deletions
@@ -496,11 +496,25 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 	return pmd;
 }
 
-static inline struct list_head *page_deferred_list(struct page *page)
+#ifdef CONFIG_MEMCG
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
 {
-	/* ->lru in the tail pages is occupied by compound_head. */
-	return &page[2].deferred_list;
+	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+	if (memcg)
+		return &memcg->deferred_split_queue;
+	else
+		return &pgdat->deferred_split_queue;
 }
+#else
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+{
+	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+	return &pgdat->deferred_split_queue;
+}
+#endif
 
 void prep_transhuge_page(struct page *page)
 {
@@ -2691,7 +2705,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
 	struct page *head = compound_head(page);
 	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
-	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+	struct deferred_split *ds_queue = get_deferred_split_queue(page);
 	struct anon_vma *anon_vma = NULL;
 	struct address_space *mapping = NULL;
 	int count, mapcount, extra_pins, ret;
@@ -2827,8 +2841,7 @@ fail: if (mapping)
 
 void free_transhuge_page(struct page *page)
 {
-	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
-	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+	struct deferred_split *ds_queue = get_deferred_split_queue(page);
 	unsigned long flags;
 
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -2842,17 +2855,37 @@ void free_transhuge_page(struct page *page)
 
 void deferred_split_huge_page(struct page *page)
 {
-	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
-	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+	struct deferred_split *ds_queue = get_deferred_split_queue(page);
+#ifdef CONFIG_MEMCG
+	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+#endif
 	unsigned long flags;
 
 	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 
+	/*
+	 * The try_to_unmap() in page reclaim path might reach here too,
+	 * this may cause a race condition to corrupt deferred split queue.
+	 * And, if page reclaim is already handling the same page, it is
+	 * unnecessary to handle it again in shrinker.
+	 *
+	 * Check PageSwapCache to determine if the page is being
+	 * handled by page reclaim since THP swap would add the page into
+	 * swap cache before calling try_to_unmap().
+	 */
+	if (PageSwapCache(page))
+		return;
+
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	if (list_empty(page_deferred_list(page))) {
 		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
 		list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
 		ds_queue->split_queue_len++;
+#ifdef CONFIG_MEMCG
+		if (memcg)
+			memcg_set_shrinker_bit(memcg, page_to_nid(page),
+					       deferred_split_shrinker.id);
+#endif
 	}
 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 }
@@ -2862,6 +2895,11 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
 {
 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
 	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+
+#ifdef CONFIG_MEMCG
+	if (sc->memcg)
+		ds_queue = &sc->memcg->deferred_split_queue;
+#endif
 	return READ_ONCE(ds_queue->split_queue_len);
 }
 
@@ -2875,6 +2913,11 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 	struct page *page;
 	int split = 0;
 
+#ifdef CONFIG_MEMCG
+	if (sc->memcg)
+		ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	/* Take pin on all head pages to avoid freeing them under us */
 	list_for_each_safe(pos, next, &ds_queue->split_queue) {
@@ -2921,7 +2964,8 @@ static struct shrinker deferred_split_shrinker = {
 	.count_objects = deferred_split_count,
 	.scan_objects = deferred_split_scan,
 	.seeks = DEFAULT_SEEKS,
-	.flags = SHRINKER_NUMA_AWARE,
+	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+		 SHRINKER_NONSLAB,
 };
 
 #ifdef CONFIG_DEBUG_FS
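Both shrinker callbacks above select the queue the same way. Purely as an illustration of that shared pattern (shrink_ds_queue() is a hypothetical name, not something the patch adds):

    /* Hypothetical helper: resolve which deferred split queue a
     * shrink_control targets, i.e. the memcg's queue for memcg-targeted
     * reclaim, the node's queue otherwise. */
    static struct deferred_split *shrink_ds_queue(struct shrink_control *sc)
    {
    	struct deferred_split *ds_queue =
    		&NODE_DATA(sc->nid)->deferred_split_queue;
    
    #ifdef CONFIG_MEMCG
    	if (sc->memcg)
    		ds_queue = &sc->memcg->deferred_split_queue;
    #endif
    	return ds_queue;
    }

The new flag bits matter here: SHRINKER_MEMCG_AWARE makes reclaim invoke the shrinker per memcg, and SHRINKER_NONSLAB (introduced by the parent commit, 0a432dc) lets a memcg-aware shrinker run even when the memcg does no kmem/slab accounting.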

mm/memcontrol.c

Lines changed: 24 additions & 0 deletions
@@ -5070,6 +5070,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
 		memcg->cgwb_frn[i].done =
 			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+	memcg->deferred_split_queue.split_queue_len = 0;
 #endif
 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
 	return memcg;
@@ -5449,6 +5454,14 @@ static int mem_cgroup_move_account(struct page *page,
 		__mod_memcg_state(to, NR_WRITEBACK, nr_pages);
 	}
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (compound && !list_empty(page_deferred_list(page))) {
+		spin_lock(&from->deferred_split_queue.split_queue_lock);
+		list_del_init(page_deferred_list(page));
+		from->deferred_split_queue.split_queue_len--;
+		spin_unlock(&from->deferred_split_queue.split_queue_lock);
+	}
+#endif
 	/*
 	 * It is safe to change page->mem_cgroup here because the page
 	 * is referenced, charged, and isolated - we can't race with
@@ -5457,6 +5470,17 @@ static int mem_cgroup_move_account(struct page *page,
 
 	/* caller should have done css_get */
 	page->mem_cgroup = to;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (compound && list_empty(page_deferred_list(page))) {
+		spin_lock(&to->deferred_split_queue.split_queue_lock);
+		list_add_tail(page_deferred_list(page),
+			      &to->deferred_split_queue.split_queue);
+		to->deferred_split_queue.split_queue_len++;
+		spin_unlock(&to->deferred_split_queue.split_queue_lock);
+	}
+#endif
+
 	spin_unlock_irqrestore(&from->move_lock, flags);
 
 	ret = 0;
