Skip to content

Commit aa48e47

Browse files
shakeelb authored and torvalds committed
memcg: infrastructure to flush memcg stats
At the moment memcg stats are read in four contexts:
1. memcg stat user interfaces
2. dirty throttling
3. page fault
4. memory reclaim

Currently the kernel flushes the stats for the first two cases. Flushing the stats for the remaining two cases may have a performance impact. Always flushing the memcg stats on the page fault code path may negatively impact the performance of applications. In addition, flushing in the memory reclaim code path, though treated as a slowpath, can become a source of contention for the global lock taken for stat flushing, because when the system or a memcg is under memory pressure, many tasks may enter the reclaim path.

This patch uses the following mechanisms to solve these challenges:
1. Periodically flush the stats from the root memcg every 2 seconds. This puts a time limit on how long the stats can be out of sync.
2. Asynchronously flush the stats after a fixed number of stat updates. In the worst case the stats can be out of sync by O(nr_cpus * BATCH) for 2 seconds.
3. To avoid a thundering herd of stat flushers, particularly from the memory reclaim context, introduce a memcg-local spinlock and let only one flusher be active at a time. This could have been done through cgroup_rstat_lock, but that lock is used by other subsystems and by userspace reads of memcg stats. So, it is better to keep the flushers introduced by this patch decoupled from cgroup_rstat_lock. However, we have to use the irqsafe version of rstat flush, but that is fine as this code path flushes the whole tree and does the work for everyone. No one will be waiting for that worker.
[[email protected]: fix sleep-in-wrong context bug] Link: https://lkml.kernel.org/r/[email protected] Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Shakeel Butt <[email protected]> Tested-by: Marek Szyprowski <[email protected]> Cc: Hillf Danton <[email protected]> Cc: Huang Ying <[email protected]> Cc: Johannes Weiner <[email protected]> Cc: Michal Hocko <[email protected]> Cc: Michal Koutný <[email protected]> Cc: Muchun Song <[email protected]> Cc: Roman Gushchin <[email protected]> Cc: Tejun Heo <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 7e1c0d6 commit aa48e47

File tree

3 files changed

+46
-0
lines changed

3 files changed

+46
-0
lines changed

include/linux/memcontrol.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
10231023
return x;
10241024
}
10251025

1026+
void mem_cgroup_flush_stats(void);
1027+
10261028
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
10271029
int val);
10281030
void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
@@ -1438,6 +1440,10 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
14381440
return node_page_state(lruvec_pgdat(lruvec), idx);
14391441
}
14401442

1443+
/*
 * Stub variant of mem_cgroup_flush_stats(): the body is empty, so a call
 * does nothing.
 * NOTE(review): presumably this is the definition used when memcg support
 * is compiled out; the enclosing #ifdef is not visible in this hunk.
 */
static inline void mem_cgroup_flush_stats(void)
1444+
{
1445+
}
1446+
14411447
static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
14421448
enum node_stat_item idx, int val)
14431449
{

mm/memcontrol.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,14 @@ static bool do_memsw_account(void)
103103
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
104104
}
105105

106+
/*
 * memcg and lruvec stats flushing
 *
 * stats_flush_dwork     - deferrable delayed work running
 *                         flush_memcg_stats_dwork(); it is first queued when
 *                         the root memcg comes online and re-queues itself.
 * stats_flush_work      - plain work item running flush_memcg_stats_work();
 *                         queued from __mod_memcg_lruvec_state() each time
 *                         the per-cpu counter reaches a multiple of
 *                         MEMCG_CHARGE_BATCH.
 * stats_flush_threshold - per-cpu count of lruvec stat updates.
 * stats_flush_lock      - taken with spin_trylock() in
 *                         mem_cgroup_flush_stats() so that only one flusher
 *                         runs at a time.
 */
107+
static void flush_memcg_stats_dwork(struct work_struct *w);
108+
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
109+
static void flush_memcg_stats_work(struct work_struct *w);
110+
static DECLARE_WORK(stats_flush_work, flush_memcg_stats_work);
111+
static DEFINE_PER_CPU(unsigned int, stats_flush_threshold);
112+
static DEFINE_SPINLOCK(stats_flush_lock);
113+
106114
#define THRESHOLDS_EVENTS_TARGET 128
107115
#define SOFTLIMIT_EVENTS_TARGET 1024
108116

@@ -674,6 +682,8 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
674682

675683
/* Update lruvec */
676684
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
685+
if (!(__this_cpu_inc_return(stats_flush_threshold) % MEMCG_CHARGE_BATCH))
686+
queue_work(system_unbound_wq, &stats_flush_work);
677687
}
678688

679689
/**
@@ -5240,6 +5250,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
52405250
/* Online state pins memcg ID, memcg ID pins CSS */
52415251
refcount_set(&memcg->id.ref, 1);
52425252
css_get(css);
5253+
5254+
if (unlikely(mem_cgroup_is_root(memcg)))
5255+
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
5256+
2UL*HZ);
52435257
return 0;
52445258
}
52455259

@@ -5331,6 +5345,26 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
53315345
memcg_wb_domain_size_changed(memcg);
53325346
}
53335347

5348+
/*
 * mem_cgroup_flush_stats - flush rstat for the whole memcg tree, best effort.
 *
 * Tries to take stats_flush_lock without spinning. If the lock is already
 * held, another flusher is active and this call returns immediately without
 * flushing. Otherwise it flushes the cgroup of root_mem_cgroup using the
 * irqsafe rstat flush variant and then drops the lock.
 *
 * Callers therefore get no guarantee that the stats were flushed by the time
 * this function returns.
 */
void mem_cgroup_flush_stats(void)
5349+
{
5350+
/* Only one flusher at a time; everyone else skips the flush. */
if (!spin_trylock(&stats_flush_lock))
5351+
return;
5352+
5353+
cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
5354+
spin_unlock(&stats_flush_lock);
5355+
}
5356+
5357+
/*
 * Periodic flusher: handler for stats_flush_dwork.
 *
 * Flushes the memcg stats and then re-queues itself on system_unbound_wq
 * with a delay of 2UL*HZ jiffies, so the work keeps running once it has
 * been queued for the first time.
 *
 * @w: the work item being run (unused).
 */
static void flush_memcg_stats_dwork(struct work_struct *w)
5358+
{
5359+
mem_cgroup_flush_stats();
5360+
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
5361+
}
5362+
5363+
/*
 * One-shot flusher: handler for stats_flush_work, which is queued from the
 * stat update path in __mod_memcg_lruvec_state(). It simply calls
 * mem_cgroup_flush_stats().
 *
 * @w: the work item being run (unused).
 */
static void flush_memcg_stats_work(struct work_struct *w)
5364+
{
5365+
mem_cgroup_flush_stats();
5366+
}
5367+
53345368
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
53355369
{
53365370
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

mm/vmscan.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2897,6 +2897,12 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
28972897
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
28982898

28992899
again:
2900+
/*
2901+
* Flush the memory cgroup stats, so that we read accurate per-memcg
2902+
* lruvec stats for heuristics.
2903+
*/
2904+
mem_cgroup_flush_stats();
2905+
29002906
memset(&sc->nr, 0, sizeof(sc->nr));
29012907

29022908
nr_reclaimed = sc->nr_reclaimed;

0 commit comments

Comments
 (0)