
Commit 2d146aa

hnaz authored and torvalds committed
mm: memcontrol: switch to rstat
Replace the memory controller's custom hierarchical stats code with the generic rstat infrastructure provided by the cgroup core.

The current implementation does batched upward propagation from the write side (i.e. as stats change). The per-cpu batches introduce an error, which is multiplied by the number of subgroups in a tree. In systems with many CPUs and sizable cgroup trees, the error can be large enough to confuse users (e.g. 32 batch pages * 32 CPUs * 32 subgroups results in an error of up to 128M per stat item). This can entirely swallow allocation bursts inside a workload that the user is expecting to see reflected in the statistics.

In the past, we've done read-side aggregation, where a memory.stat read would have to walk the entire subtree and add up per-cpu counts. This became problematic with lazily-freed cgroups: we could have large subtrees where most cgroups were entirely idle. Hence the switch to change-driven upward propagation. Unfortunately, it needed to trade accuracy for speed due to the write side being so hot.

Rstat combines the best of both worlds: from the write side, it cheaply maintains a queue of cgroups that have pending changes, so that the read side can do selective tree aggregation. This way the reported stats will always be precise and recent as can be, while the aggregation can skip over potentially large numbers of idle cgroups.

The way rstat works is that it implements a tree for tracking cgroups with pending local changes, as well as a flush function that walks the tree upwards. The controller then drives this by 1) telling rstat when a local cgroup stat changes (e.g. mod_memcg_state) and 2) when a flush is required to get uptodate hierarchy stats for a given subtree (e.g. when memory.stat is read). The controller also provides a flush callback that is called during the rstat flush walk for each cgroup and aggregates its local per-cpu counters and propagates them upwards.

This adds a second vmstats to struct mem_cgroup (MEMCG_NR_STAT + NR_VM_EVENT_ITEMS) to track pending subtree deltas during upward aggregation. It removes 3 words from the per-cpu data. It eliminates memcg_exact_page_state(), since memcg_page_state() is now exact.

[[email protected]: merge fix]
[[email protected]: fix a sleep in atomic section problem]

Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Johannes Weiner <[email protected]>
Reviewed-by: Roman Gushchin <[email protected]>
Acked-by: Michal Hocko <[email protected]>
Reviewed-by: Shakeel Butt <[email protected]>
Reviewed-by: Michal Koutný <[email protected]>
Acked-by: Balbir Singh <[email protected]>
Cc: Tejun Heo <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
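To make the division of labor concrete: the write-side hook only touches local per-cpu counters and marks the cgroup as having pending changes, while readers trigger the selective flush. The sketch below illustrates that pattern with the cgroup core's cgroup_rstat_updated()/cgroup_rstat_flush() API. The mm/memcontrol.c half of this commit is not rendered on this page, so the bodies are illustrative rather than a verbatim copy of the patch, and memory_stat_read_one() is a hypothetical reader helper.

	/* Write side: record the change locally and queue the cgroup for flushing. */
	void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
	{
		if (mem_cgroup_disabled())
			return;

		/* Cheap per-cpu update; no upward propagation on this path. */
		__this_cpu_add(memcg->vmstats_percpu->state[idx], val);

		/* Tell the rstat core this cgroup has pending changes on this CPU. */
		cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
	}

	/* Read side (hypothetical helper): flush the subtree, then report. */
	static unsigned long memory_stat_read_one(struct mem_cgroup *memcg, int idx)
	{
		/* Aggregates only the cgroups that queued themselves above. */
		cgroup_rstat_flush(memcg->css.cgroup);

		return READ_ONCE(memcg->vmstats.state[idx]);
	}

Because the flush only visits cgroups that queued themselves via cgroup_rstat_updated(), idle subtrees cost nothing on the read side, which is exactly the property the message above is after.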
1 parent: dc26532

2 files changed: 127 additions, 158 deletions

include/linux/memcontrol.h

Lines changed: 40 additions & 27 deletions
@@ -76,10 +76,27 @@ enum mem_cgroup_events_target {
 };
 
 struct memcg_vmstats_percpu {
-	long stat[MEMCG_NR_STAT];
-	unsigned long events[NR_VM_EVENT_ITEMS];
-	unsigned long nr_page_events;
-	unsigned long targets[MEM_CGROUP_NTARGETS];
+	/* Local (CPU and cgroup) page state & events */
+	long state[MEMCG_NR_STAT];
+	unsigned long events[NR_VM_EVENT_ITEMS];
+
+	/* Delta calculation for lockless upward propagation */
+	long state_prev[MEMCG_NR_STAT];
+	unsigned long events_prev[NR_VM_EVENT_ITEMS];
+
+	/* Cgroup1: threshold notifications & softlimit tree updates */
+	unsigned long nr_page_events;
+	unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct memcg_vmstats {
+	/* Aggregated (CPU and subtree) page state & events */
+	long state[MEMCG_NR_STAT];
+	unsigned long events[NR_VM_EVENT_ITEMS];
+
+	/* Pending child counts during tree propagation */
+	long state_pending[MEMCG_NR_STAT];
+	unsigned long events_pending[NR_VM_EVENT_ITEMS];
 };
 
 struct mem_cgroup_reclaim_iter {
@@ -287,8 +304,8 @@ struct mem_cgroup {
 
 	MEMCG_PADDING(_pad1_);
 
-	atomic_long_t		vmstats[MEMCG_NR_STAT];
-	atomic_long_t		vmevents[NR_VM_EVENT_ITEMS];
+	/* memory.stat */
+	struct memcg_vmstats	vmstats;
 
 	/* memory.events */
 	atomic_long_t		memory_events[MEMCG_NR_MEMORY_EVENTS];
@@ -315,10 +332,6 @@ struct mem_cgroup {
 	atomic_t		moving_account;
 	struct task_struct	*move_lock_task;
 
-	/* Legacy local VM stats and events */
-	struct memcg_vmstats_percpu __percpu *vmstats_local;
-
-	/* Subtree VM stats and events (batched updates) */
 	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -939,10 +952,6 @@ static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
 	local_irq_restore(flags);
 }
 
-unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
-						gfp_t gfp_mask,
-						unsigned long *total_scanned);
-
 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 			  unsigned long count);
 
@@ -1023,6 +1032,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 
 void split_page_memcg(struct page *head, unsigned int nr);
 
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+						gfp_t gfp_mask,
+						unsigned long *total_scanned);
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1131,6 +1144,10 @@ static inline bool lruvec_holds_page_lru_lock(struct page *page,
 	return lruvec == &pgdat->__lruvec;
 }
 
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
+
 static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 {
 	return NULL;
@@ -1334,18 +1351,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
 	mod_node_page_state(page_pgdat(page), idx, val);
 }
 
-static inline
-unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
-					    gfp_t gfp_mask,
-					    unsigned long *total_scanned)
-{
-	return 0;
-}
-
-static inline void split_page_memcg(struct page *head, unsigned int nr)
-{
-}
-
 static inline void count_memcg_events(struct mem_cgroup *memcg,
 				      enum vm_event_item idx,
 				      unsigned long count)
@@ -1368,8 +1373,16 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
 
-static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+static inline void split_page_memcg(struct page *head, unsigned int nr)
+{
+}
+
+static inline
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
 {
+	return 0;
 }
 #endif /* CONFIG_MEMCG */

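The header changes above only add the bookkeeping fields; the flush callback that consumes state_prev and state_pending lives in mm/memcontrol.c, the second file in this commit, whose diff is not rendered on this page. Below is a sketch of how such a css_rstat_flush handler would process one cgroup for one CPU during the rstat flush walk; it follows the scheme described in the commit message, but the exact code in the patch may differ.

	static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
	{
		struct mem_cgroup *memcg = mem_cgroup_from_css(css);
		struct mem_cgroup *parent = parent_mem_cgroup(memcg);
		struct memcg_vmstats_percpu *statc;
		long delta, v;
		int i;

		statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);

		for (i = 0; i < MEMCG_NR_STAT; i++) {
			/* Start with whatever the children have pushed up to us. */
			delta = memcg->vmstats.state_pending[i];
			if (delta)
				memcg->vmstats.state_pending[i] = 0;

			/* Add this CPU's changes since the last flush. */
			v = READ_ONCE(statc->state[i]);
			if (v != statc->state_prev[i]) {
				delta += v - statc->state_prev[i];
				statc->state_prev[i] = v;
			}

			if (!delta)
				continue;

			/* Fold into our aggregated counters and hand the delta to the parent. */
			memcg->vmstats.state[i] += delta;
			if (parent)
				parent->vmstats.state_pending[i] += delta;
		}

		/* The NR_VM_EVENT_ITEMS events/events_prev/events_pending arrays
		 * would be walked the same way. */
	}

Wiring this up would then amount to setting .css_rstat_flush in memory_cgrp_subsys, which is how the cgroup core knows to invoke the controller for each pending cgroup during cgroup_rstat_flush().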