Skip to content

Commit 8e8ae64

Browse files
hnaz authored and torvalds committed
mm: memcontrol: hook up vmpressure to socket pressure
Let the networking stack know when a memcg is under reclaim pressure so that it can clamp its transmit windows accordingly. Whenever the reclaim efficiency of a cgroup's LRU lists drops low enough for a MEDIUM or HIGH vmpressure event to occur, assert a pressure state in the socket and TCP memory code that tells it to curb consumption growth from sockets associated with said control group. Traditionally, vmpressure reports for the entire subtree of a memcg under pressure, which drops useful information on the individual groups reclaimed. However, it's too late to change the user interface, so add a second reporting mode that reports on the level of reclaim instead of at the level of pressure, and use that report for sockets. vmpressure events are naturally edge-triggered, so for hysteresis assert socket pressure for a second to allow for subsequent vmpressure events to occur before letting the socket code return to normal. This will likely need fine-tuning for a wider variety of workloads, but for now stick to the vmpressure presets and keep hysteresis simple. Signed-off-by: Johannes Weiner <[email protected]> Acked-by: David S. Miller <[email protected]> Reviewed-by: Vladimir Davydov <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent f7e1cb6 commit 8e8ae64

File tree

5 files changed

+104
-40
lines changed

5 files changed

+104
-40
lines changed

include/linux/memcontrol.h

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,10 @@ struct mem_cgroup {
249249
struct wb_domain cgwb_domain;
250250
#endif
251251

252+
#ifdef CONFIG_INET
253+
unsigned long socket_pressure;
254+
#endif
255+
252256
/* List of events which userspace want to receive */
253257
struct list_head event_list;
254258
spinlock_t event_list_lock;
@@ -290,18 +294,34 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
290294

291295
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
292296
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
293-
struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
294297

295298
static inline
296299
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
297300
return css ? container_of(css, struct mem_cgroup, css) : NULL;
298301
}
299302

303+
#define mem_cgroup_from_counter(counter, member) \
304+
container_of(counter, struct mem_cgroup, member)
305+
300306
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
301307
struct mem_cgroup *,
302308
struct mem_cgroup_reclaim_cookie *);
303309
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
304310

311+
/**
312+
* parent_mem_cgroup - find the accounting parent of a memcg
313+
* @memcg: memcg whose parent to find
314+
*
315+
* Returns the parent memcg, or NULL if this is the root or the memory
316+
* controller is in legacy no-hierarchy mode.
317+
*/
318+
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
319+
{
320+
if (!memcg->memory.parent)
321+
return NULL;
322+
return mem_cgroup_from_counter(memcg->memory.parent, memory);
323+
}
324+
305325
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
306326
struct mem_cgroup *root)
307327
{
@@ -689,10 +709,14 @@ extern struct static_key memcg_sockets_enabled_key;
689709
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
690710
{
691711
#ifdef CONFIG_MEMCG_KMEM
692-
return memcg->tcp_mem.memory_pressure;
693-
#else
694-
return false;
712+
if (memcg->tcp_mem.memory_pressure)
713+
return true;
695714
#endif
715+
do {
716+
if (time_before(jiffies, memcg->socket_pressure))
717+
return true;
718+
} while ((memcg = parent_mem_cgroup(memcg)));
719+
return false;
696720
}
697721
#else
698722
#define mem_cgroup_sockets_enabled 0

include/linux/vmpressure.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
struct vmpressure {
1313
unsigned long scanned;
1414
unsigned long reclaimed;
15+
16+
unsigned long tree_scanned;
17+
unsigned long tree_reclaimed;
1518
/* The lock is used to keep the scanned/reclaimed above in sync. */
1619
struct spinlock sr_lock;
1720

@@ -26,7 +29,7 @@ struct vmpressure {
2629
struct mem_cgroup;
2730

2831
#ifdef CONFIG_MEMCG
29-
extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
32+
extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
3033
unsigned long scanned, unsigned long reclaimed);
3134
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
3235

@@ -40,7 +43,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
4043
extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
4144
struct eventfd_ctx *eventfd);
4245
#else
43-
static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
46+
static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
4447
unsigned long scanned, unsigned long reclaimed) {}
4548
static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
4649
int prio) {}

mm/memcontrol.c

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1113,9 +1113,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
11131113
return ret;
11141114
}
11151115

1116-
#define mem_cgroup_from_counter(counter, member) \
1117-
container_of(counter, struct mem_cgroup, member)
1118-
11191116
/**
11201117
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
11211118
* @memcg: the memory cgroup
@@ -4183,17 +4180,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
41834180
kfree(memcg);
41844181
}
41854182

4186-
/*
4187-
* Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
4188-
*/
4189-
struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4190-
{
4191-
if (!memcg->memory.parent)
4192-
return NULL;
4193-
return mem_cgroup_from_counter(memcg->memory.parent, memory);
4194-
}
4195-
EXPORT_SYMBOL(parent_mem_cgroup);
4196-
41974183
static struct cgroup_subsys_state * __ref
41984184
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
41994185
{
@@ -4233,6 +4219,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
42334219
#endif
42344220
#ifdef CONFIG_CGROUP_WRITEBACK
42354221
INIT_LIST_HEAD(&memcg->cgwb_list);
4222+
#endif
4223+
#ifdef CONFIG_INET
4224+
memcg->socket_pressure = jiffies;
42364225
#endif
42374226
return &memcg->css;
42384227

mm/vmpressure.c

Lines changed: 59 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -137,14 +137,11 @@ struct vmpressure_event {
137137
};
138138

139139
static bool vmpressure_event(struct vmpressure *vmpr,
140-
unsigned long scanned, unsigned long reclaimed)
140+
enum vmpressure_levels level)
141141
{
142142
struct vmpressure_event *ev;
143-
enum vmpressure_levels level;
144143
bool signalled = false;
145144

146-
level = vmpressure_calc_level(scanned, reclaimed);
147-
148145
mutex_lock(&vmpr->events_lock);
149146

150147
list_for_each_entry(ev, &vmpr->events, node) {
@@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work)
164161
struct vmpressure *vmpr = work_to_vmpressure(work);
165162
unsigned long scanned;
166163
unsigned long reclaimed;
164+
enum vmpressure_levels level;
167165

168166
spin_lock(&vmpr->sr_lock);
169167
/*
@@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work)
174172
* here. No need for any locks here since we don't care if
175173
* vmpr->tree_reclaimed is in sync.
176174
*/
177-
scanned = vmpr->scanned;
175+
scanned = vmpr->tree_scanned;
178176
if (!scanned) {
179177
spin_unlock(&vmpr->sr_lock);
180178
return;
181179
}
182180

183-
reclaimed = vmpr->reclaimed;
184-
vmpr->scanned = 0;
185-
vmpr->reclaimed = 0;
181+
reclaimed = vmpr->tree_reclaimed;
182+
vmpr->tree_scanned = 0;
183+
vmpr->tree_reclaimed = 0;
186184
spin_unlock(&vmpr->sr_lock);
187185

186+
level = vmpressure_calc_level(scanned, reclaimed);
187+
188188
do {
189-
if (vmpressure_event(vmpr, scanned, reclaimed))
189+
if (vmpressure_event(vmpr, level))
190190
break;
191191
/*
192192
* If not handled, propagate the event upward into the
@@ -199,16 +199,24 @@ static void vmpressure_work_fn(struct work_struct *work)
199199
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
200200
* @gfp: reclaimer's gfp mask
201201
* @memcg: cgroup memory controller handle
202+
* @tree: legacy subtree mode
202203
* @scanned: number of pages scanned
203204
* @reclaimed: number of pages reclaimed
204205
*
205206
* This function should be called from the vmscan reclaim path to account
206207
* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
207208
* pressure index is then further refined and averaged over time.
208209
*
210+
* If @tree is set, vmpressure is in traditional userspace reporting
211+
* mode: @memcg is considered the pressure root and userspace is
212+
* notified of the entire subtree's reclaim efficiency.
213+
*
214+
* If @tree is not set, reclaim efficiency is recorded for @memcg, and
215+
* only in-kernel users are notified.
216+
*
209217
* This function does not return any value.
210218
*/
211-
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
219+
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
212220
unsigned long scanned, unsigned long reclaimed)
213221
{
214222
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
@@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
238246
if (!scanned)
239247
return;
240248

241-
spin_lock(&vmpr->sr_lock);
242-
vmpr->scanned += scanned;
243-
vmpr->reclaimed += reclaimed;
244-
scanned = vmpr->scanned;
245-
spin_unlock(&vmpr->sr_lock);
249+
if (tree) {
250+
spin_lock(&vmpr->sr_lock);
251+
vmpr->tree_scanned += scanned;
252+
vmpr->tree_reclaimed += reclaimed;
253+
scanned = vmpr->tree_scanned; /* window check must use the subtree counter accumulated just above, not the per-group one */
254+
spin_unlock(&vmpr->sr_lock);
246255

247-
if (scanned < vmpressure_win)
248-
return;
249-
schedule_work(&vmpr->work);
256+
if (scanned < vmpressure_win)
257+
return;
258+
schedule_work(&vmpr->work);
259+
} else {
260+
enum vmpressure_levels level;
261+
262+
/* For now, no users for root-level efficiency */
263+
if (memcg == root_mem_cgroup)
264+
return;
265+
266+
spin_lock(&vmpr->sr_lock);
267+
scanned = vmpr->scanned += scanned;
268+
reclaimed = vmpr->reclaimed += reclaimed;
269+
if (scanned < vmpressure_win) {
270+
spin_unlock(&vmpr->sr_lock);
271+
return;
272+
}
273+
vmpr->scanned = vmpr->reclaimed = 0;
274+
spin_unlock(&vmpr->sr_lock);
275+
276+
level = vmpressure_calc_level(scanned, reclaimed);
277+
278+
if (level > VMPRESSURE_LOW) {
279+
/*
280+
* Let the socket buffer allocator know that
281+
* we are having trouble reclaiming LRU pages.
282+
*
283+
* For hysteresis keep the pressure state
284+
* asserted for a second in which subsequent
285+
* pressure events can occur.
286+
*/
287+
memcg->socket_pressure = jiffies + HZ;
288+
}
289+
}
250290
}
251291

252292
/**
@@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
276316
* to the vmpressure() basically means that we signal 'critical'
277317
* level.
278318
*/
279-
vmpressure(gfp, memcg, vmpressure_win, 0);
319+
vmpressure(gfp, memcg, true, vmpressure_win, 0);
280320
}
281321

282322
/**

mm/vmscan.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2396,6 +2396,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
23962396
memcg = mem_cgroup_iter(root, NULL, &reclaim);
23972397
do {
23982398
unsigned long lru_pages;
2399+
unsigned long reclaimed;
23992400
unsigned long scanned;
24002401
struct lruvec *lruvec;
24012402
int swappiness;
@@ -2408,6 +2409,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
24082409

24092410
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
24102411
swappiness = mem_cgroup_swappiness(memcg);
2412+
reclaimed = sc->nr_reclaimed;
24112413
scanned = sc->nr_scanned;
24122414

24132415
shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
@@ -2418,6 +2420,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
24182420
memcg, sc->nr_scanned - scanned,
24192421
lru_pages);
24202422

2423+
/* Record the group's reclaim efficiency */
2424+
vmpressure(sc->gfp_mask, memcg, false,
2425+
sc->nr_scanned - scanned,
2426+
sc->nr_reclaimed - reclaimed);
2427+
24212428
/*
24222429
* Direct reclaim and kswapd have to scan all memory
24232430
* cgroups to fulfill the overall scan target for the
@@ -2449,7 +2456,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
24492456
reclaim_state->reclaimed_slab = 0;
24502457
}
24512458

2452-
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2459+
/* Record the subtree's reclaim efficiency */
2460+
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
24532461
sc->nr_scanned - nr_scanned,
24542462
sc->nr_reclaimed - nr_reclaimed);
24552463

0 commit comments

Comments
 (0)