Commit 5abc1e3

Muchun Song authored and torvalds committed
mm: list_lru: allocate list_lru_one only when needed
In our server, we found a suspected memory leak problem. The kmalloc-32
consumes more than 6GB of memory. Other kmem_caches consume less than 2GB
of memory. After our in-depth analysis, the memory consumption of the
kmalloc-32 slab cache is caused by list_lru_one allocation.

  crash> p memcg_nr_cache_ids
  memcg_nr_cache_ids = $2 = 24574

memcg_nr_cache_ids is very large, and the memory consumption of each
list_lru can be calculated with the following formula.

  num_numa_node * memcg_nr_cache_ids * 32 (kmalloc-32)

There are 4 numa nodes in our system, so each list_lru consumes ~3MB.

  crash> list super_blocks | wc -l
  952

Every mount will register 2 list lrus, one for inodes and another for
dentries. There are 952 super_blocks, so the total memory is
952 * 2 * 3 MB (~5.6GB). But the number of memory cgroups is less than
500. So I guess more than 12286 containers have been deployed on this
machine (I do not know why there are so many containers; it may be a
user bug, or the user really wants to do that). And memcg_nr_cache_ids
has not been reduced to a suitable value. This can waste a lot of memory.

Now the infrastructure for dynamic list_lru_one allocation is ready, so
remove the statically allocated memory code to save memory.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Muchun Song <[email protected]>
Cc: Alex Shi <[email protected]>
Cc: Anna Schumaker <[email protected]>
Cc: Chao Yu <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Fam Zheng <[email protected]>
Cc: Jaegeuk Kim <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Kari Argillander <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Qi Zheng <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Shakeel Butt <[email protected]>
Cc: Theodore Ts'o <[email protected]>
Cc: Trond Myklebust <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Wei Yang <[email protected]>
Cc: Xiongchun Duan <[email protected]>
Cc: Yang Shi <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
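For readers checking the arithmetic, the ~3MB and ~5.6GB estimates can be reproduced with a few lines of standalone C (all constants are copied from the report above; nothing here is kernel API):

	#include <stdio.h>

	int main(void)
	{
		long nr_numa_nodes = 4;			/* nodes on the reported machine */
		long memcg_nr_cache_ids = 24574;	/* from the crash session above */
		long entry_size = 32;			/* list_lru_one lands in kmalloc-32 */
		long nr_super_blocks = 952;		/* from "crash> list super_blocks | wc -l" */
		long lrus_per_sb = 2;			/* one inode lru + one dentry lru */

		long per_lru = nr_numa_nodes * memcg_nr_cache_ids * entry_size;
		long total = nr_super_blocks * lrus_per_sb * per_lru;

		printf("per list_lru: ~%.1f MB\n", per_lru / (double)(1L << 20));
		printf("total:        ~%.1f GB\n", total / (double)(1L << 30));
		return 0;
	}

This prints ~3.0 MB per list_lru and ~5.6 GB total, matching the figures in the report.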
1 parent da0efe3 commit 5abc1e3

File tree: 3 files changed (+77, -57 lines)


include/linux/list_lru.h

Lines changed: 4 additions & 3 deletions
@@ -32,14 +32,15 @@ struct list_lru_one {
 };
 
 struct list_lru_per_memcg {
+	struct rcu_head		rcu;
 	/* array of per cgroup per node lists, indexed by node id */
-	struct list_lru_one	node[0];
+	struct list_lru_one	node[];
 };
 
 struct list_lru_memcg {
 	struct rcu_head		rcu;
 	/* array of per cgroup lists, indexed by memcg_cache_id */
-	struct list_lru_per_memcg	*mlru[];
+	struct list_lru_per_memcg __rcu	*mlru[];
 };
 
 struct list_lru_node {
@@ -77,7 +78,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
 int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
			 gfp_t gfp);
 int memcg_update_all_list_lrus(int num_memcgs);
-void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg);
+void memcg_drain_all_list_lrus(struct mem_cgroup *src, struct mem_cgroup *dst);
 
 /**
  * list_lru_add: add an element to the lru list's tail
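An aside on the node[0] to node[] change above: node[] is the standard C99 flexible array member, the form modern kernels standardize on, and such structs are sized with struct_size() (used later in this patch), which adds overflow checking. A minimal userspace sketch of the same layout, with a hypothetical "demo" struct and plain malloc() standing in for struct_size()/kvmalloc():

	#include <stdio.h>
	#include <stdlib.h>

	struct demo {
		long header;
		int items[];		/* flexible array member, like node[] */
	};

	int main(void)
	{
		int n = 4;
		/* allocate the header plus n trailing elements in one block */
		struct demo *d = malloc(sizeof(*d) + n * sizeof(d->items[0]));

		if (!d)
			return 1;
		for (int i = 0; i < n; i++)
			d->items[i] = i * i;
		printf("items[3] = %d\n", d->items[3]);
		free(d);
		return 0;
	}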

mm/list_lru.c

Lines changed: 68 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,12 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
	 * from relocation (see memcg_update_list_lru).
	 */
	mlrus = rcu_dereference_check(lru->mlrus, lockdep_is_held(&nlru->lock));
-	if (mlrus && idx >= 0)
-		return &mlrus->mlru[idx]->node[nid];
+	if (mlrus && idx >= 0) {
+		struct list_lru_per_memcg *mlru;
+
+		mlru = rcu_dereference_check(mlrus->mlru[idx], true);
+		return mlru ? &mlru->node[nid] : NULL;
+	}
	return &nlru->lru;
 }
 
@@ -188,7 +192,7 @@ unsigned long list_lru_count_one(struct list_lru *lru,
 
	rcu_read_lock();
	l = list_lru_from_memcg_idx(lru, nid, memcg_cache_id(memcg));
-	count = READ_ONCE(l->nr_items);
+	count = l ? READ_ONCE(l->nr_items) : 0;
	rcu_read_unlock();
 
	if (unlikely(count < 0))
@@ -217,8 +221,11 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
	struct list_head *item, *n;
	unsigned long isolated = 0;
 
-	l = list_lru_from_memcg_idx(lru, nid, memcg_idx);
 restart:
+	l = list_lru_from_memcg_idx(lru, nid, memcg_idx);
+	if (!l)
+		goto out;
+
	list_for_each_safe(item, n, &l->list) {
		enum lru_status ret;
 
@@ -262,6 +269,7 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
			BUG();
		}
	}
+out:
	return isolated;
 }
 
@@ -354,20 +362,25 @@ static struct list_lru_per_memcg *memcg_init_list_lru_one(gfp_t gfp)
	return mlru;
 }
 
-static int memcg_init_list_lru_range(struct list_lru_memcg *mlrus,
-				     int begin, int end)
+static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
 {
-	int i;
+	struct list_lru_memcg *mlrus;
+	struct list_lru_per_memcg *mlru;
 
-	for (i = begin; i < end; i++) {
-		mlrus->mlru[i] = memcg_init_list_lru_one(GFP_KERNEL);
-		if (!mlrus->mlru[i])
-			goto fail;
-	}
-	return 0;
-fail:
-	memcg_destroy_list_lru_range(mlrus, begin, i);
-	return -ENOMEM;
+	spin_lock_irq(&lru->lock);
+	mlrus = rcu_dereference_protected(lru->mlrus, true);
+	mlru = rcu_dereference_protected(mlrus->mlru[src_idx], true);
+	rcu_assign_pointer(mlrus->mlru[src_idx], NULL);
+	spin_unlock_irq(&lru->lock);
+
+	/*
+	 * The __list_lru_walk_one() can walk the list of this node.
+	 * We need kvfree_rcu() here. And the walking of the list
+	 * is under lru->node[nid]->lock, which can serve as a RCU
+	 * read-side critical section.
+	 */
+	if (mlru)
+		kvfree_rcu(mlru, rcu);
 }
 
 static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
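The unpublish-then-defer-free sequence added in memcg_list_lru_free() above is the heart of this patch. As an aside, a userspace analogue can be sketched with liburcu (assumed installed, compiled with -lurcu); since kvfree_rcu() has no direct userspace equivalent, synchronize_rcu() followed by free() stands in for it, and all names below are illustrative rather than kernel API:

	#include <urcu.h>		/* liburcu, classic flavor */
	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct mlru { long nr_items; };

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct mlru *slot;	/* stands in for mlrus->mlru[src_idx] */

	static void retire_slot(void)
	{
		struct mlru *old;

		pthread_mutex_lock(&lock);	/* stands in for lru->lock */
		old = slot;			/* writer side, serialized by the lock */
		rcu_assign_pointer(slot, NULL);	/* new readers now see NULL */
		pthread_mutex_unlock(&lock);

		if (old) {
			synchronize_rcu();	/* wait out pre-existing readers */
			free(old);
		}
	}

	int main(void)
	{
		rcu_register_thread();
		slot = calloc(1, sizeof(*slot));
		retire_slot();
		printf("slot retired: %p\n", (void *)slot);
		rcu_unregister_thread();
		return 0;
	}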
@@ -381,14 +394,10 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
 
	spin_lock_init(&lru->lock);
 
-	mlrus = kvmalloc(struct_size(mlrus, mlru, size), GFP_KERNEL);
+	mlrus = kvzalloc(struct_size(mlrus, mlru, size), GFP_KERNEL);
	if (!mlrus)
		return -ENOMEM;
 
-	if (memcg_init_list_lru_range(mlrus, 0, size)) {
-		kvfree(mlrus);
-		return -ENOMEM;
-	}
	RCU_INIT_POINTER(lru->mlrus, mlrus);
 
	return 0;
@@ -422,34 +431,16 @@ static int memcg_update_list_lru(struct list_lru *lru, int old_size, int new_size)
	if (!new)
		return -ENOMEM;
 
-	if (memcg_init_list_lru_range(new, old_size, new_size)) {
-		kvfree(new);
-		return -ENOMEM;
-	}
-
	spin_lock_irq(&lru->lock);
	memcpy(&new->mlru, &old->mlru, flex_array_size(new, mlru, old_size));
+	memset(&new->mlru[old_size], 0, flex_array_size(new, mlru, new_size - old_size));
	rcu_assign_pointer(lru->mlrus, new);
	spin_unlock_irq(&lru->lock);
 
	kvfree_rcu(old, rcu);
	return 0;
 }
 
-static void memcg_cancel_update_list_lru(struct list_lru *lru,
-					 int old_size, int new_size)
-{
-	struct list_lru_memcg *mlrus;
-
-	mlrus = rcu_dereference_protected(lru->mlrus,
-					  lockdep_is_held(&list_lrus_mutex));
-	/*
-	 * Do not bother shrinking the array back to the old size, because we
-	 * cannot handle allocation failures here.
-	 */
-	memcg_destroy_list_lru_range(mlrus, old_size, new_size);
-}
-
 int memcg_update_all_list_lrus(int new_size)
 {
	int ret = 0;
@@ -460,15 +451,10 @@ int memcg_update_all_list_lrus(int new_size)
	list_for_each_entry(lru, &memcg_list_lrus, list) {
		ret = memcg_update_list_lru(lru, old_size, new_size);
		if (ret)
-			goto fail;
+			break;
	}
-out:
	mutex_unlock(&list_lrus_mutex);
	return ret;
-fail:
-	list_for_each_entry_continue_reverse(lru, &memcg_list_lrus, list)
-		memcg_cancel_update_list_lru(lru, old_size, new_size);
-	goto out;
 }
 
 static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
@@ -485,6 +471,8 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
	spin_lock_irq(&nlru->lock);
 
	src = list_lru_from_memcg_idx(lru, nid, src_idx);
+	if (!src)
+		goto out;
	dst = list_lru_from_memcg_idx(lru, nid, dst_idx);
 
	list_splice_init(&src->list, &dst->list);
@@ -494,7 +482,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
			set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
		src->nr_items = 0;
	}
-
+out:
	spin_unlock_irq(&nlru->lock);
 }
 
@@ -505,15 +493,41 @@ static void memcg_drain_list_lru(struct list_lru *lru,
 
	for_each_node(i)
		memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg);
+
+	memcg_list_lru_free(lru, src_idx);
 }
 
-void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
+void memcg_drain_all_list_lrus(struct mem_cgroup *src, struct mem_cgroup *dst)
 {
+	struct cgroup_subsys_state *css;
	struct list_lru *lru;
+	int src_idx = src->kmemcg_id;
+
+	/*
+	 * Change kmemcg_id of this cgroup and all its descendants to the
+	 * parent's id, and then move all entries from this cgroup's list_lrus
+	 * to ones of the parent.
+	 *
+	 * After we have finished, all list_lrus corresponding to this cgroup
+	 * are guaranteed to remain empty. So we can safely free this cgroup's
+	 * list lrus in memcg_list_lru_free().
+	 *
+	 * Changing ->kmemcg_id to the parent can prevent memcg_list_lru_alloc()
+	 * from allocating list lrus for this cgroup after memcg_list_lru_free()
+	 * call.
+	 */
+	rcu_read_lock();
+	css_for_each_descendant_pre(css, &src->css) {
+		struct mem_cgroup *memcg;
+
+		memcg = mem_cgroup_from_css(css);
+		memcg->kmemcg_id = dst->kmemcg_id;
+	}
+	rcu_read_unlock();
 
	mutex_lock(&list_lrus_mutex);
	list_for_each_entry(lru, &memcg_list_lrus, list)
-		memcg_drain_list_lru(lru, src_idx, dst_memcg);
+		memcg_drain_list_lru(lru, src_idx, dst);
	mutex_unlock(&list_lrus_mutex);
 }
 
@@ -528,7 +542,7 @@ static bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
		return true;
 
	rcu_read_lock();
-	allocated = !!rcu_dereference(lru->mlrus)->mlru[idx];
+	allocated = !!rcu_access_pointer(rcu_dereference(lru->mlrus)->mlru[idx]);
	rcu_read_unlock();
 
	return allocated;
@@ -576,11 +590,12 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
	mlrus = rcu_dereference_protected(lru->mlrus, true);
	while (i--) {
		int index = table[i].memcg->kmemcg_id;
+		struct list_lru_per_memcg *mlru = table[i].mlru;
 
-		if (mlrus->mlru[index])
-			kfree(table[i].mlru);
+		if (index < 0 || rcu_dereference_protected(mlrus->mlru[index], true))
+			kfree(mlru);
		else
-			rcu_assign_pointer(mlrus->mlru[index], mlru);
+			rcu_assign_pointer(mlrus->mlru[index], mlru);
	}
	spin_unlock_irqrestore(&lru->lock, flags);
 
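The last hunk follows a common pattern: allocate outside the lock, then under the lock either publish the new object or discard it when the slot is invalid or already filled. A self-contained toy restatement in plain C, with a pthread mutex standing in for lru->lock and all names illustrative:

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define NR_SLOTS 64

	struct per_memcg { long nr_items; };

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct per_memcg *table[NR_SLOTS];	/* like mlrus->mlru[] */

	static int lazy_alloc(int idx)
	{
		/* allocate before taking the lock, keeping the critical section short */
		struct per_memcg *new = calloc(1, sizeof(*new));

		if (!new)
			return -1;

		pthread_mutex_lock(&lock);
		if (idx < 0 || idx >= NR_SLOTS || table[idx])
			free(new);		/* bad index, or lost the race: discard */
		else
			table[idx] = new;	/* the kernel publishes with rcu_assign_pointer() */
		pthread_mutex_unlock(&lock);
		return 0;
	}

	int main(void)
	{
		lazy_alloc(3);
		lazy_alloc(3);	/* second call discards its copy */
		printf("slot 3: %p\n", (void *)table[3]);
		return 0;
	}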

mm/memcontrol.c

Lines changed: 5 additions & 1 deletion
@@ -3709,6 +3709,10 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
 
	memcg_reparent_objcgs(memcg, parent);
 
+	/*
+	 * memcg_drain_all_list_lrus() can change memcg->kmemcg_id.
+	 * Cache it to local @kmemcg_id.
+	 */
	kmemcg_id = memcg->kmemcg_id;
 
	/*
@@ -3717,7 +3721,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
	 * The ordering is imposed by list_lru_node->lock taken by
	 * memcg_drain_all_list_lrus().
	 */
-	memcg_drain_all_list_lrus(kmemcg_id, parent);
+	memcg_drain_all_list_lrus(memcg, parent);
 
	memcg_free_cache_id(kmemcg_id);
 }
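Why the snapshot matters: the drain call now rewrites memcg->kmemcg_id to the parent's id, yet it is the old id that must be released afterwards. A toy C program (hypothetical names throughout) demonstrating the same ordering:

	#include <stdio.h>

	struct group { int id; };

	static void drain(struct group *g, int parent_id)
	{
		g->id = parent_id;	/* mirrors the css_for_each_descendant_pre() walk */
	}

	int main(void)
	{
		struct group g = { .id = 42 };
		int cached = g.id;	/* like "kmemcg_id = memcg->kmemcg_id" */

		drain(&g, 1);
		printf("free old id %d (field is now %d)\n", cached, g.id);
		return 0;
	}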
