
Commit f0a3a24

rgushchin authored and torvalds committed
mm: memcg/slab: rework non-root kmem_cache lifecycle management
Currently each charged slab page holds a reference to the cgroup to which it's charged. Kmem_caches are held by the memcg and are released all together with the memory cgroup. It means that none of the kmem_caches are released unless at least one reference to the memcg exists, which is very far from optimal.

Let's rework it in a way that allows releasing individual kmem_caches as soon as the cgroup is offline, the kmem_cache is empty, and there are no pending allocations.

To make it possible, let's introduce a new percpu refcounter for non-root kmem caches. The counter is initialized to the percpu mode, and is switched to the atomic mode during kmem_cache deactivation. The counter is bumped for every charged page and also for every running allocation. So the kmem_cache can't be released unless all allocations complete.

To shut down non-active empty kmem_caches, let's reuse the work queue previously used for kmem_cache deactivation. Once the reference counter reaches 0, let's schedule an asynchronous kmem_cache release.

* I used the following simple approach to test the performance (stolen from another patchset by T. Harding):

    time find / -name fname-no-exist
    echo 2 > /proc/sys/vm/drop_caches
    repeat 10 times

Results:

        orig                patched
real    0m1.455s    real    0m1.355s
user    0m0.206s    user    0m0.219s
sys     0m0.855s    sys     0m0.807s

real    0m1.487s    real    0m1.699s
user    0m0.221s    user    0m0.256s
sys     0m0.806s    sys     0m0.948s

real    0m1.515s    real    0m1.505s
user    0m0.183s    user    0m0.215s
sys     0m0.876s    sys     0m0.858s

real    0m1.291s    real    0m1.380s
user    0m0.193s    user    0m0.198s
sys     0m0.843s    sys     0m0.786s

real    0m1.364s    real    0m1.374s
user    0m0.180s    user    0m0.182s
sys     0m0.868s    sys     0m0.806s

real    0m1.352s    real    0m1.312s
user    0m0.201s    user    0m0.212s
sys     0m0.820s    sys     0m0.761s

real    0m1.302s    real    0m1.349s
user    0m0.205s    user    0m0.203s
sys     0m0.803s    sys     0m0.792s

real    0m1.334s    real    0m1.301s
user    0m0.194s    user    0m0.201s
sys     0m0.806s    sys     0m0.779s

real    0m1.426s    real    0m1.434s
user    0m0.216s    user    0m0.181s
sys     0m0.824s    sys     0m0.864s

real    0m1.350s    real    0m1.295s
user    0m0.200s    user    0m0.190s
sys     0m0.842s    sys     0m0.811s

So it looks like the difference is not noticeable in this test.

[[email protected]: fix a use-after-free in kmemcg_workfn()]
Link: http://lkml.kernel.org/r/[email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Roman Gushchin <[email protected]>
Signed-off-by: Qian Cai <[email protected]>
Acked-by: Vladimir Davydov <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Shakeel Butt <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Joonsoo Kim <[email protected]>
Cc: Pekka Enberg <[email protected]>
Cc: Andrei Vagin <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 63b02ef commit f0a3a24
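For context, the lifecycle described in the commit message maps onto the generic percpu_ref API roughly as sketched below. This is a minimal illustrative sketch under stated assumptions, not code from the patch: the demo_cache structure and the demo_cache_* helpers are hypothetical names invented for illustration; the real implementation lives in memcg_kmem_get_cache(), memcg_charge_slab(), kmemcg_cache_deactivate() and kmemcg_cache_shutdown() in the diffs below.

/*
 * Minimal sketch of the refcounting scheme (hypothetical names, not the
 * patch itself): one percpu_ref pins the cache; it is bumped per running
 * allocation and per charged page, killed at deactivation, and the release
 * callback defers the actual destruction to a workqueue.
 */
#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct demo_cache {
	struct percpu_ref refcnt;		/* pins the cache itself */
	struct work_struct release_work;
};

static void demo_cache_release_workfn(struct work_struct *work)
{
	struct demo_cache *c = container_of(work, struct demo_cache, release_work);

	/* No pages charged and no allocations in flight: safe to free. */
	percpu_ref_exit(&c->refcnt);
	kfree(c);
}

/*
 * Called once the last reference is dropped; runs in a context that cannot
 * sleep, so the destruction itself is deferred to a workqueue.
 */
static void demo_cache_release(struct percpu_ref *ref)
{
	struct demo_cache *c = container_of(ref, struct demo_cache, refcnt);

	INIT_WORK(&c->release_work, demo_cache_release_workfn);
	queue_work(system_wq, &c->release_work);
}

static struct demo_cache *demo_cache_create(void)
{
	struct demo_cache *c = kzalloc(sizeof(*c), GFP_KERNEL);

	if (!c)
		return NULL;
	/* Starts in the fast percpu mode; switched to atomic mode on kill. */
	if (percpu_ref_init(&c->refcnt, demo_cache_release, 0, GFP_KERNEL)) {
		kfree(c);
		return NULL;
	}
	return c;
}

/* Per-allocation pin: fails once the ref has been killed. */
static bool demo_cache_get(struct demo_cache *c)
{
	return percpu_ref_tryget(&c->refcnt);
}

static void demo_cache_put(struct demo_cache *c)
{
	percpu_ref_put(&c->refcnt);
}

/* One reference per charged page of the given order. */
static void demo_cache_charge(struct demo_cache *c, int order)
{
	percpu_ref_get_many(&c->refcnt, 1UL << order);
}

static void demo_cache_uncharge(struct demo_cache *c, int order)
{
	percpu_ref_put_many(&c->refcnt, 1UL << order);
}

/*
 * On cgroup offlining: switch the counter to atomic mode and drop the
 * initial reference; demo_cache_release() fires once every charged page
 * and pending allocation has dropped its reference.
 */
static void demo_cache_deactivate(struct demo_cache *c)
{
	percpu_ref_kill(&c->refcnt);
}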

File tree

include/linux/slab.h
mm/memcontrol.c
mm/slab.h
mm/slab_common.c

4 files changed: +96 −79 lines changed


include/linux/slab.h

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@
 #include <linux/overflow.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <linux/percpu-refcount.h>
 
 
 /*
@@ -152,7 +153,6 @@ int kmem_cache_shrink(struct kmem_cache *);
 
 void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
 void memcg_deactivate_kmem_caches(struct mem_cgroup *);
-void memcg_destroy_kmem_caches(struct mem_cgroup *);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
@@ -642,6 +642,7 @@ struct memcg_cache_params {
 		struct mem_cgroup *memcg;
 		struct list_head children_node;
 		struct list_head kmem_caches_node;
+		struct percpu_ref refcnt;
 
 		void (*work_fn)(struct kmem_cache *);
 		union {

mm/memcontrol.c

Lines changed: 38 additions & 12 deletions
@@ -2667,12 +2667,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 {
 	struct memcg_kmem_cache_create_work *cw;
 
+	if (!css_tryget_online(&memcg->css))
+		return;
+
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
 	if (!cw)
 		return;
 
-	css_get(&memcg->css);
-
 	cw->memcg = memcg;
 	cw->cachep = cachep;
 	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
@@ -2707,21 +2708,36 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *memcg_cachep;
+	struct memcg_cache_array *arr;
 	int kmemcg_id;
 
 	VM_BUG_ON(!is_root_cache(cachep));
 
 	if (memcg_kmem_bypass())
 		return cachep;
 
-	memcg = get_mem_cgroup_from_current();
+	rcu_read_lock();
+
+	if (unlikely(current->active_memcg))
+		memcg = current->active_memcg;
+	else
+		memcg = mem_cgroup_from_task(current);
+
+	if (!memcg || memcg == root_mem_cgroup)
+		goto out_unlock;
+
 	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
 	if (kmemcg_id < 0)
-		goto out;
+		goto out_unlock;
+
+	arr = rcu_dereference(cachep->memcg_params.memcg_caches);
 
-	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
-	if (likely(memcg_cachep))
-		return memcg_cachep;
+	/*
+	 * Make sure we will access the up-to-date value. The code updating
+	 * memcg_caches issues a write barrier to match the data dependency
+	 * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
+	 */
+	memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -2734,10 +2750,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 	 * memcg_create_kmem_cache, this means no further allocation
 	 * could happen with the slab_mutex held. So it's better to
 	 * defer everything.
+	 *
+	 * If the memcg is dying or memcg_cache is about to be released,
+	 * don't bother creating new kmem_caches. Because memcg_cachep
+	 * is ZEROed as the first step of kmem offlining, we don't need
+	 * percpu_ref_tryget_live() here. css_tryget_online() check in
+	 * memcg_schedule_kmem_cache_create() will prevent us from
+	 * creation of a new kmem_cache.
 	 */
-	memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
-	css_put(&memcg->css);
+	if (unlikely(!memcg_cachep))
+		memcg_schedule_kmem_cache_create(memcg, cachep);
+	else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
+		cachep = memcg_cachep;
+out_unlock:
+	rcu_read_unlock();
 	return cachep;
 }
 
@@ -2748,7 +2774,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
 void memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
 	if (!is_root_cache(cachep))
-		css_put(&cachep->memcg_params.memcg->css);
+		percpu_ref_put(&cachep->memcg_params.refcnt);
 }
 
 /**
@@ -3295,7 +3321,7 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
 	memcg_offline_kmem(memcg);
 
 	if (memcg->kmem_state == KMEM_ALLOCATED) {
-		memcg_destroy_kmem_caches(memcg);
+		WARN_ON(!list_empty(&memcg->kmem_caches));
 		static_branch_dec(&memcg_kmem_enabled_key);
 		WARN_ON(page_counter_read(&memcg->kmem));
 	}

mm/slab.h

Lines changed: 12 additions & 32 deletions
@@ -248,31 +248,6 @@ static inline const char *cache_name(struct kmem_cache *s)
 	return s->name;
 }
 
-/*
- * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
- * That said the caller must assure the memcg's cache won't go away by either
- * taking a css reference to the owner cgroup, or holding the slab_mutex.
- */
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
-{
-	struct kmem_cache *cachep;
-	struct memcg_cache_array *arr;
-
-	rcu_read_lock();
-	arr = rcu_dereference(s->memcg_params.memcg_caches);
-
-	/*
-	 * Make sure we will access the up-to-date value. The code updating
-	 * memcg_caches issues a write barrier to match this (see
-	 * memcg_create_kmem_cache()).
-	 */
-	cachep = READ_ONCE(arr->entries[idx]);
-	rcu_read_unlock();
-
-	return cachep;
-}
-
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
@@ -284,14 +259,25 @@ static __always_inline int memcg_charge_slab(struct page *page,
 					     gfp_t gfp, int order,
 					     struct kmem_cache *s)
 {
+	int ret;
+
 	if (is_root_cache(s))
 		return 0;
-	return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
+
+	ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
+	if (ret)
+		return ret;
+
+	percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
+
+	return 0;
 }
 
 static __always_inline void memcg_uncharge_slab(struct page *page, int order,
 						struct kmem_cache *s)
 {
+	if (!is_root_cache(s))
+		percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
 	memcg_kmem_uncharge(page, order);
 }
 
@@ -323,12 +309,6 @@ static inline const char *cache_name(struct kmem_cache *s)
 	return s->name;
 }
 
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
-{
-	return NULL;
-}
-
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	return s;

mm/slab_common.c

Lines changed: 44 additions & 34 deletions
@@ -132,6 +132,8 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
 LIST_HEAD(slab_root_caches);
 static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
 
+static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref);
+
 void slab_init_memcg_params(struct kmem_cache *s)
 {
 	s->memcg_params.root_cache = NULL;
@@ -146,6 +148,12 @@ static int init_memcg_params(struct kmem_cache *s,
 	struct memcg_cache_array *arr;
 
 	if (root_cache) {
+		int ret = percpu_ref_init(&s->memcg_params.refcnt,
+					  kmemcg_cache_shutdown,
+					  0, GFP_KERNEL);
+		if (ret)
+			return ret;
+
 		s->memcg_params.root_cache = root_cache;
 		INIT_LIST_HEAD(&s->memcg_params.children_node);
 		INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
@@ -171,6 +179,8 @@ static void destroy_memcg_params(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
 		kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+	else
+		percpu_ref_exit(&s->memcg_params.refcnt);
 }
 
 static void free_memcg_params(struct rcu_head *rcu)
@@ -226,6 +236,7 @@ void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
 	if (is_root_cache(s)) {
 		list_add(&s->root_caches_node, &slab_root_caches);
 	} else {
+		css_get(&memcg->css);
 		s->memcg_params.memcg = memcg;
 		list_add(&s->memcg_params.children_node,
 			 &s->memcg_params.root_cache->memcg_params.children);
@@ -241,6 +252,7 @@ static void memcg_unlink_cache(struct kmem_cache *s)
 	} else {
 		list_del(&s->memcg_params.children_node);
 		list_del(&s->memcg_params.kmem_caches_node);
+		css_put(&s->memcg_params.memcg->css);
 	}
 }
 #else
@@ -678,7 +690,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 	}
 
 	/*
-	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
+	 * Since readers won't lock (see memcg_kmem_get_cache()), we need a
 	 * barrier here to ensure nobody will see the kmem_cache partially
 	 * initialized.
 	 */
@@ -701,16 +713,11 @@ static void kmemcg_workfn(struct work_struct *work)
 	get_online_mems();
 
 	mutex_lock(&slab_mutex);
-
 	s->memcg_params.work_fn(s);
-
 	mutex_unlock(&slab_mutex);
 
 	put_online_mems();
 	put_online_cpus();
-
-	/* done, put the ref from kmemcg_cache_deactivate() */
-	css_put(&s->memcg_params.memcg->css);
 }
 
 static void kmemcg_rcufn(struct rcu_head *head)
@@ -727,10 +734,38 @@ static void kmemcg_rcufn(struct rcu_head *head)
 	queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
 }
 
+static void kmemcg_cache_shutdown_fn(struct kmem_cache *s)
+{
+	WARN_ON(shutdown_cache(s));
+}
+
+static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref)
+{
+	struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
+					    memcg_params.refcnt);
+	unsigned long flags;
+
+	spin_lock_irqsave(&memcg_kmem_wq_lock, flags);
+	if (s->memcg_params.root_cache->memcg_params.dying)
+		goto unlock;
+
+	s->memcg_params.work_fn = kmemcg_cache_shutdown_fn;
+	INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
+	queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
+
+unlock:
+	spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags);
+}
+
+static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
+{
+	__kmemcg_cache_deactivate_after_rcu(s);
+	percpu_ref_kill(&s->memcg_params.refcnt);
+}
+
 static void kmemcg_cache_deactivate(struct kmem_cache *s)
 {
-	if (WARN_ON_ONCE(is_root_cache(s)) ||
-	    WARN_ON_ONCE(s->memcg_params.work_fn))
+	if (WARN_ON_ONCE(is_root_cache(s)))
 		return;
 
 	__kmemcg_cache_deactivate(s);
@@ -744,10 +779,7 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s)
 	if (s->memcg_params.root_cache->memcg_params.dying)
 		goto unlock;
 
-	/* pin memcg so that @s doesn't get destroyed in the middle */
-	css_get(&s->memcg_params.memcg->css);
-
-	s->memcg_params.work_fn = __kmemcg_cache_deactivate_after_rcu;
+	s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
 	call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
 unlock:
 	spin_unlock_irq(&memcg_kmem_wq_lock);
@@ -781,28 +813,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 	put_online_cpus();
 }
 
-void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
-{
-	struct kmem_cache *s, *s2;
-
-	get_online_cpus();
-	get_online_mems();
-
-	mutex_lock(&slab_mutex);
-	list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
-				 memcg_params.kmem_caches_node) {
-		/*
-		 * The cgroup is about to be freed and therefore has no charges
-		 * left. Hence, all its caches must be empty by now.
-		 */
-		BUG_ON(shutdown_cache(s));
-	}
-	mutex_unlock(&slab_mutex);
-
-	put_online_mems();
-	put_online_cpus();
-}
-
 static int shutdown_memcg_caches(struct kmem_cache *s)
 {
 	struct memcg_cache_array *arr;
