
Commit bd0e749

mm, slub: convert kmem_cpu_slab protection to local_lock

Embed local_lock into struct kmem_cpu_slab and use the irq-safe versions of
local_lock instead of plain local_irq_save/restore. On !PREEMPT_RT that's
equivalent, with better lockdep visibility. On PREEMPT_RT that means better
preemption.

However, the cost on PREEMPT_RT is the loss of lockless fast paths which only
work with cpu freelist. Those are designed to detect and recover from being
preempted by other conflicting operations (both fast or slow path), but the
slow path operations assume they cannot be preempted by a fast path operation,
which is guaranteed naturally with disabled irqs. With local locks on
PREEMPT_RT, the fast paths now also need to take the local lock to avoid races.

In the allocation fastpath slab_alloc_node() we can just defer to the slowpath
__slab_alloc() which also works with cpu freelist, but under the local lock.
In the free fastpath do_slab_free() we have to add a new local lock protected
version of freeing to the cpu freelist, as the existing slowpath only works
with the page freelist.

Also update the comment about locking scheme in SLUB to reflect changes done
by this series.

[ Mike Galbraith <[email protected]>: use local_lock() without irq in PREEMPT_RT
  scope; debugging of RT crashes resulting in put_cpu_partial() locking changes ]

Signed-off-by: Vlastimil Babka <[email protected]>
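
The conversion follows the generic local_lock pattern documented in
Documentation/locking/locktypes.rst: embed a local_lock_t in the per-CPU
structure it protects and replace bare local_irq_save()/local_irq_restore()
with local_lock_irqsave()/local_unlock_irqrestore() on that lock. The sketch
below is an illustration only, with made-up names (struct foo, percpu_foo,
update_counter); it is not code from this commit.

#include <linux/local_lock.h>
#include <linux/percpu.h>

/* Hypothetical per-CPU structure; stands in for struct kmem_cache_cpu. */
struct foo {
	local_lock_t lock;	/* protects the fields below */
	int counter;
};

static DEFINE_PER_CPU(struct foo, percpu_foo) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void update_counter(void)
{
	unsigned long flags;

	/* was: local_irq_save(flags); */
	local_lock_irqsave(&percpu_foo.lock, flags);
	this_cpu_inc(percpu_foo.counter);
	/* was: local_irq_restore(flags); */
	local_unlock_irqrestore(&percpu_foo.lock, flags);
}

On !PREEMPT_RT the locked section still runs with interrupts disabled, exactly
as before but now visible to lockdep; on PREEMPT_RT the local lock becomes a
per-CPU spinlock and the section stays preemptible.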
1 parent 25c00c5 commit bd0e749

2 files changed, 117 insertions(+), 35 deletions(-)

include/linux/slub_def.h

Lines changed: 6 additions & 0 deletions
@@ -10,6 +10,7 @@
 #include <linux/kfence.h>
 #include <linux/kobject.h>
 #include <linux/reciprocal_div.h>
+#include <linux/local_lock.h>
 
 enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
@@ -40,13 +41,18 @@ enum stat_item {
 	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
 	NR_SLUB_STAT_ITEMS };
 
+/*
+ * When changing the layout, make sure freelist and tid are still compatible
+ * with this_cpu_cmpxchg_double() alignment requirements.
+ */
 struct kmem_cache_cpu {
 	void **freelist;	/* Pointer to next available object */
 	unsigned long tid;	/* Globally unique transaction id */
 	struct page *page;	/* The slab from which we are allocating */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 	struct page *partial;	/* Partially allocated frozen slabs */
 #endif
+	local_lock_t lock;	/* Protects the fields above */
 #ifdef CONFIG_SLUB_STATS
 	unsigned stat[NR_SLUB_STAT_ITEMS];
 #endif
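
Because the new local_lock_t sits inside a per-CPU structure that is allocated
at runtime (kmem_cache->cpu_slab comes from a per-cpu allocation), it cannot
rely on a static initializer and has to be initialized on each CPU at cache
creation time, which the init_kmem_cache_cpus() hunk below does with
local_lock_init(). A hedged, self-contained sketch of that initialization
pattern follows, using hypothetical names (struct bar, bar_cache,
bar_cache_create), not the SLUB code itself.

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/local_lock.h>
#include <linux/percpu.h>

/* Hypothetical per-CPU cache structure with an embedded local lock. */
struct bar {
	local_lock_t lock;	/* protects 'cached' */
	void *cached;
};

static struct bar __percpu *bar_cache;

static int bar_cache_create(void)
{
	int cpu;

	bar_cache = alloc_percpu(struct bar);
	if (!bar_cache)
		return -ENOMEM;

	/* No static initializer is possible here, so init each CPU's lock. */
	for_each_possible_cpu(cpu)
		local_lock_init(&per_cpu_ptr(bar_cache, cpu)->lock);

	return 0;
}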

mm/slub.c

Lines changed: 111 additions & 35 deletions
@@ -46,13 +46,21 @@
 /*
  * Lock order:
  *   1. slab_mutex (Global Mutex)
- *   2. node->list_lock
- *   3. slab_lock(page) (Only on some arches and for debugging)
+ *   2. node->list_lock (Spinlock)
+ *   3. kmem_cache->cpu_slab->lock (Local lock)
+ *   4. slab_lock(page) (Only on some arches or for debugging)
+ *   5. object_map_lock (Only for debugging)
  *
  *   slab_mutex
  *
  *   The role of the slab_mutex is to protect the list of all the slabs
  *   and to synchronize major metadata changes to slab cache structures.
+ *   Also synchronizes memory hotplug callbacks.
+ *
+ *   slab_lock
+ *
+ *   The slab_lock is a wrapper around the page lock, thus it is a bit
+ *   spinlock.
  *
  *   The slab_lock is only used for debugging and on arches that do not
  *   have the ability to do a cmpxchg_double. It only protects:
@@ -61,13 +69,17 @@
  *   C. page->objects	-> Number of objects in page
  *   D. page->frozen	-> frozen state
  *
+ *   Frozen slabs
+ *
  *   If a slab is frozen then it is exempt from list management. It is not
  *   on any list except per cpu partial list. The processor that froze the
  *   slab is the one who can perform list operations on the page. Other
  *   processors may put objects onto the freelist but the processor that
  *   froze the slab is the only one that can retrieve the objects from the
  *   page's freelist.
  *
+ *   list_lock
+ *
  *   The list_lock protects the partial and full list on each node and
  *   the partial slab counter. If taken then no new slabs may be added or
  *   removed from the lists nor make the number of partial slabs be modified.
@@ -79,10 +91,36 @@
  *   slabs, operations can continue without any centralized lock. F.e.
  *   allocating a long series of objects that fill up slabs does not require
  *   the list lock.
- * Interrupts are disabled during allocation and deallocation in order to
- * make the slab allocator safe to use in the context of an irq. In addition
- * interrupts are disabled to ensure that the processor does not change
- * while handling per_cpu slabs, due to kernel preemption.
+ *
+ *   cpu_slab->lock local lock
+ *
+ *   This locks protect slowpath manipulation of all kmem_cache_cpu fields
+ *   except the stat counters. This is a percpu structure manipulated only by
+ *   the local cpu, so the lock protects against being preempted or interrupted
+ *   by an irq. Fast path operations rely on lockless operations instead.
+ *   On PREEMPT_RT, the local lock does not actually disable irqs (and thus
+ *   prevent the lockless operations), so fastpath operations also need to take
+ *   the lock and are no longer lockless.
+ *
+ *   lockless fastpaths
+ *
+ *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
+ *   are fully lockless when satisfied from the percpu slab (and when
+ *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
+ *   They also don't disable preemption or migration or irqs. They rely on
+ *   the transaction id (tid) field to detect being preempted or moved to
+ *   another cpu.
+ *
+ *   irq, preemption, migration considerations
+ *
+ *   Interrupts are disabled as part of list_lock or local_lock operations, or
+ *   around the slab_lock operation, in order to make the slab allocator safe
+ *   to use in the context of an irq.
+ *
+ *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
+ *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
+ *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
+ *   doesn't have to be revalidated in each section protected by the local lock.
  *
  * SLUB assigns one slab for allocation to each processor.
  * Allocations only occur from these slabs called cpu slabs.
@@ -2250,9 +2288,13 @@ static inline void note_cmpxchg_failure(const char *n,
 static void init_kmem_cache_cpus(struct kmem_cache *s)
 {
 	int cpu;
+	struct kmem_cache_cpu *c;
 
-	for_each_possible_cpu(cpu)
-		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+	for_each_possible_cpu(cpu) {
+		c = per_cpu_ptr(s->cpu_slab, cpu);
+		local_lock_init(&c->lock);
+		c->tid = init_tid(cpu);
+	}
 }
 
 /*
@@ -2463,10 +2505,10 @@ static void unfreeze_partials(struct kmem_cache *s)
 	struct page *partial_page;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 	partial_page = this_cpu_read(s->cpu_slab->partial);
 	this_cpu_write(s->cpu_slab->partial, NULL);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 
 	if (partial_page)
 		__unfreeze_partials(s, partial_page);
@@ -2499,7 +2541,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 	int pages = 0;
 	int pobjects = 0;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 
 	oldpage = this_cpu_read(s->cpu_slab->partial);
 
@@ -2527,7 +2569,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 
 	this_cpu_write(s->cpu_slab->partial, page);
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 
 	if (page_to_unfreeze) {
 		__unfreeze_partials(s, page_to_unfreeze);
@@ -2549,7 +2591,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 	struct page *page;
 	void *freelist;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 
 	page = c->page;
 	freelist = c->freelist;
@@ -2558,7 +2600,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 	c->freelist = NULL;
 	c->tid = next_tid(c->tid);
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 
 	if (page) {
 		deactivate_slab(s, page, freelist);
@@ -2780,15 +2822,15 @@ static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags)
  * The page is still frozen if the return value is not NULL.
  *
  * If this function returns NULL then the page has been unfrozen.
- *
- * This function must be called with interrupt disabled.
  */
 static inline void *get_freelist(struct kmem_cache *s, struct page *page)
 {
 	struct page new;
 	unsigned long counters;
 	void *freelist;
 
+	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
+
 	do {
 		freelist = page->freelist;
 		counters = page->counters;
@@ -2873,9 +2915,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		goto deactivate_slab;
 
 	/* must check again c->page in case we got preempted and it changed */
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 	if (unlikely(page != c->page)) {
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		goto reread_page;
 	}
 	freelist = c->freelist;
@@ -2886,7 +2928,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 	if (!freelist) {
 		c->page = NULL;
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		stat(s, DEACTIVATE_BYPASS);
 		goto new_slab;
 	}
@@ -2895,7 +2937,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 load_freelist:
 
-	lockdep_assert_irqs_disabled();
+	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
 
 	/*
 	 * freelist is pointing to the list of objects to be used.
@@ -2905,39 +2947,39 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	VM_BUG_ON(!c->page->frozen);
 	c->freelist = get_freepointer(s, freelist);
 	c->tid = next_tid(c->tid);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 	return freelist;
 
 deactivate_slab:
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 	if (page != c->page) {
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		goto reread_page;
 	}
 	freelist = c->freelist;
 	c->page = NULL;
 	c->freelist = NULL;
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 	deactivate_slab(s, page, freelist);
 
 new_slab:
 
 	if (slub_percpu_partial(c)) {
-		local_irq_save(flags);
+		local_lock_irqsave(&s->cpu_slab->lock, flags);
 		if (unlikely(c->page)) {
-			local_irq_restore(flags);
+			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 			goto reread_page;
 		}
 		if (unlikely(!slub_percpu_partial(c))) {
-			local_irq_restore(flags);
+			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 			/* we were preempted and partial list got empty */
 			goto new_objects;
 		}
 
 		page = c->page = slub_percpu_partial(c);
 		slub_set_percpu_partial(c, page);
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		stat(s, CPU_PARTIAL_ALLOC);
 		goto redo;
 	}
@@ -2990,7 +3032,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 retry_load_page:
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 	if (unlikely(c->page)) {
 		void *flush_freelist = c->freelist;
 		struct page *flush_page = c->page;
@@ -2999,7 +3041,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		c->freelist = NULL;
 		c->tid = next_tid(c->tid);
 
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 
 		deactivate_slab(s, flush_page, flush_freelist);
 
@@ -3118,7 +3160,15 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 
 	object = c->freelist;
 	page = c->page;
-	if (unlikely(!object || !page || !node_match(page, node))) {
+	/*
+	 * We cannot use the lockless fastpath on PREEMPT_RT because if a
+	 * slowpath has taken the local_lock_irqsave(), it is not protected
+	 * against a fast path operation in an irq handler. So we need to take
+	 * the slow path which uses local_lock. It is still relatively fast if
+	 * there is a suitable cpu freelist.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
+	    unlikely(!object || !page || !node_match(page, node))) {
 		object = __slab_alloc(s, gfpflags, node, addr, c);
 	} else {
 		void *next_object = get_freepointer_safe(s, object);
@@ -3378,6 +3428,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
 	barrier();
 
 	if (likely(page == c->page)) {
+#ifndef CONFIG_PREEMPT_RT
 		void **freelist = READ_ONCE(c->freelist);
 
 		set_freepointer(s, tail_obj, freelist);
@@ -3390,6 +3441,31 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
 			note_cmpxchg_failure("slab_free", s, tid);
 			goto redo;
 		}
+#else /* CONFIG_PREEMPT_RT */
+		/*
+		 * We cannot use the lockless fastpath on PREEMPT_RT because if
+		 * a slowpath has taken the local_lock_irqsave(), it is not
+		 * protected against a fast path operation in an irq handler. So
+		 * we need to take the local_lock. We shouldn't simply defer to
+		 * __slab_free() as that wouldn't use the cpu freelist at all.
+		 */
+		void **freelist;
+
+		local_lock(&s->cpu_slab->lock);
+		c = this_cpu_ptr(s->cpu_slab);
+		if (unlikely(page != c->page)) {
+			local_unlock(&s->cpu_slab->lock);
+			goto redo;
+		}
+		tid = c->tid;
+		freelist = c->freelist;
+
+		set_freepointer(s, tail_obj, freelist);
+		c->freelist = head;
+		c->tid = next_tid(tid);
+
+		local_unlock(&s->cpu_slab->lock);
+#endif
 		stat(s, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, head, tail_obj, cnt, addr);
@@ -3568,7 +3644,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	 * handlers invoking normal fastpath.
 	 */
 	c = slub_get_cpu_ptr(s->cpu_slab);
-	local_irq_disable();
+	local_lock_irq(&s->cpu_slab->lock);
 
 	for (i = 0; i < size; i++) {
 		void *object = kfence_alloc(s, s->object_size, flags);
@@ -3589,7 +3665,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			 */
 			c->tid = next_tid(c->tid);
 
-			local_irq_enable();
+			local_unlock_irq(&s->cpu_slab->lock);
 
 			/*
 			 * Invoking slow path likely have side-effect
@@ -3603,7 +3679,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			c = this_cpu_ptr(s->cpu_slab);
 			maybe_wipe_obj_freeptr(s, p[i]);
 
-			local_irq_disable();
+			local_lock_irq(&s->cpu_slab->lock);
 
 			continue; /* goto for-loop */
 		}
@@ -3612,7 +3688,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 		maybe_wipe_obj_freeptr(s, p[i]);
 	}
 	c->tid = next_tid(c->tid);
-	local_irq_enable();
+	local_unlock_irq(&s->cpu_slab->lock);
 	slub_put_cpu_ptr(s->cpu_slab);
 
 	/*
