 /*
  * Lock order:
  *   1. slab_mutex (Global Mutex)
- *   2. node->list_lock
- *   3. slab_lock(page) (Only on some arches and for debugging)
+ *   2. node->list_lock (Spinlock)
+ *   3. kmem_cache->cpu_slab->lock (Local lock)
+ *   4. slab_lock(page) (Only on some arches or for debugging)
+ *   5. object_map_lock (Only for debugging)
  *
  *   slab_mutex
  *
  *   The role of the slab_mutex is to protect the list of all the slabs
  *   and to synchronize major metadata changes to slab cache structures.
+ *   Also synchronizes memory hotplug callbacks.
+ *
+ *   slab_lock
+ *
+ *   The slab_lock is a wrapper around the page lock, thus it is a bit
+ *   spinlock.
  *
  *   The slab_lock is only used for debugging and on arches that do not
  *   have the ability to do a cmpxchg_double. It only protects:
  *	A. page->freelist	-> List of object free in a page
  *	B. page->inuse		-> Number of objects in use
  *	C. page->objects	-> Number of objects in page
  *	D. page->frozen		-> frozen state
  *
+ *   Frozen slabs
+ *
  *   If a slab is frozen then it is exempt from list management. It is not
  *   on any list except per cpu partial list. The processor that froze the
  *   slab is the one who can perform list operations on the page. Other
  *   processors may put objects onto the freelist but the processor that
  *   froze the slab is the only one that can retrieve the objects from the
  *   page's freelist.
  *
+ *   list_lock
+ *
  *   The list_lock protects the partial and full list on each node and
  *   the partial slab counter. If taken then no new slabs may be added or
  *   removed from the lists nor make the number of partial slabs be modified.
  *   (Note that the total number of slabs is an atomic value that may be
  *   modified without taking the list_lock).
  *
  *   The list_lock is a centralized lock and thus we avoid taking it as
  *   much as possible. As long as SLUB does not have to handle partial
  *   slabs, operations can continue without any centralized lock. F.e.
  *   allocating a long series of objects that fill up slabs does not require
  *   the list lock.
- * Interrupts are disabled during allocation and deallocation in order to
- * make the slab allocator safe to use in the context of an irq. In addition
- * interrupts are disabled to ensure that the processor does not change
- * while handling per_cpu slabs, due to kernel preemption.
+ *
+ *   cpu_slab->lock local lock
+ *
+ *   This lock protects slowpath manipulation of all kmem_cache_cpu fields
+ *   except the stat counters. This is a percpu structure manipulated only by
+ *   the local cpu, so the lock protects against being preempted or interrupted
+ *   by an irq. Fast path operations rely on lockless operations instead.
+ *   On PREEMPT_RT, the local lock does not actually disable irqs (and thus
+ *   prevent the lockless operations), so fastpath operations also need to take
+ *   the lock and are no longer lockless.
+ *
+ *   lockless fastpaths
+ *
+ *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
+ *   are fully lockless when satisfied from the percpu slab (and when
+ *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
+ *   They also don't disable preemption or migration or irqs. They rely on
+ *   the transaction id (tid) field to detect being preempted or moved to
+ *   another cpu.
+ *
+ *   irq, preemption, migration considerations
+ *
+ *   Interrupts are disabled as part of list_lock or local_lock operations, or
+ *   around the slab_lock operation, in order to make the slab allocator safe
+ *   to use in the context of an irq.
+ *
+ *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
+ *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
+ *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
+ *   doesn't have to be revalidated in each section protected by the local lock.
  *
  * SLUB assigns one slab for allocation to each processor.
  * Allocations only occur from these slabs called cpu slabs.
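
To make the scheme above concrete, here is a minimal sketch of the cpu_slab->lock pattern, using a hypothetical percpu cache (obj_cache) rather than the real kmem_cache_cpu; the slow path serializes through local_lock_irqsave() while fast paths would use lockless percpu operations:

#include <linux/local_lock.h>
#include <linux/percpu.h>

/* Hypothetical percpu cache, for illustration only. */
struct obj_cache {
	local_lock_t lock;	/* protects the fields below */
	void *freelist;
	unsigned long tid;
};

static DEFINE_PER_CPU(struct obj_cache, obj_cache) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

/* Slow path: manipulate the local CPU's cache under the local lock. */
static void obj_cache_refill(void *new_freelist)
{
	struct obj_cache *c;
	unsigned long flags;

	local_lock_irqsave(&obj_cache.lock, flags);
	c = this_cpu_ptr(&obj_cache);
	c->freelist = new_freelist;
	c->tid++;	/* signals lockless fast paths that the cache changed */
	local_unlock_irqrestore(&obj_cache.lock, flags);
}

On !PREEMPT_RT, local_lock_irqsave() disables interrupts on the local CPU, matching the old local_irq_save() behaviour; on PREEMPT_RT it takes a per-CPU lock that keeps the task on the CPU without disabling irqs.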
@@ -2250,9 +2288,13 @@ static inline void note_cmpxchg_failure(const char *n,
 static void init_kmem_cache_cpus(struct kmem_cache *s)
 {
 	int cpu;
+	struct kmem_cache_cpu *c;
 
-	for_each_possible_cpu(cpu)
-		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+	for_each_possible_cpu(cpu) {
+		c = per_cpu_ptr(s->cpu_slab, cpu);
+		local_lock_init(&c->lock);
+		c->tid = init_tid(cpu);
+	}
 }
 
 /*
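
The matching structure change lives in include/linux/slub_def.h and is not part of this excerpt; assuming it follows the local_lock conversion above, kmem_cache_cpu gains a local_lock_t roughly as sketched here (field order and comments approximate):

struct kmem_cache_cpu {
	void **freelist;	/* Pointer to next available object */
	unsigned long tid;	/* Globally unique transaction id */
	struct page *page;	/* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct page *partial;	/* Partially allocated frozen slabs */
#endif
	local_lock_t lock;	/* Protects the fields above */
#ifdef CONFIG_SLUB_STATS
	unsigned int stat[NR_SLUB_STAT_ITEMS];
#endif
};

local_lock_init() in init_kmem_cache_cpus() then initializes that per-CPU lock for every possible CPU before the cache is used.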
@@ -2463,10 +2505,10 @@ static void unfreeze_partials(struct kmem_cache *s)
 	struct page *partial_page;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 	partial_page = this_cpu_read(s->cpu_slab->partial);
 	this_cpu_write(s->cpu_slab->partial, NULL);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 
 	if (partial_page)
 		__unfreeze_partials(s, partial_page);
@@ -2499,7 +2541,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 	int pages = 0;
 	int pobjects = 0;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 
 	oldpage = this_cpu_read(s->cpu_slab->partial);
 
@@ -2527,7 +2569,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 
 	this_cpu_write(s->cpu_slab->partial, page);
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 
 	if (page_to_unfreeze) {
 		__unfreeze_partials(s, page_to_unfreeze);
@@ -2549,7 +2591,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 	struct page *page;
 	void *freelist;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 
 	page = c->page;
 	freelist = c->freelist;
@@ -2558,7 +2600,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 	c->freelist = NULL;
 	c->tid = next_tid(c->tid);
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 
 	if (page) {
 		deactivate_slab(s, page, freelist);
@@ -2780,15 +2822,15 @@ static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags)
  * The page is still frozen if the return value is not NULL.
  *
  * If this function returns NULL then the page has been unfrozen.
- *
- * This function must be called with interrupt disabled.
  */
 static inline void *get_freelist(struct kmem_cache *s, struct page *page)
 {
 	struct page new;
 	unsigned long counters;
 	void *freelist;
 
+	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
+
 	do {
 		freelist = page->freelist;
 		counters = page->counters;
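
Replacing the "must be called with interrupt disabled" comment with lockdep_assert_held() turns the calling convention into something lockdep can verify at runtime. The same idea expressed on the hypothetical obj_cache sketch from earlier:

/* Caller must hold this CPU's obj_cache.lock; lockdep checks it for us. */
static void *obj_cache_peek(void)
{
	lockdep_assert_held(this_cpu_ptr(&obj_cache.lock));

	return this_cpu_read(obj_cache.freelist);
}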
@@ -2873,9 +2915,9 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		goto deactivate_slab;
 
 	/* must check again c->page in case we got preempted and it changed */
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 	if (unlikely(page != c->page)) {
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		goto reread_page;
 	}
 	freelist = c->freelist;
@@ -2886,7 +2928,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 	if (!freelist) {
 		c->page = NULL;
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		stat(s, DEACTIVATE_BYPASS);
 		goto new_slab;
 	}
@@ -2895,7 +2937,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 load_freelist:
 
-	lockdep_assert_irqs_disabled();
+	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
 
 	/*
 	 * freelist is pointing to the list of objects to be used.
@@ -2905,39 +2947,39 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	VM_BUG_ON(!c->page->frozen);
 	c->freelist = get_freepointer(s, freelist);
 	c->tid = next_tid(c->tid);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 	return freelist;
 
 deactivate_slab:
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 	if (page != c->page) {
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		goto reread_page;
 	}
 	freelist = c->freelist;
 	c->page = NULL;
 	c->freelist = NULL;
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 	deactivate_slab(s, page, freelist);
 
 new_slab:
 
 	if (slub_percpu_partial(c)) {
-		local_irq_save(flags);
+		local_lock_irqsave(&s->cpu_slab->lock, flags);
 		if (unlikely(c->page)) {
-			local_irq_restore(flags);
+			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 			goto reread_page;
 		}
 		if (unlikely(!slub_percpu_partial(c))) {
-			local_irq_restore(flags);
+			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 			/* we were preempted and partial list got empty */
 			goto new_objects;
 		}
 
 		page = c->page = slub_percpu_partial(c);
 		slub_set_percpu_partial(c, page);
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		stat(s, CPU_PARTIAL_ALLOC);
 		goto redo;
 	}
@@ -2990,7 +3032,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 retry_load_page:
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 	if (unlikely(c->page)) {
 		void *flush_freelist = c->freelist;
 		struct page *flush_page = c->page;
@@ -2999,7 +3041,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		c->freelist = NULL;
 		c->tid = next_tid(c->tid);
 
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 
 		deactivate_slab(s, flush_page, flush_freelist);
 
@@ -3118,7 +3160,15 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 
 	object = c->freelist;
 	page = c->page;
-	if (unlikely(!object || !page || !node_match(page, node))) {
+	/*
+	 * We cannot use the lockless fastpath on PREEMPT_RT because if a
+	 * slowpath has taken the local_lock_irqsave(), it is not protected
+	 * against a fast path operation in an irq handler. So we need to take
+	 * the slow path which uses local_lock. It is still relatively fast if
+	 * there is a suitable cpu freelist.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
+	    unlikely(!object || !page || !node_match(page, node))) {
 		object = __slab_alloc(s, gfpflags, node, addr, c);
 	} else {
 		void *next_object = get_freepointer_safe(s, object);
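
IS_ENABLED(CONFIG_PREEMPT_RT) folds to a compile-time constant, so the added condition costs nothing on !PREEMPT_RT builds and lets the compiler drop the lockless branch entirely on PREEMPT_RT. A simplified sketch of the shape of this check, again on the hypothetical obj_cache rather than the real slab_alloc_node():

static void *obj_cache_alloc_slowpath(void);	/* hypothetical; takes the local lock */

static void *obj_cache_alloc(void)
{
	void *object = this_cpu_read(obj_cache.freelist);

	if (IS_ENABLED(CONFIG_PREEMPT_RT) || unlikely(!object)) {
		/* On PREEMPT_RT this branch is selected at compile time. */
		return obj_cache_alloc_slowpath();
	}

	/*
	 * The real fast path pops the object with a tid-validated
	 * this_cpu_cmpxchg_double(); elided in this sketch.
	 */
	return object;
}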
@@ -3378,6 +3428,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
 	barrier();
 
 	if (likely(page == c->page)) {
+#ifndef CONFIG_PREEMPT_RT
 		void **freelist = READ_ONCE(c->freelist);
 
 		set_freepointer(s, tail_obj, freelist);
@@ -3390,6 +3441,31 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
 			note_cmpxchg_failure("slab_free", s, tid);
 			goto redo;
 		}
+#else /* CONFIG_PREEMPT_RT */
+		/*
+		 * We cannot use the lockless fastpath on PREEMPT_RT because if
+		 * a slowpath has taken the local_lock_irqsave(), it is not
+		 * protected against a fast path operation in an irq handler. So
+		 * we need to take the local_lock. We shouldn't simply defer to
+		 * __slab_free() as that wouldn't use the cpu freelist at all.
+		 */
+		void **freelist;
+
+		local_lock(&s->cpu_slab->lock);
+		c = this_cpu_ptr(s->cpu_slab);
+		if (unlikely(page != c->page)) {
+			local_unlock(&s->cpu_slab->lock);
+			goto redo;
+		}
+		tid = c->tid;
+		freelist = c->freelist;
+
+		set_freepointer(s, tail_obj, freelist);
+		c->freelist = head;
+		c->tid = next_tid(tid);
+
+		local_unlock(&s->cpu_slab->lock);
+#endif
 		stat(s, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, head, tail_obj, cnt, addr);
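
Note that the PREEMPT_RT branch uses plain local_lock()/local_unlock() rather than the _irqsave variants: as the updated header comment explains, on PREEMPT_RT the local lock does not disable interrupts anyway, so the simpler form is equivalent in this RT-only block. A hedged sketch of the same free-side pattern on the hypothetical obj_cache:

/* Free path mirroring the PREEMPT_RT branch above; illustration only. */
static void obj_cache_free(void *object)
{
	struct obj_cache *c;

	local_lock(&obj_cache.lock);
	c = this_cpu_ptr(&obj_cache);
	*(void **)object = c->freelist;	/* link object onto the percpu list */
	c->freelist = object;
	c->tid++;
	local_unlock(&obj_cache.lock);
}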
@@ -3568,7 +3644,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	 * handlers invoking normal fastpath.
 	 */
 	c = slub_get_cpu_ptr(s->cpu_slab);
-	local_irq_disable();
+	local_lock_irq(&s->cpu_slab->lock);
 
 	for (i = 0; i < size; i++) {
 		void *object = kfence_alloc(s, s->object_size, flags);
@@ -3589,7 +3665,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			 */
 			c->tid = next_tid(c->tid);
 
-			local_irq_enable();
+			local_unlock_irq(&s->cpu_slab->lock);
 
 			/*
 			 * Invoking slow path likely have side-effect
@@ -3603,7 +3679,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			c = this_cpu_ptr(s->cpu_slab);
 			maybe_wipe_obj_freeptr(s, p[i]);
 
-			local_irq_disable();
+			local_lock_irq(&s->cpu_slab->lock);
 
 			continue; /* goto for-loop */
 		}
@@ -3612,7 +3688,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 		maybe_wipe_obj_freeptr(s, p[i]);
 	}
 	c->tid = next_tid(c->tid);
-	local_irq_enable();
+	local_unlock_irq(&s->cpu_slab->lock);
 	slub_put_cpu_ptr(s->cpu_slab);
 
 	/*