Commit 8a5ec0b

Christoph Lameter authored and penberg committed
Lockless (and preemptless) fastpaths for slub
Use the this_cpu_cmpxchg_double functionality to implement a lockless
allocation algorithm on arches that support fast this_cpu_ops.

Each of the per cpu pointers is paired with a transaction id that ensures
that updates of the per cpu information can only occur in sequence on a
certain cpu.

A transaction id is a "long" integer that is comprised of an event number
and the cpu number. The event number is incremented for every change to the
per cpu state. This means that the cmpxchg instruction can verify for an
update that nothing interfered, that we are updating the percpu structure
for the processor where we picked up the information, and that we are also
currently on that processor when we update the information.

This results in a significant decrease of the overhead in the fastpaths. It
also makes it easy to adopt the fast path for realtime kernels since this is
lockless and does not require the use of the current per cpu area over the
critical section. It is only important that the per cpu area is current at
the beginning of the critical section and at the end, so there is no need
even to disable preemption.

Test results show that the fastpath cycle count is reduced by up to ~40%
(the alloc/free test goes from ~140 cycles down to ~80). The slowpath for
kfree adds a few cycles.

Sadly this does nothing for the slowpath, which is where the main
performance issues with slub are, but the best case performance rises
significantly. (For that see the more complex slub patches that require
cmpxchg_double.)

Kmalloc: alloc/free test

Before:
10000 times kmalloc(8)/kfree -> 134 cycles
10000 times kmalloc(16)/kfree -> 152 cycles
10000 times kmalloc(32)/kfree -> 144 cycles
10000 times kmalloc(64)/kfree -> 142 cycles
10000 times kmalloc(128)/kfree -> 142 cycles
10000 times kmalloc(256)/kfree -> 132 cycles
10000 times kmalloc(512)/kfree -> 132 cycles
10000 times kmalloc(1024)/kfree -> 135 cycles
10000 times kmalloc(2048)/kfree -> 135 cycles
10000 times kmalloc(4096)/kfree -> 135 cycles
10000 times kmalloc(8192)/kfree -> 144 cycles
10000 times kmalloc(16384)/kfree -> 754 cycles

After:
10000 times kmalloc(8)/kfree -> 78 cycles
10000 times kmalloc(16)/kfree -> 78 cycles
10000 times kmalloc(32)/kfree -> 82 cycles
10000 times kmalloc(64)/kfree -> 88 cycles
10000 times kmalloc(128)/kfree -> 79 cycles
10000 times kmalloc(256)/kfree -> 79 cycles
10000 times kmalloc(512)/kfree -> 85 cycles
10000 times kmalloc(1024)/kfree -> 82 cycles
10000 times kmalloc(2048)/kfree -> 82 cycles
10000 times kmalloc(4096)/kfree -> 85 cycles
10000 times kmalloc(8192)/kfree -> 82 cycles
10000 times kmalloc(16384)/kfree -> 706 cycles

Kmalloc: Repeatedly allocate then free test

Before:
10000 times kmalloc(8) -> 211 cycles kfree -> 113 cycles
10000 times kmalloc(16) -> 174 cycles kfree -> 115 cycles
10000 times kmalloc(32) -> 235 cycles kfree -> 129 cycles
10000 times kmalloc(64) -> 222 cycles kfree -> 120 cycles
10000 times kmalloc(128) -> 343 cycles kfree -> 139 cycles
10000 times kmalloc(256) -> 827 cycles kfree -> 147 cycles
10000 times kmalloc(512) -> 1048 cycles kfree -> 272 cycles
10000 times kmalloc(1024) -> 2043 cycles kfree -> 528 cycles
10000 times kmalloc(2048) -> 4002 cycles kfree -> 571 cycles
10000 times kmalloc(4096) -> 7740 cycles kfree -> 628 cycles
10000 times kmalloc(8192) -> 8062 cycles kfree -> 850 cycles
10000 times kmalloc(16384) -> 8895 cycles kfree -> 1249 cycles

After:
10000 times kmalloc(8) -> 190 cycles kfree -> 129 cycles
10000 times kmalloc(16) -> 76 cycles kfree -> 123 cycles
10000 times kmalloc(32) -> 126 cycles kfree -> 124 cycles
10000 times kmalloc(64) -> 181 cycles kfree -> 128 cycles
10000 times kmalloc(128) -> 310 cycles kfree -> 140 cycles
10000 times kmalloc(256) -> 809 cycles kfree -> 165 cycles
10000 times kmalloc(512) -> 1005 cycles kfree -> 269 cycles
10000 times kmalloc(1024) -> 1999 cycles kfree -> 527 cycles
10000 times kmalloc(2048) -> 3967 cycles kfree -> 570 cycles
10000 times kmalloc(4096) -> 7658 cycles kfree -> 637 cycles
10000 times kmalloc(8192) -> 8111 cycles kfree -> 859 cycles
10000 times kmalloc(16384) -> 8791 cycles kfree -> 1173 cycles

Signed-off-by: Christoph Lameter <[email protected]>
Signed-off-by: Pekka Enberg <[email protected]>
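Editor's note: to make the transaction id scheme described above concrete, here is a minimal standalone C sketch (not part of the commit) of the tid arithmetic. The low bits of a tid carry the cpu number and the remaining bits an event counter, so comparing the tid read at the start of the fastpath against the current per cpu tid catches both migration to another cpu and any intervening allocation or free. NR_CPUS below is a hypothetical value chosen for the example; the kernel derives TID_STEP from roundup_pow_of_two(CONFIG_NR_CPUS) when CONFIG_PREEMPT is set.

/*
 * Minimal userspace model of the slub transaction id arithmetic.
 * Assumes a hypothetical 4-cpu system; types and names are simplified.
 */
#include <assert.h>
#include <stdio.h>

#define NR_CPUS   4UL
#define TID_STEP  NR_CPUS	/* already a power of two in this example */

static unsigned long init_tid(int cpu)             { return cpu; }
static unsigned long next_tid(unsigned long tid)   { return tid + TID_STEP; }
static unsigned long tid_to_cpu(unsigned long tid) { return tid % TID_STEP; }
static unsigned long tid_to_event(unsigned long tid) { return tid / TID_STEP; }

int main(void)
{
	unsigned long tid = init_tid(2);	/* per cpu area of cpu 2 */

	/* Every fastpath alloc/free bumps the event count, keeping the cpu bits. */
	tid = next_tid(tid);
	tid = next_tid(tid);

	assert(tid_to_cpu(tid) == 2);
	assert(tid_to_event(tid) == 2);

	/*
	 * A cmpxchg_double that compares the tid it read earlier against the
	 * current per cpu tid therefore fails if another operation ran on this
	 * cpu (event differs) or the task migrated (cpu bits differ).
	 */
	printf("tid=%lu cpu=%lu event=%lu\n", tid, tid_to_cpu(tid), tid_to_event(tid));
	return 0;
}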
1 parent d3f661d commit 8a5ec0b

File tree

2 files changed: +207 -3 lines changed


include/linux/slub_def.h

Lines changed: 4 additions & 1 deletion
@@ -35,7 +35,10 @@ enum stat_item {
 	NR_SLUB_STAT_ITEMS };

 struct kmem_cache_cpu {
-	void **freelist;	/* Pointer to first free per cpu object */
+	void **freelist;	/* Pointer to next available object */
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;	/* Globally unique transaction id */
+#endif
 	struct page *page;	/* The slab from which we are allocating */
 	int node;		/* The node of the page (or -1 for debug) */
 #ifdef CONFIG_SLUB_STATS

mm/slub.c

Lines changed: 203 additions & 2 deletions
@@ -1494,6 +1494,77 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
 	}
 }

+#ifdef CONFIG_CMPXCHG_LOCAL
+#ifdef CONFIG_PREEMPT
+/*
+ * Calculate the next globally unique transaction for disambiguation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by CONFIG_NR_CPUS.
+ */
+#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
+#else
+/*
+ * No preemption supported therefore also no need to check for
+ * different cpus.
+ */
+#define TID_STEP 1
+#endif
+
+static inline unsigned long next_tid(unsigned long tid)
+{
+	return tid + TID_STEP;
+}
+
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+	return tid % TID_STEP;
+}
+
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+	return tid / TID_STEP;
+}
+
+static inline unsigned int init_tid(int cpu)
+{
+	return cpu;
+}
+
+static inline void note_cmpxchg_failure(const char *n,
+		const struct kmem_cache *s, unsigned long tid)
+{
+#ifdef SLUB_DEBUG_CMPXCHG
+	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+	printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
+
+#ifdef CONFIG_PREEMPT
+	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+		printk("due to cpu change %d -> %d\n",
+			tid_to_cpu(tid), tid_to_cpu(actual_tid));
+	else
+#endif
+	if (tid_to_event(tid) != tid_to_event(actual_tid))
+		printk("due to cpu running other code. Event %ld->%ld\n",
+			tid_to_event(tid), tid_to_event(actual_tid));
+	else
+		printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
+			actual_tid, tid, next_tid(tid));
+#endif
+}
+
+#endif
+
+void init_kmem_cache_cpus(struct kmem_cache *s)
+{
+#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT)
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+#endif
+
+}
 /*
  * Remove the cpu slab
  */
@@ -1525,6 +1596,9 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 		page->inuse--;
 	}
 	c->page = NULL;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	c->tid = next_tid(c->tid);
+#endif
 	unfreeze_slab(s, page, tail);
 }

@@ -1659,6 +1733,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 {
 	void **object;
 	struct page *new;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long flags;
+
+	local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+	/*
+	 * We may have been preempted and rescheduled on a different
+	 * cpu before disabling interrupts. Need to reload cpu area
+	 * pointer.
+	 */
+	c = this_cpu_ptr(s->cpu_slab);
+#endif
+#endif

 	/* We handle __GFP_ZERO in the caller */
 	gfpflags &= ~__GFP_ZERO;
@@ -1685,6 +1772,10 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	c->node = page_to_nid(c->page);
 unlock_out:
 	slab_unlock(c->page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	c->tid = next_tid(c->tid);
+	local_irq_restore(flags);
+#endif
 	stat(s, ALLOC_SLOWPATH);
 	return object;

@@ -1746,23 +1837,76 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 {
 	void **object;
 	struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;
+#else
 	unsigned long flags;
+#endif

 	if (slab_pre_alloc_hook(s, gfpflags))
 		return NULL;

+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_save(flags);
+#else
+redo:
+#endif
+
+	/*
+	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+	 * enabled. We may switch back and forth between cpus while
+	 * reading from one cpu area. That does not matter as long
+	 * as we end up on the original cpu again when doing the cmpxchg.
+	 */
 	c = __this_cpu_ptr(s->cpu_slab);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+	/*
+	 * The transaction ids are globally unique per cpu and per operation on
+	 * a per cpu queue. Thus they guarantee that the cmpxchg_double
+	 * occurs on the right processor and that there was no operation on the
+	 * linked list in between.
+	 */
+	tid = c->tid;
+	barrier();
+#endif
+
 	object = c->freelist;
 	if (unlikely(!object || !node_match(c, node)))

 		object = __slab_alloc(s, gfpflags, node, addr, c);

 	else {
+#ifdef CONFIG_CMPXCHG_LOCAL
+		/*
+		 * The cmpxchg will only match if there was no additional
+		 * operation and if we are on the right processor.
+		 *
+		 * The cmpxchg does the following atomically (without lock semantics!)
+		 * 1. Relocate first pointer to the current per cpu area.
+		 * 2. Verify that tid and freelist have not been changed
+		 * 3. If they were not changed replace tid and freelist
+		 *
+		 * Since this is without lock semantics the protection is only against
+		 * code executing on this cpu *not* from access by other cpus.
+		 */
+		if (unlikely(!this_cpu_cmpxchg_double(
+				s->cpu_slab->freelist, s->cpu_slab->tid,
+				object, tid,
+				get_freepointer(s, object), next_tid(tid)))) {
+
+			note_cmpxchg_failure("slab_alloc", s, tid);
+			goto redo;
+		}
+#else
 		c->freelist = get_freepointer(s, object);
+#endif
 		stat(s, ALLOC_FASTPATH);
 	}
+
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_restore(flags);
+#endif

 	if (unlikely(gfpflags & __GFP_ZERO) && object)
 		memset(object, 0, s->objsize);
@@ -1840,9 +1984,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 {
 	void *prior;
 	void **object = (void *)x;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long flags;

-	stat(s, FREE_SLOWPATH);
+	local_irq_save(flags);
+#endif
 	slab_lock(page);
+	stat(s, FREE_SLOWPATH);

 	if (kmem_cache_debug(s))
 		goto debug;
@@ -1872,6 +2020,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,

 out_unlock:
 	slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	local_irq_restore(flags);
+#endif
 	return;

 slab_empty:
@@ -1883,6 +2034,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		stat(s, FREE_REMOVE_PARTIAL);
 	}
 	slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	local_irq_restore(flags);
+#endif
 	stat(s, FREE_SLAB);
 	discard_slab(s, page);
 	return;
@@ -1909,21 +2063,54 @@ static __always_inline void slab_free(struct kmem_cache *s,
 {
 	void **object = (void *)x;
 	struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;
+#else
 	unsigned long flags;
+#endif

 	slab_free_hook(s, x);

+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_save(flags);
+#endif
+
+redo:
+	/*
+	 * Determine the current cpu's per cpu slab.
+	 * The cpu may change afterward. However that does not matter since
+	 * data is retrieved via this pointer. If we are on the same cpu
+	 * during the cmpxchg then the free will succeed.
+	 */
 	c = __this_cpu_ptr(s->cpu_slab);

+#ifdef CONFIG_CMPXCHG_LOCAL
+	tid = c->tid;
+	barrier();
+#endif
+
 	if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
 		set_freepointer(s, object, c->freelist);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+		if (unlikely(!this_cpu_cmpxchg_double(
+				s->cpu_slab->freelist, s->cpu_slab->tid,
+				c->freelist, tid,
+				object, next_tid(tid)))) {
+
+			note_cmpxchg_failure("slab_free", s, tid);
+			goto redo;
+		}
+#else
 		c->freelist = object;
+#endif
 		stat(s, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, x, addr);

+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_restore(flags);
+#endif
 }

void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2115,9 +2302,23 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
 	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
 			SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));

+#ifdef CONFIG_CMPXCHG_LOCAL
+	/*
+	 * Must align to double word boundary for the double cmpxchg instructions
+	 * to work.
+	 */
+	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
+#else
+	/* Regular alignment is sufficient */
 	s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+#endif
+
+	if (!s->cpu_slab)
+		return 0;
+
+	init_kmem_cache_cpus(s);

-	return s->cpu_slab != NULL;
+	return 1;
 }

 static struct kmem_cache *kmem_cache_node;
