Skip to content
This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 84c8eef

Browse files
author
Jason Evans
committed
Use bitmaps to track small regions.
The previous free list implementation, which embedded singly linked lists in available regions, had the unfortunate side effect of causing many cache misses during thread cache fills. Fix this in two places:

- arena_run_t: Use a new bitmap implementation to track which regions are available. Furthermore, revert to preferring the lowest available region (as jemalloc did with its old bitmap-based approach).

- tcache_t: Move read-only tcache_bin_t metadata into tcache_bin_info_t, and add a contiguous array of pointers to tcache_t in order to track cached objects. This substantially increases the size of tcache_t, but results in much higher data locality for common tcache operations. As a side benefit, it is again possible to efficiently flush the least recently used cached objects, so this change switches flushing from MRU to LRU.

The new bitmap implementation uses a multi-level summary approach to make finding the lowest available region very fast. In practice, bitmaps only have one or two levels, though the implementation is general enough to handle extremely large bitmaps, mainly so that large page sizes can still be entertained.

Fix tcache_bin_flush_large() to always flush statistics, in the same way that tcache_bin_flush_small() was recently fixed.

Use JEMALLOC_DEBUG rather than NDEBUG.

Add dassert(), and use it for debug-only asserts.
1 parent 77f350b commit 84c8eef

File tree

15 files changed

+702
-139
lines changed

15 files changed

+702
-139
lines changed

jemalloc/Makefile.in

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ BINS := @srcroot@bin/pprof
4646
CHDRS := @objroot@include/jemalloc/jemalloc@install_suffix@.h \
4747
@objroot@include/jemalloc/jemalloc_defs@install_suffix@.h
4848
CSRCS := @srcroot@src/jemalloc.c @srcroot@src/arena.c @srcroot@src/base.c \
49-
@srcroot@src/chunk.c @srcroot@src/chunk_dss.c \
49+
@srcroot@src/bitmap.c @srcroot@src/chunk.c @srcroot@src/chunk_dss.c \
5050
@srcroot@src/chunk_mmap.c @srcroot@src/chunk_swap.c @srcroot@src/ckh.c \
5151
@srcroot@src/ctl.c @srcroot@src/extent.c @srcroot@src/hash.c \
5252
@srcroot@src/huge.c @srcroot@src/mb.c @srcroot@src/mutex.c \
@@ -65,8 +65,9 @@ DOCS_HTML := $(DOCS_XML:@objroot@%.xml=@srcroot@%.html)
6565
DOCS_MAN3 := $(DOCS_XML:@objroot@%.xml=@srcroot@%.3)
6666
DOCS := $(DOCS_HTML) $(DOCS_MAN3)
6767
CTESTS := @srcroot@test/allocated.c @srcroot@test/allocm.c \
68-
@srcroot@test/mremap.c @srcroot@test/posix_memalign.c \
69-
@srcroot@test/rallocm.c @srcroot@test/thread_arena.c
68+
@srcroot@test/bitmap.c @srcroot@test/mremap.c \
69+
@srcroot@test/posix_memalign.c @srcroot@test/rallocm.c \
70+
@srcroot@test/thread_arena.c
7071

7172
.PHONY: all dist doc_html doc_man doc
7273
.PHONY: install_bin install_include install_lib
@@ -127,6 +128,9 @@ doc: $(DOCS)
127128
$(CC) $(CFLAGS) -c $(CPPFLAGS) -I@objroot@test -o $@ $<
128129
@$(SHELL) -ec "$(CC) -MM $(CPPFLAGS) -I@objroot@test $< | sed \"s/\($(subst /,\/,$(notdir $(basename $@)))\)\.o\([ :]*\)/$(subst /,\/,$(strip $(dir $@)))\1.o \2/g\" > $(@:%.o=%.d)"
129130

131+
# Automatic dependency generation misses #include "*.c".
132+
@objroot@test/bitmap.o : @objroot@src/bitmap.o
133+
130134
@objroot@test/%: @objroot@test/%.o \
131135
@objroot@lib/libjemalloc@install_suffix@.$(SO)
132136
@mkdir -p $(@D)

jemalloc/configure.ac

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,16 @@ else
132132
fi
133133
AC_DEFINE_UNQUOTED([LG_SIZEOF_INT], [$LG_SIZEOF_INT])
134134

135+
AC_CHECK_SIZEOF([long])
136+
if test "x${ac_cv_sizeof_long}" = "x8" ; then
137+
LG_SIZEOF_LONG=3
138+
elif test "x${ac_cv_sizeof_long}" = "x4" ; then
139+
LG_SIZEOF_LONG=2
140+
else
141+
AC_MSG_ERROR([Unsupported long size: ${ac_cv_sizeof_long}])
142+
fi
143+
AC_DEFINE_UNQUOTED([LG_SIZEOF_LONG], [$LG_SIZEOF_LONG])
144+
135145
AC_CANONICAL_HOST
136146
dnl CPU-specific settings.
137147
CPU_SPINWAIT=""
@@ -752,6 +762,14 @@ if test "x${enable_tls}" = "x0" ; then
752762
AC_DEFINE_UNQUOTED([NO_TLS], [ ])
753763
fi
754764

765+
dnl ============================================================================
766+
dnl Check for ffsl(3), and fail if not found. This function exists on all
767+
dnl platforms that jemalloc currently has a chance of functioning on without
768+
dnl modification.
769+
770+
AC_CHECK_FUNC([ffsl], [],
771+
[AC_MSG_ERROR([Cannot build without ffsl(3)])])
772+
755773
dnl ============================================================================
756774
dnl Check for allocator-related functions that should be wrapped.
757775

jemalloc/include/jemalloc/internal/arena.h

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -209,18 +209,15 @@ struct arena_run_s {
209209
/* Bin this run is associated with. */
210210
arena_bin_t *bin;
211211

212-
/* Stack of available freed regions, or NULL. */
213-
void *avail;
214-
215-
/* Next region that has never been allocated, or run boundary. */
216-
void *next;
212+
/* Index of next region that has never been allocated, or nregs. */
213+
uint32_t nextind;
217214

218215
/* Number of free regions in run. */
219216
unsigned nfree;
220217
};
221218

222219
/*
223-
* Read-only information associated with each element for arena_t's bins array
220+
* Read-only information associated with each element of arena_t's bins array
224221
* is stored separately, partly to reduce memory usage (only one copy, rather
225222
* than one per arena), but mainly to avoid false cacheline sharing.
226223
*/
@@ -234,6 +231,18 @@ struct arena_bin_info_s {
234231
/* Total number of regions in a run for this bin's size class. */
235232
uint32_t nregs;
236233

234+
/*
235+
* Offset of first bitmap_t element in a run header for this bin's size
236+
* class.
237+
*/
238+
uint32_t bitmap_offset;
239+
240+
/*
241+
* Metadata used to manipulate bitmaps for runs associated with this
242+
* bin.
243+
*/
244+
bitmap_info_t bitmap_info;
245+
237246
#ifdef JEMALLOC_PROF
238247
/*
239248
* Offset of first (prof_ctx_t *) in a run header for this bin's size
@@ -397,7 +406,7 @@ struct arena_s {
397406

398407
extern size_t opt_lg_qspace_max;
399408
extern size_t opt_lg_cspace_max;
400-
extern ssize_t opt_lg_dirty_mult;
409+
extern ssize_t opt_lg_dirty_mult;
401410
/*
402411
* small_size2bin is a compact lookup table that rounds request sizes up to
403412
* size classes. In order to reduce cache footprint, the table is compressed,
@@ -498,7 +507,13 @@ arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info, const void *ptr)
498507
unsigned shift, diff, regind;
499508
size_t size;
500509

501-
assert(run->magic == ARENA_RUN_MAGIC);
510+
dassert(run->magic == ARENA_RUN_MAGIC);
511+
/*
512+
* Freeing a pointer lower than region zero can cause assertion
513+
* failure.
514+
*/
515+
assert((uintptr_t)ptr >= (uintptr_t)run +
516+
(uintptr_t)bin_info->reg0_offset);
502517

503518
/*
504519
* Avoid doing division with a variable divisor if possible. Using
@@ -583,7 +598,7 @@ arena_prof_ctx_get(const void *ptr)
583598
arena_bin_info_t *bin_info = &arena_bin_info[binind];
584599
unsigned regind;
585600

586-
assert(run->magic == ARENA_RUN_MAGIC);
601+
dassert(run->magic == ARENA_RUN_MAGIC);
587602
regind = arena_run_regind(run, bin_info, ptr);
588603
ret = *(prof_ctx_t **)((uintptr_t)run +
589604
bin_info->ctx0_offset + (regind *
@@ -618,7 +633,7 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
618633
arena_bin_info_t *bin_info;
619634
unsigned regind;
620635

621-
assert(run->magic == ARENA_RUN_MAGIC);
636+
dassert(run->magic == ARENA_RUN_MAGIC);
622637
binind = arena_bin_index(chunk->arena, bin);
623638
bin_info = &arena_bin_info[binind];
624639
regind = arena_run_regind(run, bin_info, ptr);
@@ -639,7 +654,7 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
639654
arena_chunk_map_t *mapelm;
640655

641656
assert(arena != NULL);
642-
assert(arena->magic == ARENA_MAGIC);
657+
dassert(arena->magic == ARENA_MAGIC);
643658
assert(chunk->arena == arena);
644659
assert(ptr != NULL);
645660
assert(CHUNK_ADDR2BASE(ptr) != ptr);
@@ -662,9 +677,9 @@ arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr)
662677
run = (arena_run_t *)((uintptr_t)chunk +
663678
(uintptr_t)((pageind - (mapelm->bits >>
664679
PAGE_SHIFT)) << PAGE_SHIFT));
665-
assert(run->magic == ARENA_RUN_MAGIC);
680+
dassert(run->magic == ARENA_RUN_MAGIC);
666681
bin = run->bin;
667-
#ifndef NDEBUG
682+
#ifdef JEMALLOC_DEBUG
668683
{
669684
size_t binind = arena_bin_index(arena, bin);
670685
arena_bin_info_t *bin_info =
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
/******************************************************************************/
2+
#ifdef JEMALLOC_H_TYPES
3+
4+
/* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */
5+
#define LG_BITMAP_MAXBITS 18
6+
7+
typedef struct bitmap_level_s bitmap_level_t;
8+
typedef struct bitmap_info_s bitmap_info_t;
9+
typedef unsigned long bitmap_t;
10+
#define LG_SIZEOF_BITMAP LG_SIZEOF_LONG
11+
12+
/* Number of bits per group. */
13+
#define LG_BITMAP_GROUP_NBITS (LG_SIZEOF_BITMAP + 3)
14+
#define BITMAP_GROUP_NBITS (ZU(1) << LG_BITMAP_GROUP_NBITS)
15+
#define BITMAP_GROUP_NBITS_MASK (BITMAP_GROUP_NBITS-1)
16+
17+
/* Maximum number of levels possible. */
18+
#define BITMAP_MAX_LEVELS \
19+
(LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP) \
20+
+ !!(LG_BITMAP_MAXBITS % LG_SIZEOF_BITMAP)
21+
22+
#endif /* JEMALLOC_H_TYPES */
23+
/******************************************************************************/
24+
#ifdef JEMALLOC_H_STRUCTS
25+
26+
/*
 * One level of the bitmap's multi-level summary structure.  Levels are stored
 * bottom to top within a single flat bitmap_t array; see bitmap_info_s.
 */
struct bitmap_level_s {
27+
/* Offset of this level's groups within the array of groups. */
28+
size_t group_offset;
29+
};
30+
31+
/*
 * Read-only metadata describing a bitmap's shape: the logical bit count and
 * where each summary level's groups live within the flat bitmap_t array.
 * One bitmap_info_t can be shared by many bitmaps of the same shape.
 */
struct bitmap_info_s {
32+
/* Logical number of bits in bitmap (stored at bottom level). */
33+
size_t nbits;
34+
35+
/* Number of levels necessary for nbits. */
36+
unsigned nlevels;
37+
38+
/*
39+
* Only the first (nlevels+1) elements are used, and levels are ordered
40+
* bottom to top (e.g. the bottom level is stored in levels[0]).
41+
*/
42+
bitmap_level_t levels[BITMAP_MAX_LEVELS+1];
43+
};
44+
45+
#endif /* JEMALLOC_H_STRUCTS */
46+
/******************************************************************************/
47+
#ifdef JEMALLOC_H_EXTERNS
48+
49+
void bitmap_info_init(bitmap_info_t *binfo, size_t nbits);
50+
size_t bitmap_info_ngroups(const bitmap_info_t *binfo);
51+
size_t bitmap_size(size_t nbits);
52+
void bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo);
53+
54+
#endif /* JEMALLOC_H_EXTERNS */
55+
/******************************************************************************/
56+
#ifdef JEMALLOC_H_INLINES
57+
58+
#ifndef JEMALLOC_ENABLE_INLINE
59+
bool bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo);
60+
bool bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
61+
void bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
62+
size_t bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo);
63+
void bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
64+
#endif
65+
66+
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_BITMAP_C_))
67+
JEMALLOC_INLINE bool
68+
bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo)
69+
{
70+
unsigned rgoff = binfo->levels[binfo->nlevels].group_offset - 1;
71+
bitmap_t rg = bitmap[rgoff];
72+
/* The bitmap is full iff the root group is 0. */
73+
return (rg == 0);
74+
}
75+
76+
JEMALLOC_INLINE bool
77+
bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
78+
{
79+
size_t goff;
80+
bitmap_t g;
81+
82+
assert(bit < binfo->nbits);
83+
goff = bit >> LG_BITMAP_GROUP_NBITS;
84+
g = bitmap[goff];
85+
return (!(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))));
86+
}
87+
88+
JEMALLOC_INLINE void
89+
bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
90+
{
91+
size_t goff;
92+
bitmap_t *gp;
93+
bitmap_t g;
94+
95+
assert(bit < binfo->nbits);
96+
assert(bitmap_get(bitmap, binfo, bit) == false);
97+
goff = bit >> LG_BITMAP_GROUP_NBITS;
98+
gp = &bitmap[goff];
99+
g = *gp;
100+
assert(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK)));
101+
g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
102+
*gp = g;
103+
assert(bitmap_get(bitmap, binfo, bit));
104+
/* Propagate group state transitions up the tree. */
105+
if (g == 0) {
106+
unsigned i;
107+
for (i = 1; i < binfo->nlevels; i++) {
108+
bit = goff;
109+
goff = bit >> LG_BITMAP_GROUP_NBITS;
110+
gp = &bitmap[binfo->levels[i].group_offset + goff];
111+
g = *gp;
112+
assert(g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK)));
113+
g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
114+
*gp = g;
115+
if (g != 0)
116+
break;
117+
}
118+
}
119+
}
120+
121+
/* sfu: set first unset. */
122+
JEMALLOC_INLINE size_t
123+
bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo)
124+
{
125+
size_t bit;
126+
bitmap_t g;
127+
unsigned i;
128+
129+
assert(bitmap_full(bitmap, binfo) == false);
130+
131+
i = binfo->nlevels - 1;
132+
g = bitmap[binfo->levels[i].group_offset];
133+
bit = ffsl(g) - 1;
134+
while (i > 0) {
135+
i--;
136+
g = bitmap[binfo->levels[i].group_offset + bit];
137+
bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffsl(g) - 1);
138+
}
139+
140+
bitmap_set(bitmap, binfo, bit);
141+
return (bit);
142+
}
143+
144+
JEMALLOC_INLINE void
145+
bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
146+
{
147+
size_t goff;
148+
bitmap_t *gp;
149+
bitmap_t g;
150+
bool propagate;
151+
152+
assert(bit < binfo->nbits);
153+
assert(bitmap_get(bitmap, binfo, bit));
154+
goff = bit >> LG_BITMAP_GROUP_NBITS;
155+
gp = &bitmap[goff];
156+
g = *gp;
157+
propagate = (g == 0);
158+
assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK))) == 0);
159+
g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
160+
*gp = g;
161+
assert(bitmap_get(bitmap, binfo, bit) == false);
162+
/* Propagate group state transitions up the tree. */
163+
if (propagate) {
164+
unsigned i;
165+
for (i = 1; i < binfo->nlevels; i++) {
166+
bit = goff;
167+
goff = bit >> LG_BITMAP_GROUP_NBITS;
168+
gp = &bitmap[binfo->levels[i].group_offset + goff];
169+
g = *gp;
170+
propagate = (g == 0);
171+
assert((g & (1LU << (bit & BITMAP_GROUP_NBITS_MASK)))
172+
== 0);
173+
g ^= 1LU << (bit & BITMAP_GROUP_NBITS_MASK);
174+
*gp = g;
175+
if (propagate == false)
176+
break;
177+
}
178+
}
179+
}
180+
181+
#endif
182+
183+
#endif /* JEMALLOC_H_INLINES */
184+
/******************************************************************************/

0 commit comments

Comments
 (0)