This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 8a03cf0

Implement cache index randomization for large allocations.
Extract szad size quantization into {extent,run}_quantize(), and quantize szad run sizes to the union of valid small region run sizes and large run sizes.

Refactor iteration in arena_run_first_fit() to use run_quantize{,_first,_next}(), and add support for padded large runs.

For large allocations that have no specified alignment constraints, compute a pseudo-random offset from the beginning of the first backing page that is a multiple of the cache line size. Under typical configurations with 4-KiB pages and 64-byte cache lines this results in a uniform distribution among 64 page boundary offsets.

Add the --disable-cache-oblivious option, primarily intended for performance testing.

This resolves #13.
1 parent 6bb54cb commit 8a03cf0
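
As a rough illustration of the randomization the message describes, the sketch below computes a cache-line-multiple offset within a 4-KiB page. The identifiers (prng_state, random_large_offset), the page and cache-line constants, and the particular LCG are assumptions for illustration, not the patch's actual code.

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE  4096     /* assumed 4-KiB pages */
#define CACHELINE  64       /* assumed 64-byte cache lines */
#define LG_OFFSETS 6        /* lg(PAGE_SIZE / CACHELINE) = lg(64) */

static uint64_t prng_state = 88172645463325252ULL;    /* arbitrary nonzero seed */

/* Return a pseudo-random multiple of the cache line size in [0, PAGE_SIZE). */
static size_t
random_large_offset(void)
{
    /* One step of a 64-bit LCG (Knuth's MMIX constants); keep the top 6 bits. */
    prng_state = prng_state * 6364136223846793005ULL + 1442695040888963407ULL;
    uint64_t r = prng_state >> (64 - LG_OFFSETS);

    return (size_t)r * CACHELINE;    /* one of 64 offsets: 0, 64, ..., 4032 */
}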

File tree: 10 files changed, +279 -73 lines changed


ChangeLog

Lines changed: 3 additions & 0 deletions

@@ -101,6 +101,9 @@ found in the git revision history:
   run fragmentation, smaller runs reduce external fragmentation for small size
   classes, and packed (less uniformly aligned) metadata layout improves CPU
   cache set distribution.
+- Randomly distribute large allocation base pointer alignment relative to page
+  boundaries in order to more uniformly utilize CPU cache sets. This can be
+  disabled via the --disable-cache-oblivious configure option.
 - Micro-optimize the fast paths for the public API functions.
 - Refactor thread-specific data to reside in a single structure. This assures
   that only a single TLS read is necessary per call into the public API.

INSTALL

Lines changed: 9 additions & 0 deletions

@@ -185,6 +185,15 @@ any of the following arguments (not a definitive list) to 'configure':
   thread-local variables via the __thread keyword. If TLS is available,
   jemalloc uses it for several purposes.

+--disable-cache-oblivious
+    Disable cache-oblivious large allocation alignment for large allocation
+    requests with no alignment constraints. If this feature is disabled, all
+    large allocations are page-aligned as an implementation artifact, which can
+    severely harm CPU cache utilization. However, the cache-oblivious layout
+    comes at the cost of one extra page per large allocation, which in the
+    most extreme case increases physical memory usage for the 16 KiB size class
+    to 20 KiB.
+
 --with-xslroot=<path>
     Specify where to find DocBook XSL stylesheets when building the
     documentation.
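
Put differently, the pad is a constant one page (4 KiB with the typical page size), so the relative overhead is largest for the smallest large size class (4 KiB on top of 16 KiB, i.e. 25%) and shrinks proportionally as the size class grows.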

configure.ac

Lines changed: 18 additions & 0 deletions

@@ -952,6 +952,23 @@ if test "x$enable_xmalloc" = "x1" ; then
 fi
 AC_SUBST([enable_xmalloc])

+dnl Support cache-oblivious allocation alignment by default.
+AC_ARG_ENABLE([cache-oblivious],
+  [AS_HELP_STRING([--disable-cache-oblivious],
+  [Disable support for cache-oblivious allocation alignment])],
+[if test "x$enable_cache_oblivious" = "xno" ; then
+  enable_cache_oblivious="0"
+else
+  enable_cache_oblivious="1"
+fi
+],
+[enable_cache_oblivious="1"]
+)
+if test "x$enable_cache_oblivious" = "x1" ; then
+  AC_DEFINE([JEMALLOC_CACHE_OBLIVIOUS], [ ])
+fi
+AC_SUBST([enable_cache_oblivious])
+
 dnl ============================================================================
 dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither are found.
 dnl One of those two functions should (theoretically) exist on all platforms

@@ -1663,4 +1680,5 @@ AC_MSG_RESULT([xmalloc : ${enable_xmalloc}])
 AC_MSG_RESULT([munmap : ${enable_munmap}])
 AC_MSG_RESULT([lazy_lock : ${enable_lazy_lock}])
 AC_MSG_RESULT([tls : ${enable_tls}])
+AC_MSG_RESULT([cache-oblivious : ${enable_cache_oblivious}])
 AC_MSG_RESULT([===============================================================================])

include/jemalloc/internal/arena.h

Lines changed: 37 additions & 16 deletions

@@ -290,6 +290,12 @@ struct arena_s {

     uint64_t prof_accumbytes;

+    /*
+     * PRNG state for cache index randomization of large allocation base
+     * pointers.
+     */
+    uint64_t offset_state;
+
     dss_prec_t dss_prec;

     /*

@@ -394,7 +400,15 @@ struct arena_s {
 /******************************************************************************/
 #ifdef JEMALLOC_H_EXTERNS

-extern ssize_t opt_lg_dirty_mult;
+static const size_t large_pad =
+#ifdef JEMALLOC_CACHE_OBLIVIOUS
+    PAGE
+#else
+    0
+#endif
+    ;
+
+extern ssize_t opt_lg_dirty_mult;

 extern arena_bin_info_t arena_bin_info[NBINS];

@@ -475,7 +489,7 @@ void arena_stats_merge(arena_t *arena, const char **dss,
     arena_stats_t *astats, malloc_bin_stats_t *bstats,
     malloc_large_stats_t *lstats, malloc_huge_stats_t *hstats);
 arena_t *arena_new(unsigned ind);
-void arena_boot(void);
+bool arena_boot(void);
 void arena_prefork(arena_t *arena);
 void arena_postfork_parent(arena_t *arena);
 void arena_postfork_child(arena_t *arena);

@@ -721,7 +735,7 @@ arena_mapbits_unallocated_set(arena_chunk_t *chunk, size_t pageind, size_t size,
 {
     size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind);

-    assert((size & PAGE_MASK) == 0);
+    assert(size == PAGE_CEILING(size));
     assert((flags & ~CHUNK_MAP_FLAGS_MASK) == 0);
     assert((flags & (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == flags);
     arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags);

@@ -734,7 +748,7 @@ arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind,
     size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind);
     size_t mapbits = arena_mapbitsp_read(mapbitsp);

-    assert((size & PAGE_MASK) == 0);
+    assert(size == PAGE_CEILING(size));
     assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == 0);
     arena_mapbitsp_write(mapbitsp, size | (mapbits & PAGE_MASK));
 }

@@ -747,7 +761,7 @@ arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size,
     size_t mapbits = arena_mapbitsp_read(mapbitsp);
     size_t unzeroed;

-    assert((size & PAGE_MASK) == 0);
+    assert(size == PAGE_CEILING(size));
     assert((flags & CHUNK_MAP_DIRTY) == flags);
     unzeroed = mapbits & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */
     arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags

@@ -762,7 +776,8 @@ arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind,
     size_t mapbits = arena_mapbitsp_read(mapbitsp);

     assert(binind <= BININD_INVALID);
-    assert(arena_mapbits_large_size_get(chunk, pageind) == LARGE_MINCLASS);
+    assert(arena_mapbits_large_size_get(chunk, pageind) == LARGE_MINCLASS +
+        large_pad);
     arena_mapbitsp_write(mapbitsp, (mapbits & ~CHUNK_MAP_BININD_MASK) |
         (binind << CHUNK_MAP_BININD_SHIFT));
 }

@@ -1107,13 +1122,16 @@ arena_salloc(const void *ptr, bool demote)
              * end up looking at binind to determine that ptr is a
              * small allocation.
              */
-            assert(((uintptr_t)ptr & PAGE_MASK) == 0);
-            ret = arena_mapbits_large_size_get(chunk, pageind);
+            assert(config_cache_oblivious || ((uintptr_t)ptr &
+                PAGE_MASK) == 0);
+            ret = arena_mapbits_large_size_get(chunk, pageind) -
+                large_pad;
             assert(ret != 0);
-            assert(pageind + (ret>>LG_PAGE) <= chunk_npages);
+            assert(pageind + ((ret+large_pad)>>LG_PAGE) <=
+                chunk_npages);
             assert(arena_mapbits_dirty_get(chunk, pageind) ==
                 arena_mapbits_dirty_get(chunk,
-                pageind+(ret>>LG_PAGE)-1));
+                pageind+((ret+large_pad)>>LG_PAGE)-1));
         } else {
             /*
              * Small allocation (possibly promoted to a large

@@ -1157,11 +1175,13 @@ arena_dalloc(tsd_t *tsd, void *ptr, tcache_t *tcache)
             size_t size = arena_mapbits_large_size_get(chunk,
                 pageind);

-            assert(((uintptr_t)ptr & PAGE_MASK) == 0);
+            assert(config_cache_oblivious || ((uintptr_t)ptr &
+                PAGE_MASK) == 0);

-            if (likely(tcache != NULL) && size <= tcache_maxclass)
-                tcache_dalloc_large(tsd, tcache, ptr, size);
-            else {
+            if (likely(tcache != NULL) && size <= tcache_maxclass) {
+                tcache_dalloc_large(tsd, tcache, ptr, size -
+                    large_pad);
+            } else {
                 arena_dalloc_large(extent_node_arena_get(
                     &chunk->node), chunk, ptr);
             }

@@ -1188,7 +1208,7 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache)
              */
             assert(((uintptr_t)ptr & PAGE_MASK) == 0);
             size = arena_mapbits_large_size_get(chunk,
-                pageind);
+                pageind) - large_pad;
         }
     }
     assert(s2u(size) == s2u(arena_salloc(ptr, false)));

@@ -1205,7 +1225,8 @@ arena_sdalloc(tsd_t *tsd, void *ptr, size_t size, tcache_t *tcache)
                 &chunk->node), chunk, ptr, pageind);
         }
     } else {
-        assert(((uintptr_t)ptr & PAGE_MASK) == 0);
+        assert(config_cache_oblivious || ((uintptr_t)ptr &
+            PAGE_MASK) == 0);

         if (likely(tcache != NULL) && size <= tcache_maxclass)
             tcache_dalloc_large(tsd, tcache, ptr, size);
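
A minimal sketch of the bookkeeping the large_pad changes above imply: the run backing a large allocation grows by one page when JEMALLOC_CACHE_OBLIVIOUS is defined, and the usable size is recovered by subtracting the pad from the size recorded in the chunk map. The helper names run_size_for_large() and usable_size_from_run() are hypothetical, not part of the patch.

#include <assert.h>
#include <stddef.h>

#define PAGE 4096                       /* assumed page size */

#ifdef JEMALLOC_CACHE_OBLIVIOUS
static const size_t large_pad = PAGE;   /* one extra page per large run */
#else
static const size_t large_pad = 0;      /* large allocations stay page-aligned */
#endif

/* Hypothetical helpers mirroring the size +/- large_pad adjustments above. */
static size_t
run_size_for_large(size_t usable_size)
{
    return usable_size + large_pad;     /* e.g. a 16-KiB request reserves 20 KiB */
}

static size_t
usable_size_from_run(size_t run_size)
{
    assert(run_size >= large_pad);
    return run_size - large_pad;        /* what arena_salloc() reports back */
}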

include/jemalloc/internal/jemalloc_internal.h.in

Lines changed: 7 additions & 0 deletions

@@ -126,6 +126,13 @@ static const bool config_ivsalloc =
     false
 #endif
     ;
+static const bool config_cache_oblivious =
+#ifdef JEMALLOC_CACHE_OBLIVIOUS
+    true
+#else
+    false
+#endif
+    ;

 #ifdef JEMALLOC_C11ATOMICS
 #include <stdatomic.h>

include/jemalloc/internal/jemalloc_internal_defs.h.in

Lines changed: 6 additions & 0 deletions

@@ -192,6 +192,12 @@
  */
 #undef JEMALLOC_IVSALLOC

+/*
+ * If defined, explicitly attempt to more uniformly distribute large allocation
+ * pointer alignments across all cache indices.
+ */
+#undef JEMALLOC_CACHE_OBLIVIOUS
+
 /*
  * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
  */

include/jemalloc/internal/prng.h

Lines changed: 6 additions & 6 deletions

@@ -26,22 +26,22 @@
  * const uint32_t a, c : See above discussion.
  */
 #define prng32(r, lg_range, state, a, c) do { \
-    assert(lg_range > 0); \
-    assert(lg_range <= 32); \
+    assert((lg_range) > 0); \
+    assert((lg_range) <= 32); \
     \
     r = (state * (a)) + (c); \
     state = r; \
-    r >>= (32 - lg_range); \
+    r >>= (32 - (lg_range)); \
 } while (false)

 /* Same as prng32(), but 64 bits of pseudo-randomness, using uint64_t. */
 #define prng64(r, lg_range, state, a, c) do { \
-    assert(lg_range > 0); \
-    assert(lg_range <= 64); \
+    assert((lg_range) > 0); \
+    assert((lg_range) <= 64); \
     \
     r = (state * (a)) + (c); \
     state = r; \
-    r >>= (64 - lg_range); \
+    r >>= (64 - (lg_range)); \
 } while (false)

 #endif /* JEMALLOC_H_TYPES */
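
The extra parentheses matter once the lg_range argument is an expression rather than a plain identifier. A hedged usage sketch follows; the lg_base variable, the LCG constants, and the include path are illustrative assumptions, not taken from the patch.

#include <assert.h>
#include <stdint.h>
#include "jemalloc/internal/prng.h"   /* assumed include path for prng64() */

void
prng_shift_example(void)
{
    uint64_t r, state = 42;
    unsigned lg_base = 4;

    /* Request lg_base + 2 = 6 pseudo-random bits. */
    prng64(r, lg_base + 2, state,
        6364136223846793005ULL, 1442695040888963407ULL);
    (void)r;

    /*
     * Old body:  r >>= (64 - lg_range);
     *   expanded to r >>= (64 - lg_base + 2), a shift by 62 that keeps
     *   only 2 random bits.
     * New body:  r >>= (64 - (lg_range));
     *   expands to r >>= (64 - (lg_base + 2)), the intended shift by 58.
     */
}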

0 commit comments
