Skip to content

Commit 06f5fb4

Browse files
gbaraldi authored and KristofferC committed
Use native tls model in macos for better performance (#55576)
macOS has had a native TLS implementation in clang since at least clang 3.7, which is much older than what we require, so let's enable it for some small performance gains. We may want to turn on the ifunc optimization that's there as well, but I haven't tested it and it's only in clang 18 and up, so it would be off for most users since Apple clang is 16 on their latest beta (see llvm/llvm-project#73687).
1 parent cf9c494 commit 06f5fb4

File tree

6 files changed

+18
-55
lines changed

6 files changed

+18
-55
lines changed

cli/loader_lib.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -546,7 +546,7 @@ __attribute__((constructor)) void jl_load_libjulia_internal(void) {
546546
(*jl_codegen_exported_func_addrs[symbol_idx]) = addr;
547547
}
548548
// Next, if we're on Linux/FreeBSD, set up fast TLS.
549-
#if !defined(_OS_WINDOWS_) && !defined(_OS_DARWIN_) && !defined(_OS_OPENBSD_)
549+
#if !defined(_OS_WINDOWS_) && !defined(_OS_OPENBSD_)
550550
void (*jl_pgcstack_setkey)(void*, void*(*)(void)) = lookup_symbol(libjulia_internal, "jl_pgcstack_setkey");
551551
if (jl_pgcstack_setkey == NULL) {
552552
jl_loader_print_stderr("ERROR: Cannot find jl_pgcstack_setkey() function within libjulia-internal!\n");

src/gc-stacks.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ static unsigned select_pool(size_t nb) JL_NOTSAFEPOINT
160160
}
161161

162162

163-
static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT
163+
void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT
164164
{
165165
#ifdef _COMPILER_ASAN_ENABLED_
166166
__asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz);

src/julia_fasttls.h

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,9 @@ extern "C" {
2222

2323
typedef struct _jl_gcframe_t jl_gcframe_t;
2424

25-
#if defined(_OS_DARWIN_)
26-
#include <pthread.h>
27-
typedef void *(jl_get_pgcstack_func)(pthread_key_t); // aka typeof(pthread_getspecific)
28-
#else
2925
typedef jl_gcframe_t **(jl_get_pgcstack_func)(void);
30-
#endif
3126

32-
#if !defined(_OS_DARWIN_) && !defined(_OS_WINDOWS_)
27+
#if !defined(_OS_WINDOWS_)
3328
#define JULIA_DEFINE_FAST_TLS \
3429
static __attribute__((tls_model("local-exec"))) __thread jl_gcframe_t **jl_pgcstack_localexec; \
3530
JL_DLLEXPORT _Atomic(char) jl_pgcstack_static_semaphore; \

src/julia_internal.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1006,9 +1006,7 @@ int jl_safepoint_consume_sigint(void);
10061006
void jl_wake_libuv(void) JL_NOTSAFEPOINT;
10071007

10081008
void jl_set_pgcstack(jl_gcframe_t **) JL_NOTSAFEPOINT;
1009-
#if defined(_OS_DARWIN_)
1010-
typedef pthread_key_t jl_pgcstack_key_t;
1011-
#elif defined(_OS_WINDOWS_)
1009+
#if defined(_OS_WINDOWS_)
10121010
typedef DWORD jl_pgcstack_key_t;
10131011
#else
10141012
typedef jl_gcframe_t ***(*jl_pgcstack_key_t)(void) JL_NOTSAFEPOINT;

src/scheduler.c

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -253,10 +253,7 @@ static void wake_libuv(void) JL_NOTSAFEPOINT
253253
JULIA_DEBUG_SLEEPWAKE( io_wakeup_leave = cycleclock() );
254254
}
255255

256-
/* ensure thread tid is awake if necessary */
257-
JL_DLLEXPORT void jl_wakeup_thread(int16_t tid) JL_NOTSAFEPOINT
258-
{
259-
jl_task_t *ct = jl_current_task;
256+
void wakeup_thread(jl_task_t *ct, int16_t tid) JL_NOTSAFEPOINT { // Pass in ptls when we have it already available to save a lookup
260257
int16_t self = jl_atomic_load_relaxed(&ct->tid);
261258
if (tid != self)
262259
jl_fence(); // [^store_buffering_1]
@@ -311,6 +308,12 @@ JL_DLLEXPORT void jl_wakeup_thread(int16_t tid) JL_NOTSAFEPOINT
311308
JULIA_DEBUG_SLEEPWAKE( wakeup_leave = cycleclock() );
312309
}
313310

311+
/* ensure thread tid is awake if necessary */
312+
JL_DLLEXPORT void jl_wakeup_thread(int16_t tid) JL_NOTSAFEPOINT
313+
{
314+
jl_task_t *ct = jl_current_task;
315+
wakeup_thread(ct, tid);
316+
}
314317

315318
// get the next runnable task
316319
static jl_task_t *get_next_task(jl_value_t *trypoptask, jl_value_t *q)
@@ -447,7 +450,7 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,
447450
// responsibility, so need to make sure thread 0 will take care
448451
// of us.
449452
if (jl_atomic_load_relaxed(&jl_uv_mutex.owner) == NULL) // aka trylock
450-
jl_wakeup_thread(0);
453+
wakeup_thread(ct, 0);
451454
}
452455
if (uvlock) {
453456
int enter_eventloop = may_sleep(ptls);
@@ -575,7 +578,7 @@ void scheduler_delete_thread(jl_ptls_t ptls) JL_NOTSAFEPOINT
575578
else {
576579
jl_atomic_fetch_add_relaxed(&n_threads_running, 1);
577580
}
578-
jl_wakeup_thread(0); // force thread 0 to see that we do not have the IO lock (and am dead)
581+
wakeup_thread(jl_atomic_load_relaxed(&ptls->current_task), 0); // force thread 0 to see that we do not have the IO lock (and am dead)
579582
jl_atomic_fetch_add_relaxed(&n_threads_running, -1);
580583
}
581584

src/threading.c

Lines changed: 5 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -82,51 +82,17 @@ JL_DLLEXPORT void jl_set_safe_restore(jl_jmp_buf *sr)
8282
// The tls_states buffer:
8383
//
8484
// On platforms that do not use ELF (i.e. where `__thread` is emulated with
85-
// lower level API) (Mac, Windows), we use the platform runtime API to create
85+
// lower level API) (Windows), we use the platform runtime API to create
8686
// TLS variable directly.
8787
// This is functionally equivalent to using `__thread` but can be
8888
// more efficient since we can have better control over the creation and
8989
// initialization of the TLS buffer.
9090
//
91-
// On platforms that use ELF (Linux, FreeBSD), we use a `__thread` variable
91+
// On platforms that support native TLS (ELF platforms + Macos) we use a `__thread` variable
9292
// as the fallback in the shared object. For better efficiency, we also
9393
// create a `__thread` variable in the main executable using a static TLS
9494
// model.
95-
#if defined(_OS_DARWIN_)
96-
// Mac doesn't seem to have static TLS model so the runtime TLS getter
97-
// registration will only add overhead to TLS access. The `__thread` variables
98-
// are emulated with `pthread_key_t` so it is actually faster to use it directly.
99-
static pthread_key_t jl_pgcstack_key;
100-
101-
__attribute__((constructor)) void jl_init_tls(void)
102-
{
103-
pthread_key_create(&jl_pgcstack_key, NULL);
104-
}
105-
106-
JL_CONST_FUNC jl_gcframe_t **jl_get_pgcstack(void) JL_NOTSAFEPOINT
107-
{
108-
return (jl_gcframe_t**)pthread_getspecific(jl_pgcstack_key);
109-
}
110-
111-
void jl_set_pgcstack(jl_gcframe_t **pgcstack) JL_NOTSAFEPOINT
112-
{
113-
pthread_setspecific(jl_pgcstack_key, (void*)pgcstack);
114-
}
115-
116-
void jl_pgcstack_getkey(jl_get_pgcstack_func **f, pthread_key_t *k)
117-
{
118-
// for codegen
119-
*f = pthread_getspecific;
120-
*k = jl_pgcstack_key;
121-
}
122-
123-
124-
JL_DLLEXPORT void jl_pgcstack_setkey(jl_get_pgcstack_func *f, pthread_key_t k)
125-
{
126-
jl_safe_printf("ERROR: Attempt to change TLS address.\n");
127-
}
128-
129-
#elif defined(_OS_WINDOWS_)
95+
#if defined(_OS_WINDOWS_)
13096
// Apparently windows doesn't have a static TLS model (or one that can be
13197
// reliably used from a shared library) either..... Use `TLSAlloc` instead.
13298

@@ -464,6 +430,7 @@ void jl_safepoint_resume_all_threads(jl_task_t *ct)
464430

465431
void jl_task_frame_noreturn(jl_task_t *ct) JL_NOTSAFEPOINT;
466432
void scheduler_delete_thread(jl_ptls_t ptls) JL_NOTSAFEPOINT;
433+
void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT;
467434

468435
static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
469436
{
@@ -492,7 +459,7 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER
492459
}
493460
if (signal_stack != NULL) {
494461
if (signal_stack_size)
495-
jl_free_stack(signal_stack, signal_stack_size);
462+
_jl_free_stack(ptls ,signal_stack, signal_stack_size);
496463
else
497464
free(signal_stack);
498465
}

0 commit comments

Comments
 (0)