Skip to content

Commit 53b7b8e

Browse files
committed
add support for async backtraces of Tasks on any thread
1 parent c476d84 commit 53b7b8e

File tree

8 files changed

+352
-274
lines changed

8 files changed

+352
-274
lines changed

src/interpreter.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ extern void JL_GC_ENABLEFRAME(interpreter_state*) JL_NOTSAFEPOINT;
6565
// we define this separately so that we can populate the frame before we add it to the backtrace
6666
// it's recommended to mark the containing function with NOINLINE, though not essential
6767
#define JL_GC_ENABLEFRAME(frame) \
68-
((void**)&frame[1])[0] = __builtin_frame_address(0);
68+
jl_signal_fence(); \
69+
((void**)&frame[1])[0] = __builtin_frame_address(0);
6970

7071
#endif
7172

src/julia_internal.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,8 @@ JL_DLLEXPORT void jl_lock_profile(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
204204
JL_DLLEXPORT void jl_unlock_profile(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
205205
JL_DLLEXPORT void jl_lock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
206206
JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
207+
int jl_lock_stackwalk(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
208+
void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
207209

208210
// number of cycles since power-on
209211
static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT
@@ -1163,6 +1165,9 @@ void jl_print_bt_entry_codeloc(jl_bt_element_t *bt_data) JL_NOTSAFEPOINT;
11631165
#ifdef _OS_WINDOWS_
11641166
JL_DLLEXPORT void jl_refresh_dbg_module_list(void);
11651167
#endif
1168+
int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx) JL_NOTSAFEPOINT;
1169+
void jl_thread_resume(int tid) JL_NOTSAFEPOINT;
1170+
11661171
// *to is NULL or malloc'd pointer, from is allowed to be NULL
11671172
STATIC_INLINE char *jl_copy_str(char **to, const char *from) JL_NOTSAFEPOINT
11681173
{

src/julia_threads.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ typedef struct {
109109

110110
// handle to reference an OS thread
111111
#ifdef _OS_WINDOWS_
112-
typedef DWORD jl_thread_t;
112+
typedef HANDLE jl_thread_t;
113113
#else
114114
typedef pthread_t jl_thread_t;
115115
#endif

src/signals-mach.c

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -384,12 +384,12 @@ static void attach_exception_port(thread_port_t thread, int segv_only)
384384
HANDLE_MACH_ERROR("thread_set_exception_ports", ret);
385385
}
386386

387-
static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx)
387+
static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) JL_NOTSAFEPOINT
388388
{
389389
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
390390
if (ptls2 == NULL) // this thread is not alive
391391
return 0;
392-
jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL;
392+
jl_task_t *ct2 = jl_atomic_load_relaxed(&ptls2->current_task);
393393
if (ct2 == NULL) // this thread is already dead
394394
return 0;
395395

@@ -407,18 +407,18 @@ static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx)
407407
return 1;
408408
}
409409

410-
static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t **ctx)
410+
int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
411411
{
412412
(void)timeout;
413-
static host_thread_state_t state;
413+
host_thread_state_t state;
414414
if (!jl_thread_suspend_and_get_state2(tid, &state)) {
415-
*ctx = NULL;
416-
return;
415+
return 0;
417416
}
418-
*ctx = (unw_context_t*)&state;
417+
*ctx = *(unw_context_t*)&state;
418+
return 1;
419419
}
420420

421-
static void jl_thread_resume(int tid, int sig)
421+
void jl_thread_resume(int tid)
422422
{
423423
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
424424
mach_port_t thread = pthread_mach_thread_np(ptls2->system_id);
@@ -593,8 +593,15 @@ static void jl_unlock_profile_mach(int dlsymlock, int keymgr_locked)
593593
jl_unlock_profile();
594594
}
595595

596-
#define jl_lock_profile() int keymgr_locked = jl_lock_profile_mach(1)
597-
#define jl_unlock_profile() jl_unlock_profile_mach(1, keymgr_locked)
596+
// Mach flavour of the stack-walk lock: defers to jl_lock_profile_mach(1) and
// returns its result unchanged. The returned value is what
// jl_unlock_stackwalk() expects to receive as `lockret`.
int jl_lock_stackwalk(void)
{
    int lockret = jl_lock_profile_mach(1);
    return lockret;
}
600+
601+
// Mach flavour of the stack-walk unlock: forwards `lockret` (the value
// produced by jl_lock_stackwalk()) as the second argument of
// jl_unlock_profile_mach(), with 1 as the first argument.
void jl_unlock_stackwalk(int lockret)
{
    const int dlsymlock = 1; // same flag value that jl_lock_stackwalk() passes when locking
    jl_unlock_profile_mach(dlsymlock, lockret);
}
598605

599606
void *mach_profile_listener(void *arg)
600607
{
@@ -691,7 +698,7 @@ void *mach_profile_listener(void *arg)
691698
bt_data_prof[bt_size_cur++].uintptr = 0;
692699
}
693700
// We're done! Resume the thread.
694-
jl_thread_resume(i, 0);
701+
jl_thread_resume(i);
695702
}
696703
jl_unlock_profile_mach(0, keymgr_locked);
697704
if (running) {

src/signals-unix.c

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,18 @@ int exc_reg_is_write_fault(uintptr_t esr) {
291291
#include "signals-mach.c"
292292
#else
293293

294+
// Take the profile lock ahead of a stack walk.
// Always returns 0: this implementation has no state to hand over to
// jl_unlock_stackwalk(), which ignores its argument.
int jl_lock_stackwalk(void)
{
    const int lockret = 0; // nothing to carry over to the unlock call
    jl_lock_profile();
    return lockret;
}
299+
300+
// Release the profile lock taken by jl_lock_stackwalk().
// `lockret` is accepted only to keep the signature uniform; it is not used.
void jl_unlock_stackwalk(int lockret)
{
    jl_unlock_profile();
    (void)lockret; // intentionally unused in this implementation
}
305+
294306

295307
#if defined(_OS_LINUX_) && (defined(_CPU_X86_64_) || defined(_CPU_X86_))
296308
int is_write_fault(void *context) {
@@ -384,12 +396,12 @@ JL_NO_ASAN static void segv_handler(int sig, siginfo_t *info, void *context)
384396
}
385397

386398
#if !defined(JL_DISABLE_LIBUNWIND)
387-
static unw_context_t *signal_context;
399+
static bt_context_t *signal_context;
388400
pthread_mutex_t in_signal_lock;
389401
static pthread_cond_t exit_signal_cond;
390402
static pthread_cond_t signal_caught_cond;
391403

392-
static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t **ctx)
404+
int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
393405
{
394406
struct timespec ts;
395407
clock_gettime(CLOCK_REALTIME, &ts);
@@ -399,9 +411,8 @@ static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t
399411
jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL;
400412
if (ct2 == NULL) {
401413
// this thread is not alive or already dead
402-
*ctx = NULL;
403414
pthread_mutex_unlock(&in_signal_lock);
404-
return;
415+
return 0;
405416
}
406417
jl_atomic_store_release(&ptls2->signal_request, 1);
407418
pthread_kill(ptls2->system_id, SIGUSR2);
@@ -410,9 +421,8 @@ static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t
410421
if (err == ETIMEDOUT) {
411422
sig_atomic_t request = 1;
412423
if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) {
413-
*ctx = NULL;
414424
pthread_mutex_unlock(&in_signal_lock);
415-
return;
425+
return 0;
416426
}
417427
// Request is either now 0 (meaning the other thread is waiting for
418428
// exit_signal_cond already),
@@ -429,15 +439,16 @@ static void jl_thread_suspend_and_get_state(int tid, int timeout, unw_context_t
429439
// checking it is 0, and add an acquire barrier for good measure)
430440
int request = jl_atomic_load_acquire(&ptls2->signal_request);
431441
assert(request == 0); (void) request;
432-
*ctx = signal_context;
442+
jl_atomic_store_release(&ptls2->signal_request, 1); // prepare to resume normally
443+
*ctx = *signal_context;
444+
return 1;
433445
}
434446

435-
static void jl_thread_resume(int tid, int sig)
447+
void jl_thread_resume(int tid)
436448
{
437449
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
438-
jl_atomic_store_release(&ptls2->signal_request, sig == -1 ? 3 : 1);
439450
pthread_cond_broadcast(&exit_signal_cond);
440-
pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge
451+
pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge (so that signal_request doesn't get mixed up)
441452
// The other thread is waiting to leave exit_signal_cond (verify that here by
442453
// checking it is 0, and add an acquire barrier for good measure)
443454
int request = jl_atomic_load_acquire(&ptls2->signal_request);
@@ -472,14 +483,14 @@ CFI_NORETURN
472483
static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size)
473484
{
474485
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0];
475-
unw_context_t *signal_context;
486+
bt_context_t signal_context;
476487
// This also makes sure `sleep` is aborted.
477-
jl_thread_suspend_and_get_state(0, 30, &signal_context);
478-
if (signal_context != NULL) {
488+
if (jl_thread_suspend_and_get_state(0, 30, &signal_context)) {
479489
thread0_exit_signo = signo;
480490
ptls2->bt_size = bt_size; // <= JL_MAX_BT_SIZE
481491
memcpy(ptls2->bt_data, bt_data, ptls2->bt_size * sizeof(bt_data[0]));
482-
jl_thread_resume(0, -1); // resume with message 3 (call jl_exit_thread0_cb)
492+
jl_atomic_store_release(&ptls2->signal_request, 3);
493+
jl_thread_resume(0); // resume with message 3 (call jl_exit_thread0_cb)
483494
}
484495
else {
485496
// thread 0 is gone? just do the exit ourself
@@ -840,28 +851,27 @@ static void *signal_listener(void *arg)
840851
int nthreads = jl_atomic_load_acquire(&jl_n_threads);
841852
bt_size = 0;
842853
#if !defined(JL_DISABLE_LIBUNWIND)
843-
unw_context_t *signal_context;
854+
bt_context_t signal_context;
844855
// sample each thread, round-robin style in reverse order
845856
// (so that thread zero gets notified last)
846857
if (critical || profile) {
847-
jl_lock_profile();
858+
int lockret = jl_lock_stackwalk();
848859
int *randperm;
849860
if (profile)
850861
randperm = profile_get_randperm(nthreads);
851862
for (int idx = nthreads; idx-- > 0; ) {
852863
// Stop the threads in the random or reverse round-robin order.
853864
int i = profile ? randperm[idx] : idx;
854865
// notify thread to stop
855-
jl_thread_suspend_and_get_state(i, 1, &signal_context);
856-
if (signal_context == NULL)
866+
if (!jl_thread_suspend_and_get_state(i, 1, &signal_context))
857867
continue;
858868

859869
// do backtrace on thread contexts for critical signals
860870
// this part must be signal-handler safe
861871
if (critical) {
862872
bt_size += rec_backtrace_ctx(bt_data + bt_size,
863873
JL_MAX_BT_SIZE / nthreads - 1,
864-
signal_context, NULL);
874+
&signal_context, NULL);
865875
bt_data[bt_size++].uintptr = 0;
866876
}
867877

@@ -883,7 +893,7 @@ static void *signal_listener(void *arg)
883893
} else {
884894
// Get backtrace data
885895
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
886-
bt_size_max - bt_size_cur - 1, signal_context, NULL);
896+
bt_size_max - bt_size_cur - 1, &signal_context, NULL);
887897
}
888898
jl_set_safe_restore(old_buf);
889899

@@ -908,9 +918,9 @@ static void *signal_listener(void *arg)
908918
}
909919

910920
// notify thread to resume
911-
jl_thread_resume(i, sig);
921+
jl_thread_resume(i);
912922
}
913-
jl_unlock_profile();
923+
jl_unlock_stackwalk(lockret);
914924
}
915925
#ifndef HAVE_MACH
916926
if (profile && running) {

src/signals-win.c

Lines changed: 70 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,54 @@ JL_DLLEXPORT void jl_install_sigint_handler(void)
344344

345345
static volatile HANDLE hBtThread = 0;
346346

347+
// Suspend the OS thread behind Julia thread `tid` and capture its register
// context (control + integer registers) into `*ctx`.
//
//   tid     - index into jl_all_tls_states
//   timeout - ignored in this implementation
//   ctx     - receives the thread context; must have the size of a CONTEXT
//
// Returns 1 when the thread is now suspended and `*ctx` is filled in; the
// caller is then expected to call jl_thread_resume(tid). Returns 0 when the
// thread has no TLS state or no current task, when it could not be suspended,
// or when its context could not be read (in which case it is resumed again
// before returning).
int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
{
    (void)timeout;
    jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
    if (ptls2 == NULL) // this thread is not alive
        return 0;
    jl_task_t *ct2 = jl_atomic_load_relaxed(&ptls2->current_task);
    if (ct2 == NULL) // this thread is already dead
        return 0;
    HANDLE hThread = ptls2->system_id;
    if ((DWORD)-1 == SuspendThread(hThread))
        return 0;
    assert(sizeof(*ctx) == sizeof(CONTEXT));
    memset(ctx, 0, sizeof(CONTEXT));
    ctx->ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER;
    // GetThreadContext returns nonzero on success, so the failure path is the
    // zero return: undo the suspend and report failure to the caller.
    if (!GetThreadContext(hThread, ctx)) {
        if ((DWORD)-1 == ResumeThread(hThread))
            abort();
        return 0;
    }
    return 1;
}
369+
370+
void jl_thread_resume(int tid)
371+
{
372+
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
373+
HANDLE hThread = ptls2->system_id;
374+
if ((DWORD)-1 == SuspendThread(hThread)) {
375+
fputs("failed to resume main thread! aborting.", stderr);
376+
abort();
377+
}
378+
}
379+
380+
int jl_lock_stackwalk(void)
381+
{
382+
uv_mutex_lock(&jl_in_stackwalk);
383+
jl_lock_profile();
384+
return 0;
385+
}
386+
387+
void jl_unlock_stackwalk(int lockret)
388+
{
389+
(void)lockret;
390+
jl_unlock_profile();
391+
uv_mutex_unlock(&jl_in_stackwalk);
392+
}
393+
394+
347395
static DWORD WINAPI profile_bt( LPVOID lparam )
348396
{
349397
// Note: illegal to use jl_* functions from this thread except for profiling-specific functions
@@ -357,55 +405,41 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
357405
continue;
358406
}
359407
else {
360-
uv_mutex_lock(&jl_in_stackwalk);
361-
jl_lock_profile();
362-
if ((DWORD)-1 == SuspendThread(hMainThread)) {
363-
fputs("failed to suspend main thread. aborting profiling.", stderr);
364-
break;
365-
}
408+
int lockret = jl_lock_stackwalk();
366409
CONTEXT ctxThread;
367-
memset(&ctxThread, 0, sizeof(CONTEXT));
368-
ctxThread.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER;
369-
if (!GetThreadContext(hMainThread, &ctxThread)) {
370-
fputs("failed to get context from main thread. aborting profiling.", stderr);
410+
if (!jl_thread_suspend_and_get_state(0, 0, &ctxThread)) {
411+
jl_unlock_stackwalk(lockret);
412+
fputs("failed to suspend main thread. aborting profiling.", stderr);
371413
jl_profile_stop_timer();
414+
break;
372415
}
373-
else {
374-
// Get backtrace data
375-
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
376-
bt_size_max - bt_size_cur - 1, &ctxThread, NULL);
416+
// Get backtrace data
417+
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur,
418+
bt_size_max - bt_size_cur - 1, &ctxThread, NULL);
377419

378-
jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread
420+
jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread
379421

380-
// store threadid but add 1 as 0 is preserved to indicate end of block
381-
bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
422+
// store threadid but add 1 as 0 is preserved to indicate end of block
423+
bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
382424

383-
// store task id (never null)
384-
bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
425+
// store task id (never null)
426+
bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
385427

386-
// store cpu cycle clock
387-
bt_data_prof[bt_size_cur++].uintptr = cycleclock();
428+
// store cpu cycle clock
429+
bt_data_prof[bt_size_cur++].uintptr = cycleclock();
388430

389-
// store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
390-
bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
431+
// store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
432+
bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
391433

392-
// Mark the end of this block with two 0's
393-
bt_data_prof[bt_size_cur++].uintptr = 0;
394-
bt_data_prof[bt_size_cur++].uintptr = 0;
395-
}
396-
jl_unlock_profile();
397-
uv_mutex_unlock(&jl_in_stackwalk);
398-
if ((DWORD)-1 == ResumeThread(hMainThread)) {
399-
jl_profile_stop_timer();
400-
fputs("failed to resume main thread! aborting.", stderr);
401-
jl_gc_debug_critical_error();
402-
abort();
403-
}
434+
// Mark the end of this block with two 0's
435+
bt_data_prof[bt_size_cur++].uintptr = 0;
436+
bt_data_prof[bt_size_cur++].uintptr = 0;
437+
jl_unlock_stackwalk(lockret);
438+
jl_thread_resume(0);
404439
jl_check_profile_autostop();
405440
}
406441
}
407442
}
408-
jl_unlock_profile();
409443
uv_mutex_unlock(&jl_in_stackwalk);
410444
jl_profile_stop_timer();
411445
hBtThread = 0;

0 commit comments

Comments
 (0)