Skip to content

Commit cf862e4

Browse files
committed
implement spin master
1 parent 28d0e1e commit cf862e4

File tree

4 files changed

+175
-65
lines changed

4 files changed

+175
-65
lines changed

src/gc.c

Lines changed: 170 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,12 @@ extern "C" {
1616
int jl_n_markthreads;
1717
// Number of GC threads that may run concurrent sweeping (0 or 1)
1818
int jl_n_sweepthreads;
19-
// Number of threads currently running the GC mark-loop
20-
_Atomic(int) gc_n_threads_marking;
2119
// `tid` of mutator thread that triggered GC
22-
_Atomic(int) gc_master_tid;
20+
_Atomic(int) gc_mutator_aux_tid;
2321
// `tid` of first GC thread
2422
int gc_first_tid;
25-
// Mutex/cond used to synchronize wakeup of GC threads on parallel marking
26-
uv_mutex_t gc_threads_lock;
27-
uv_cond_t gc_threads_cond;
23+
// Number of threads running the GC mark-loop
24+
_Atomic(int) gc_n_threads_marking;
2825
// To indicate whether concurrent sweeping should run
2926
uv_sem_t gc_sweep_assists_needed;
3027

@@ -2745,10 +2742,10 @@ JL_EXTENSION NOINLINE void gc_mark_loop_serial(jl_ptls_t ptls)
27452742
void gc_mark_and_steal(jl_ptls_t ptls)
27462743
{
27472744
jl_gc_markqueue_t *mq = &ptls->mark_queue;
2748-
jl_gc_markqueue_t *mq_master = NULL;
2749-
int master_tid = jl_atomic_load(&gc_master_tid);
2750-
if (master_tid != -1)
2751-
mq_master = &gc_all_tls_states[master_tid]->mark_queue;
2745+
jl_gc_markqueue_t *mq_mutator_aux = NULL;
2746+
int mutator_aux_tid = jl_atomic_load(&gc_mutator_aux_tid);
2747+
if (mutator_aux_tid != -1)
2748+
mq_mutator_aux = &gc_all_tls_states[mutator_aux_tid]->mark_queue;
27522749
void *new_obj;
27532750
jl_gc_chunk_t c;
27542751
pop : {
@@ -2792,8 +2789,8 @@ void gc_mark_and_steal(jl_ptls_t ptls)
27922789
}
27932790
}
27942791
// Try to steal chunk from master thread
2795-
if (mq_master != NULL) {
2796-
c = gc_chunkqueue_steal_from(mq_master);
2792+
if (mq_mutator_aux != NULL) {
2793+
c = gc_chunkqueue_steal_from(mq_mutator_aux);
27972794
if (c.cid != GC_empty_chunk) {
27982795
gc_mark_chunk(ptls, mq, &c);
27992796
goto pop;
@@ -2815,31 +2812,143 @@ void gc_mark_and_steal(jl_ptls_t ptls)
28152812
goto mark;
28162813
}
28172814
// Try to steal pointer from master thread
2818-
if (mq_master != NULL) {
2819-
new_obj = gc_ptr_queue_steal_from(mq_master);
2815+
if (mq_mutator_aux != NULL) {
2816+
new_obj = gc_ptr_queue_steal_from(mq_mutator_aux);
28202817
if (new_obj != NULL)
28212818
goto mark;
28222819
}
28232820
}
28242821
}
28252822

2826-
void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
2823+
#define GC_PTR_MARK_WORK (1)
2824+
#define GC_CHUNK_MARK_WORK (1 << 10)
2825+
#define GC_MARK_WORK_TO_N_THREADS (1 << 3)
2826+
2827+
int64_t gc_estimate_mark_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT
28272828
{
2828-
int backoff = GC_BACKOFF_MIN;
2829-
if (master) {
2830-
jl_atomic_store(&gc_master_tid, ptls->tid);
2831-
// Wake threads up and try to do some work
2832-
uv_mutex_lock(&gc_threads_lock);
2833-
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
2834-
uv_cond_broadcast(&gc_threads_cond);
2835-
uv_mutex_unlock(&gc_threads_lock);
2829+
int64_t work = 0;
2830+
work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) -
2831+
jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top)) * GC_PTR_MARK_WORK;
2832+
work += (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) -
2833+
jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top)) * GC_CHUNK_MARK_WORK;
2834+
return work;
2835+
}
2836+
2837+
int64_t gc_estimate_mark_work(void) JL_NOTSAFEPOINT
2838+
{
2839+
int64_t work = 0;
2840+
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
2841+
jl_ptls_t ptls2 = gc_all_tls_states[i];
2842+
work += gc_estimate_mark_work_in_queue(ptls2);
2843+
}
2844+
int mutator_aux_tid = jl_atomic_load(&gc_mutator_aux_tid);
2845+
if (mutator_aux_tid != -1) {
2846+
jl_ptls_t ptls2 = gc_all_tls_states[mutator_aux_tid];
2847+
work += gc_estimate_mark_work_in_queue(ptls2);
2848+
}
2849+
return work;
2850+
}
2851+
2852+
int64_t gc_n_threads_marking_ub(void)
2853+
{
2854+
int64_t n_threads_marking_ub = 0;
2855+
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
2856+
jl_ptls_t ptls2 = gc_all_tls_states[i];
2857+
if (jl_atomic_load(&ptls2->gc_state) == JL_GC_STATE_PARALLEL) {
2858+
n_threads_marking_ub++;
2859+
}
2860+
}
2861+
return n_threads_marking_ub;
2862+
}
2863+
2864+
void gc_wake_mark_thread(jl_ptls_t ptls2)
2865+
{
2866+
uv_mutex_lock(&ptls2->sleep_lock);
2867+
jl_atomic_store(&ptls2->gc_state, JL_GC_STATE_PARALLEL);
2868+
uv_cond_signal(&ptls2->wake_signal);
2869+
uv_mutex_unlock(&ptls2->sleep_lock);
2870+
}
2871+
2872+
// Spin master scheduler: based on Hassanein's
2873+
// `Understanding and Improving JVM GC Work Stealing at the Data Center Scale`
2874+
void gc_spin_master_sched(void)
2875+
{
2876+
while (1) {
2877+
int64_t work = gc_estimate_mark_work();
2878+
int64_t n_threads_marking_ub = gc_n_threads_marking_ub();
2879+
// parallel marking should terminate
2880+
if (work == 0 && n_threads_marking_ub == 0) {
2881+
return;
2882+
}
2883+
// too much work for too few threads
2884+
if (work >= n_threads_marking_ub * GC_MARK_WORK_TO_N_THREADS) {
2885+
int64_t n_threads_marking_ideal = work / GC_MARK_WORK_TO_N_THREADS;
2886+
// try to convert GC threads to workers
2887+
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
2888+
jl_ptls_t ptls2 = gc_all_tls_states[i];
2889+
if (jl_atomic_load(&ptls2->gc_state) == JL_GC_STATE_WAITING) {
2890+
gc_wake_mark_thread(ptls2);
2891+
n_threads_marking_ub++;
2892+
if (n_threads_marking_ub >= n_threads_marking_ideal) {
2893+
break;
2894+
}
2895+
}
2896+
}
2897+
}
2898+
jl_cpu_pause();
2899+
}
2900+
}
2901+
2902+
#define GC_BACKOFF_MIN 4
2903+
#define GC_BACKOFF_MAX 12
2904+
2905+
STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT
2906+
{
2907+
if (*i < GC_BACKOFF_MAX) {
2908+
(*i)++;
2909+
}
2910+
for (int j = 0; j < (1 << *i); j++) {
2911+
jl_cpu_pause();
2912+
}
2913+
}
2914+
2915+
void gc_exp_backoff_sched(jl_ptls_t ptls)
2916+
{
2917+
// Wake threads up and try to do some work
2918+
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
2919+
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
2920+
jl_ptls_t ptls2 = gc_all_tls_states[i];
2921+
gc_wake_mark_thread(ptls2);
2922+
}
2923+
gc_mark_and_steal(ptls);
2924+
jl_atomic_fetch_add(&gc_n_threads_marking, -1);
2925+
}
2926+
2927+
STATIC_INLINE int gc_may_mark(jl_ptls_t ptls)
2928+
{
2929+
return (jl_atomic_load(&ptls->gc_state) == JL_GC_STATE_PARALLEL) || (jl_atomic_load(&gc_n_threads_marking) > 0);
2930+
}
2931+
2932+
void gc_mark_loop_worker_spin_master(jl_ptls_t ptls)
2933+
{
2934+
while (1) {
2935+
uv_mutex_lock(&ptls->sleep_lock);
2936+
while (!gc_may_mark(ptls)) {
2937+
uv_cond_wait(&ptls->wake_signal, &ptls->sleep_lock);
2938+
}
2939+
uv_mutex_unlock(&ptls->sleep_lock);
28362940
gc_mark_and_steal(ptls);
2837-
jl_atomic_fetch_add(&gc_n_threads_marking, -1);
2941+
jl_atomic_store(&ptls->gc_state, JL_GC_STATE_WAITING);
28382942
}
2943+
}
2944+
2945+
void gc_mark_loop_worker_exp_backoff(jl_ptls_t ptls)
2946+
{
2947+
int backoff = GC_BACKOFF_MIN;
28392948
while (jl_atomic_load(&gc_n_threads_marking) > 0) {
28402949
// Try to become a thief while other threads are marking
28412950
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
2842-
if (jl_atomic_load(&gc_master_tid) != -1) {
2951+
if (jl_atomic_load(&gc_mutator_aux_tid) != -1) {
28432952
gc_mark_and_steal(ptls);
28442953
}
28452954
jl_atomic_fetch_add(&gc_n_threads_marking, -1);
@@ -2848,21 +2957,48 @@ void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
28482957
}
28492958
}
28502959

2851-
void gc_mark_loop(jl_ptls_t ptls)
2960+
STATIC_INLINE int gc_use_spin_master_sched(void)
28522961
{
2853-
if (jl_n_markthreads == 0 || gc_heap_snapshot_enabled) {
2854-
gc_mark_loop_serial(ptls);
2962+
// Use the spin master scheduler if there are at least 8 (7 GC + 1 mutator)
2963+
// threads that are able to run the GC mark-loop
2964+
return (jl_n_markthreads >= 7);
2965+
}
2966+
2967+
void gc_mark_loop_worker(jl_ptls_t ptls)
2968+
{
2969+
if (gc_use_spin_master_sched()) {
2970+
gc_mark_loop_worker_spin_master(ptls);
28552971
}
28562972
else {
2857-
gc_mark_loop_parallel(ptls, 1);
2973+
gc_mark_loop_worker_exp_backoff(ptls);
28582974
}
28592975
}
28602976

2861-
void gc_mark_loop_barrier(void)
2977+
void gc_mark_loop_parallel(jl_ptls_t ptls)
28622978
{
2863-
jl_atomic_store(&gc_master_tid, -1);
2864-
while (jl_atomic_load(&gc_n_threads_marking) != 0) {
2865-
jl_cpu_pause();
2979+
jl_atomic_store(&gc_mutator_aux_tid, ptls->tid);
2980+
if (gc_use_spin_master_sched()) {
2981+
gc_spin_master_sched();
2982+
jl_atomic_store(&gc_mutator_aux_tid, -1);
2983+
}
2984+
else {
2985+
gc_exp_backoff_sched(ptls);
2986+
gc_mark_loop_worker_exp_backoff(ptls);
2987+
// Wait for all threads to finish
2988+
jl_atomic_store(&gc_mutator_aux_tid, -1);
2989+
while (jl_atomic_load(&gc_n_threads_marking) > 0) {
2990+
jl_cpu_pause();
2991+
}
2992+
}
2993+
}
2994+
2995+
void gc_mark_loop(jl_ptls_t ptls)
2996+
{
2997+
if (jl_n_markthreads == 0 || gc_heap_snapshot_enabled) {
2998+
gc_mark_loop_serial(ptls);
2999+
}
3000+
else {
3001+
gc_mark_loop_parallel(ptls);
28663002
}
28673003
}
28683004

@@ -3183,7 +3319,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
31833319
gc_cblist_root_scanner, (collection));
31843320
}
31853321
gc_mark_loop(ptls);
3186-
gc_mark_loop_barrier();
31873322
gc_mark_clean_reclaim_sets();
31883323

31893324
// 4. check for objects to finalize
@@ -3593,9 +3728,8 @@ void jl_gc_init(void)
35933728
JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock");
35943729
uv_mutex_init(&gc_cache_lock);
35953730
uv_mutex_init(&gc_perm_lock);
3596-
uv_mutex_init(&gc_threads_lock);
3597-
uv_cond_init(&gc_threads_cond);
35983731
uv_sem_init(&gc_sweep_assists_needed, 0);
3732+
jl_atomic_store(&gc_mutator_aux_tid, -1);
35993733

36003734
jl_gc_init_page();
36013735
jl_gc_debug_init();

src/gc.h

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -190,19 +190,6 @@ extern jl_gc_global_page_pool_t global_page_pool_lazily_freed;
190190
extern jl_gc_global_page_pool_t global_page_pool_clean;
191191
extern jl_gc_global_page_pool_t global_page_pool_freed;
192192

193-
#define GC_BACKOFF_MIN 4
194-
#define GC_BACKOFF_MAX 12
195-
196-
STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT
197-
{
198-
if (*i < GC_BACKOFF_MAX) {
199-
(*i)++;
200-
}
201-
for (int j = 0; j < (1 << *i); j++) {
202-
jl_cpu_pause();
203-
}
204-
}
205-
206193
// Lock-free stack implementation taken
207194
// from Herlihy's "The Art of Multiprocessor Programming"
208195
// XXX: this is not a general-purpose lock-free stack. We can
@@ -451,16 +438,14 @@ STATIC_INLINE void gc_big_object_link(bigval_t *hdr, bigval_t **list) JL_NOTSAFE
451438
*list = hdr;
452439
}
453440

454-
extern uv_mutex_t gc_threads_lock;
455-
extern uv_cond_t gc_threads_cond;
456441
extern uv_sem_t gc_sweep_assists_needed;
457-
extern _Atomic(int) gc_n_threads_marking;
458442
void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
459443
void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT;
460444
void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_NOTSAFEPOINT;
461445
void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
462446
void gc_mark_loop_serial(jl_ptls_t ptls);
463-
void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
447+
void gc_mark_loop_worker(jl_ptls_t ptls);
448+
void gc_mark_loop_parallel(jl_ptls_t ptls);
464449
void sweep_stack_pools(void);
465450
void jl_gc_debug_init(void);
466451

src/julia_threads.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,8 @@ typedef struct _jl_tls_states_t {
216216
#define JL_GC_STATE_SAFE 2
217217
// gc_state = 2 means the thread is running unmanaged code that can be
218218
// execute at the same time with the GC.
219+
#define JL_GC_STATE_PARALLEL 3
220+
// gc_state = 3 means the thread is running parallel GC code.
219221
_Atomic(int8_t) gc_state; // read from foreign threads
220222
// execution of certain certain impure
221223
// statements is prohibited from certain

src/partr.c

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,6 @@ void jl_init_threadinginfra(void)
107107

108108
void JL_NORETURN jl_finish_task(jl_task_t *t);
109109

110-
111-
static inline int may_mark(void) JL_NOTSAFEPOINT
112-
{
113-
return (jl_atomic_load(&gc_n_threads_marking) > 0);
114-
}
115-
116110
// gc thread mark function
117111
void jl_gc_mark_threadfun(void *arg)
118112
{
@@ -129,12 +123,7 @@ void jl_gc_mark_threadfun(void *arg)
129123
free(targ);
130124

131125
while (1) {
132-
uv_mutex_lock(&gc_threads_lock);
133-
while (!may_mark()) {
134-
uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
135-
}
136-
uv_mutex_unlock(&gc_threads_lock);
137-
gc_mark_loop_parallel(ptls, 0);
126+
gc_mark_loop_worker(ptls);
138127
}
139128
}
140129

0 commit comments

Comments
 (0)