@@ -16,15 +16,12 @@ extern "C" {
 int jl_n_markthreads;
 // Number of GC threads that may run concurrent sweeping (0 or 1)
 int jl_n_sweepthreads;
-// Number of threads currently running the GC mark-loop
-_Atomic(int) gc_n_threads_marking;
 // `tid` of mutator thread that triggered GC
-_Atomic(int) gc_master_tid;
+_Atomic(int) gc_mutator_aux_tid;
 // `tid` of first GC thread
 int gc_first_tid;
-// Mutex/cond used to synchronize wakeup of GC threads on parallel marking
-uv_mutex_t gc_threads_lock;
-uv_cond_t gc_threads_cond;
+// Number of threads running the GC mark-loop
+_Atomic(int) gc_n_threads_marking;
 // To indicate whether concurrent sweeping should run
 uv_sem_t gc_sweep_assists_needed;
 
@@ -2745,10 +2742,10 @@ JL_EXTENSION NOINLINE void gc_mark_loop_serial(jl_ptls_t ptls)
 void gc_mark_and_steal(jl_ptls_t ptls)
 {
     jl_gc_markqueue_t *mq = &ptls->mark_queue;
-    jl_gc_markqueue_t *mq_master = NULL;
-    int master_tid = jl_atomic_load(&gc_master_tid);
-    if (master_tid != -1)
-        mq_master = &gc_all_tls_states[master_tid]->mark_queue;
+    jl_gc_markqueue_t *mq_mutator_aux = NULL;
+    int mutator_aux_tid = jl_atomic_load(&gc_mutator_aux_tid);
+    if (mutator_aux_tid != -1)
+        mq_mutator_aux = &gc_all_tls_states[mutator_aux_tid]->mark_queue;
     void *new_obj;
     jl_gc_chunk_t c;
     pop : {
@@ -2792,8 +2789,8 @@ void gc_mark_and_steal(jl_ptls_t ptls)
             }
         }
         // Try to steal chunk from master thread
-        if (mq_master != NULL) {
-            c = gc_chunkqueue_steal_from(mq_master);
+        if (mq_mutator_aux != NULL) {
+            c = gc_chunkqueue_steal_from(mq_mutator_aux);
             if (c.cid != GC_empty_chunk) {
                 gc_mark_chunk(ptls, mq, &c);
                 goto pop;
@@ -2815,31 +2812,143 @@ void gc_mark_and_steal(jl_ptls_t ptls)
                 goto mark;
         }
         // Try to steal pointer from master thread
-        if (mq_master != NULL) {
-            new_obj = gc_ptr_queue_steal_from(mq_master);
+        if (mq_mutator_aux != NULL) {
+            new_obj = gc_ptr_queue_steal_from(mq_mutator_aux);
             if (new_obj != NULL)
                 goto mark;
         }
     }
 }
 
-void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
+#define GC_PTR_MARK_WORK (1)
+#define GC_CHUNK_MARK_WORK (1 << 10)
+#define GC_MARK_WORK_TO_N_THREADS (1 << 3)
+
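+// Estimate the mark work pending in a single thread's queues: each queued pointer
+// counts as GC_PTR_MARK_WORK and each queued chunk as GC_CHUNK_MARK_WORK, since a
+// chunk typically expands into many objects to mark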
+int64_t gc_estimate_mark_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT
 {
-    int backoff = GC_BACKOFF_MIN;
-    if (master) {
-        jl_atomic_store(&gc_master_tid, ptls->tid);
-        // Wake threads up and try to do some work
-        uv_mutex_lock(&gc_threads_lock);
-        jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        uv_cond_broadcast(&gc_threads_cond);
-        uv_mutex_unlock(&gc_threads_lock);
+    int64_t work = 0;
+    work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) -
+             jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top)) * GC_PTR_MARK_WORK;
+    work += (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) -
+             jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top)) * GC_CHUNK_MARK_WORK;
+    return work;
+}
+
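+// Estimate the total mark work pending across all GC mark threads and the mutator
+// that triggered GC (if any)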
+int64_t gc_estimate_mark_work(void) JL_NOTSAFEPOINT
+{
+    int64_t work = 0;
+    for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[i];
+        work += gc_estimate_mark_work_in_queue(ptls2);
+    }
+    int mutator_aux_tid = jl_atomic_load(&gc_mutator_aux_tid);
+    if (mutator_aux_tid != -1) {
+        jl_ptls_t ptls2 = gc_all_tls_states[mutator_aux_tid];
+        work += gc_estimate_mark_work_in_queue(ptls2);
+    }
+    return work;
+}
+
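+// Upper bound on the number of GC threads currently running the mark-loop,
+// i.e. those whose `gc_state` is `JL_GC_STATE_PARALLEL`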
+int64_t gc_n_threads_marking_ub(void)
+{
+    int64_t n_threads_marking_ub = 0;
+    for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[i];
+        if (jl_atomic_load(&ptls2->gc_state) == JL_GC_STATE_PARALLEL) {
+            n_threads_marking_ub++;
+        }
+    }
+    return n_threads_marking_ub;
+}
+
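+// Wake a sleeping GC thread for parallel marking: flag it as marking and signal it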
+void gc_wake_mark_thread(jl_ptls_t ptls2)
+{
+    uv_mutex_lock(&ptls2->sleep_lock);
+    jl_atomic_store(&ptls2->gc_state, JL_GC_STATE_PARALLEL);
+    uv_cond_signal(&ptls2->wake_signal);
+    uv_mutex_unlock(&ptls2->sleep_lock);
+}
+
+// Spin master scheduler: based on Hassanein's
+// `Understanding and Improving JVM GC Work Stealing at the Data Center Scale`
+void gc_spin_master_sched(void)
+{
+    while (1) {
+        int64_t work = gc_estimate_mark_work();
+        int64_t n_threads_marking_ub = gc_n_threads_marking_ub();
+        // parallel marking should terminate
+        if (work == 0 && n_threads_marking_ub == 0) {
+            return;
+        }
+        // too much work for too few threads
+        if (work >= n_threads_marking_ub * GC_MARK_WORK_TO_N_THREADS) {
+            int64_t n_threads_marking_ideal = work / GC_MARK_WORK_TO_N_THREADS;
+            // try to convert GC threads to workers
+            for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
+                jl_ptls_t ptls2 = gc_all_tls_states[i];
+                if (jl_atomic_load(&ptls2->gc_state) == JL_GC_STATE_WAITING) {
+                    gc_wake_mark_thread(ptls2);
+                    n_threads_marking_ub++;
+                    if (n_threads_marking_ub >= n_threads_marking_ideal) {
+                        break;
+                    }
+                }
+            }
+        }
+        jl_cpu_pause();
+    }
+}
+
+#define GC_BACKOFF_MIN 4
+#define GC_BACKOFF_MAX 12
+
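+// Exponential backoff: spin for 2^(*i) CPU pauses, doubling the wait on each
+// call up to 2^GC_BACKOFF_MAX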
+STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT
+{
+    if (*i < GC_BACKOFF_MAX) {
+        (*i)++;
+    }
+    for (int j = 0; j < (1 << *i); j++) {
+        jl_cpu_pause();
+    }
+}
+
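+// Exponential-backoff scheduler: wake all GC mark threads, then join the
+// mark-loop as a worker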
+void gc_exp_backoff_sched(jl_ptls_t ptls)
+{
+    // Wake threads up and try to do some work
+    jl_atomic_fetch_add(&gc_n_threads_marking, 1);
+    for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[i];
+        gc_wake_mark_thread(ptls2);
+    }
+    gc_mark_and_steal(ptls);
+    jl_atomic_fetch_add(&gc_n_threads_marking, -1);
+}
+
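+// A thread may enter the mark-loop if it was flagged for parallel marking or
+// if other threads are still marking (so there may be work to steal)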
+STATIC_INLINE int gc_may_mark(jl_ptls_t ptls)
+{
+    return (jl_atomic_load(&ptls->gc_state) == JL_GC_STATE_PARALLEL) || (jl_atomic_load(&gc_n_threads_marking) > 0);
+}
+
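+// Worker loop under the spin master scheduler: sleep until woken by the spin
+// master, mark/steal until out of work, then go back to waiting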
+void gc_mark_loop_worker_spin_master(jl_ptls_t ptls)
+{
+    while (1) {
+        uv_mutex_lock(&ptls->sleep_lock);
+        while (!gc_may_mark(ptls)) {
+            uv_cond_wait(&ptls->wake_signal, &ptls->sleep_lock);
+        }
+        uv_mutex_unlock(&ptls->sleep_lock);
         gc_mark_and_steal(ptls);
-        jl_atomic_fetch_add(&gc_n_threads_marking, -1);
+        jl_atomic_store(&ptls->gc_state, JL_GC_STATE_WAITING);
     }
+}
+
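+// Worker loop under the exponential-backoff scheduler: repeatedly try to steal
+// work while other threads are marking, backing off exponentially on failure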
+void gc_mark_loop_worker_exp_backoff(jl_ptls_t ptls)
+{
+    int backoff = GC_BACKOFF_MIN;
     while (jl_atomic_load(&gc_n_threads_marking) > 0) {
         // Try to become a thief while other threads are marking
         jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        if (jl_atomic_load(&gc_master_tid) != -1) {
+        if (jl_atomic_load(&gc_mutator_aux_tid) != -1) {
             gc_mark_and_steal(ptls);
         }
         jl_atomic_fetch_add(&gc_n_threads_marking, -1);
@@ -2848,21 +2957,48 @@ void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
     }
 }
 
-void gc_mark_loop(jl_ptls_t ptls)
+STATIC_INLINE int gc_use_spin_master_sched(void)
 {
-    if (jl_n_markthreads == 0 || gc_heap_snapshot_enabled) {
-        gc_mark_loop_serial(ptls);
+    // Use the spin master scheduler if there are at least 8 (7 GC + 1 mutator)
+    // threads that are able to run the GC mark-loop
+    return (jl_n_markthreads >= 7);
+}
+
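+// Entry point for GC mark threads: run the worker loop of whichever scheduler
+// is in use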
+void gc_mark_loop_worker(jl_ptls_t ptls)
+{
+    if (gc_use_spin_master_sched()) {
+        gc_mark_loop_worker_spin_master(ptls);
     }
     else {
-        gc_mark_loop_parallel(ptls, 1);
+        gc_mark_loop_worker_exp_backoff(ptls);
     }
 }
 
-void gc_mark_loop_barrier(void)
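+// Called by the mutator that triggered GC: under the spin master scheduler it
+// acts as the spin master; otherwise it wakes the workers, marks alongside
+// them, and waits for parallel marking to finish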
+void gc_mark_loop_parallel(jl_ptls_t ptls)
 {
-    jl_atomic_store(&gc_master_tid, -1);
-    while (jl_atomic_load(&gc_n_threads_marking) != 0) {
-        jl_cpu_pause();
+    jl_atomic_store(&gc_mutator_aux_tid, ptls->tid);
+    if (gc_use_spin_master_sched()) {
+        gc_spin_master_sched();
+        jl_atomic_store(&gc_mutator_aux_tid, -1);
+    }
+    else {
+        gc_exp_backoff_sched(ptls);
+        gc_mark_loop_worker_exp_backoff(ptls);
+        // Wait for all threads to finish
+        jl_atomic_store(&gc_mutator_aux_tid, -1);
+        while (jl_atomic_load(&gc_n_threads_marking) > 0) {
+            jl_cpu_pause();
+        }
+    }
+}
+
+void gc_mark_loop(jl_ptls_t ptls)
+{
+    if (jl_n_markthreads == 0 || gc_heap_snapshot_enabled) {
+        gc_mark_loop_serial(ptls);
+    }
+    else {
+        gc_mark_loop_parallel(ptls);
     }
 }
 
@@ -3183,7 +3319,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
             gc_cblist_root_scanner, (collection));
     }
     gc_mark_loop(ptls);
-    gc_mark_loop_barrier();
     gc_mark_clean_reclaim_sets();
 
     // 4. check for objects to finalize
@@ -3593,9 +3728,8 @@ void jl_gc_init(void)
     JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock");
     uv_mutex_init(&gc_cache_lock);
     uv_mutex_init(&gc_perm_lock);
-    uv_mutex_init(&gc_threads_lock);
-    uv_cond_init(&gc_threads_cond);
     uv_sem_init(&gc_sweep_assists_needed, 0);
+    jl_atomic_store(&gc_mutator_aux_tid, -1);
 
     jl_gc_init_page();
     jl_gc_debug_init();