Skip to content

Commit cf862e4

Browse files
committed
implement spin master
1 parent 28d0e1e commit cf862e4

File tree

4 files changed

+175
-65
lines changed

4 files changed

+175
-65
lines changed

src/gc.c

Lines changed: 170 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,12 @@ extern "C" {
1616
int jl_n_markthreads;
1717
// Number of GC threads that may run concurrent sweeping (0 or 1)
1818
int jl_n_sweepthreads;
19-
// Number of threads currently running the GC mark-loop
20-
_Atomic(int) gc_n_threads_marking;
2119
// `tid` of mutator thread that triggered GC
22-
_Atomic(int) gc_master_tid;
20+
_Atomic(int) gc_mutator_aux_tid;
2321
// `tid` of first GC thread
2422
int gc_first_tid;
25-
// Mutex/cond used to synchronize wakeup of GC threads on parallel marking
26-
uv_mutex_t gc_threads_lock;
27-
uv_cond_t gc_threads_cond;
23+
// Number of threads running the GC mark-loop
24+
_Atomic(int) gc_n_threads_marking;
2825
// To indicate whether concurrent sweeping should run
2926
uv_sem_t gc_sweep_assists_needed;
3027

@@ -2745,10 +2742,10 @@ JL_EXTENSION NOINLINE void gc_mark_loop_serial(jl_ptls_t ptls)
27452742
void gc_mark_and_steal(jl_ptls_t ptls)
27462743
{
27472744
jl_gc_markqueue_t *mq = &ptls->mark_queue;
2748-
jl_gc_markqueue_t *mq_master = NULL;
2749-
int master_tid = jl_atomic_load(&gc_master_tid);
2750-
if (master_tid != -1)
2751-
mq_master = &gc_all_tls_states[master_tid]->mark_queue;
2745+
jl_gc_markqueue_t *mq_mutator_aux = NULL;
2746+
int mutator_aux_tid = jl_atomic_load(&gc_mutator_aux_tid);
2747+
if (mutator_aux_tid != -1)
2748+
mq_mutator_aux = &gc_all_tls_states[mutator_aux_tid]->mark_queue;
27522749
void *new_obj;
27532750
jl_gc_chunk_t c;
27542751
pop : {
@@ -2792,8 +2789,8 @@ void gc_mark_and_steal(jl_ptls_t ptls)
27922789
}
27932790
}
27942791
// Try to steal chunk from master thread
2795-
if (mq_master != NULL) {
2796-
c = gc_chunkqueue_steal_from(mq_master);
2792+
if (mq_mutator_aux != NULL) {
2793+
c = gc_chunkqueue_steal_from(mq_mutator_aux);
27972794
if (c.cid != GC_empty_chunk) {
27982795
gc_mark_chunk(ptls, mq, &c);
27992796
goto pop;
@@ -2815,31 +2812,143 @@ void gc_mark_and_steal(jl_ptls_t ptls)
28152812
goto mark;
28162813
}
28172814
// Try to steal pointer from master thread
2818-
if (mq_master != NULL) {
2819-
new_obj = gc_ptr_queue_steal_from(mq_master);
2815+
if (mq_mutator_aux != NULL) {
2816+
new_obj = gc_ptr_queue_steal_from(mq_mutator_aux);
28202817
if (new_obj != NULL)
28212818
goto mark;
28222819
}
28232820
}
28242821
}
28252822

2826-
void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
2823+
#define GC_PTR_MARK_WORK (1)
2824+
#define GC_CHUNK_MARK_WORK (1 << 10)
2825+
#define GC_MARK_WORK_TO_N_THREADS (1 << 3)
2826+
2827+
int64_t gc_estimate_mark_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT
28272828
{
2828-
int backoff = GC_BACKOFF_MIN;
2829-
if (master) {
2830-
jl_atomic_store(&gc_master_tid, ptls->tid);
2831-
// Wake threads up and try to do some work
2832-
uv_mutex_lock(&gc_threads_lock);
2833-
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
2834-
uv_cond_broadcast(&gc_threads_cond);
2835-
uv_mutex_unlock(&gc_threads_lock);
2829+
int64_t work = 0;
2830+
work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) -
2831+
jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top)) * GC_PTR_MARK_WORK;
2832+
work += (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) -
2833+
jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top)) * GC_CHUNK_MARK_WORK;
2834+
return work;
2835+
}
2836+
2837+
int64_t gc_estimate_mark_work(void) JL_NOTSAFEPOINT
2838+
{
2839+
int64_t work = 0;
2840+
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
2841+
jl_ptls_t ptls2 = gc_all_tls_states[i];
2842+
work += gc_estimate_mark_work_in_queue(ptls2);
2843+
}
2844+
int mutator_aux_tid = jl_atomic_load(&gc_mutator_aux_tid);
2845+
if (mutator_aux_tid != -1) {
2846+
jl_ptls_t ptls2 = gc_all_tls_states[mutator_aux_tid];
2847+
work += gc_estimate_mark_work_in_queue(ptls2);
2848+
}
2849+
return work;
2850+
}
2851+
2852+
int64_t gc_n_threads_marking_ub(void)
2853+
{
2854+
int64_t n_threads_marking_ub = 0;
2855+
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
2856+
jl_ptls_t ptls2 = gc_all_tls_states[i];
2857+
if (jl_atomic_load(&ptls2->gc_state) == JL_GC_STATE_PARALLEL) {
2858+
n_threads_marking_ub++;
2859+
}
2860+
}
2861+
return n_threads_marking_ub;
2862+
}
2863+
2864+
void gc_wake_mark_thread(jl_ptls_t ptls2)
2865+
{
2866+
uv_mutex_lock(&ptls2->sleep_lock);
2867+
jl_atomic_store(&ptls2->gc_state, JL_GC_STATE_PARALLEL);
2868+
uv_cond_signal(&ptls2->wake_signal);
2869+
uv_mutex_unlock(&ptls2->sleep_lock);
2870+
}
2871+
2872+
// Spin master scheduler: based on Hassanein's
2873+
// `Understanding and Improving JVM GC Work Stealing at the Data Center Scale`
2874+
void gc_spin_master_sched(void)
2875+
{
2876+
while (1) {
2877+
int64_t work = gc_estimate_mark_work();
2878+
int64_t n_threads_marking_ub = gc_n_threads_marking_ub();
2879+
// parallel marking should terminate
2880+
if (work == 0 && n_threads_marking_ub == 0) {
2881+
return;
2882+
}
2883+
// too much work for too few threads
2884+
if (work >= n_threads_marking_ub * GC_MARK_WORK_TO_N_THREADS) {
2885+
int64_t n_threads_marking_ideal = work / GC_MARK_WORK_TO_N_THREADS;
2886+
// try to convert GC threads to workers
2887+
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
2888+
jl_ptls_t ptls2 = gc_all_tls_states[i];
2889+
if (jl_atomic_load(&ptls2->gc_state) == JL_GC_STATE_WAITING) {
2890+
gc_wake_mark_thread(ptls2);
2891+
n_threads_marking_ub++;
2892+
if (n_threads_marking_ub >= n_threads_marking_ideal) {
2893+
break;
2894+
}
2895+
}
2896+
}
2897+
}
2898+
jl_cpu_pause();
2899+
}
2900+
}
2901+
2902+
#define GC_BACKOFF_MIN 4
2903+
#define GC_BACKOFF_MAX 12
2904+
2905+
STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT
2906+
{
2907+
if (*i < GC_BACKOFF_MAX) {
2908+
(*i)++;
2909+
}
2910+
for (int j = 0; j < (1 << *i); j++) {
2911+
jl_cpu_pause();
2912+
}
2913+
}
2914+
2915+
void gc_exp_backoff_sched(jl_ptls_t ptls)
2916+
{
2917+
// Wake threads up and try to do some work
2918+
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
2919+
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
2920+
jl_ptls_t ptls2 = gc_all_tls_states[i];
2921+
gc_wake_mark_thread(ptls2);
2922+
}
2923+
gc_mark_and_steal(ptls);
2924+
jl_atomic_fetch_add(&gc_n_threads_marking, -1);
2925+
}
2926+
2927+
STATIC_INLINE int gc_may_mark(jl_ptls_t ptls)
2928+
{
2929+
return (jl_atomic_load(&ptls->gc_state) == JL_GC_STATE_PARALLEL) || (jl_atomic_load(&gc_n_threads_marking) > 0);
2930+
}
2931+
2932+
void gc_mark_loop_worker_spin_master(jl_ptls_t ptls)
2933+
{
2934+
while (1) {
2935+
uv_mutex_lock(&ptls->sleep_lock);
2936+
while (!gc_may_mark(ptls)) {
2937+
uv_cond_wait(&ptls->wake_signal, &ptls->sleep_lock);
2938+
}
2939+
uv_mutex_unlock(&ptls->sleep_lock);
28362940
gc_mark_and_steal(ptls);
2837-
jl_atomic_fetch_add(&gc_n_threads_marking, -1);
2941+
jl_atomic_store(&ptls->gc_state, JL_GC_STATE_WAITING);
28382942
}
2943+
}
2944+
2945+
void gc_mark_loop_worker_exp_backoff(jl_ptls_t ptls)
2946+
{
2947+
int backoff = GC_BACKOFF_MIN;
28392948
while (jl_atomic_load(&gc_n_threads_marking) > 0) {
28402949
// Try to become a thief while other threads are marking
28412950
jl_atomic_fetch_add(&gc_n_threads_marking, 1);
2842-
if (jl_atomic_load(&gc_master_tid) != -1) {
2951+
if (jl_atomic_load(&gc_mutator_aux_tid) != -1) {
28432952
gc_mark_and_steal(ptls);
28442953
}
28452954
jl_atomic_fetch_add(&gc_n_threads_marking, -1);
@@ -2848,21 +2957,48 @@ void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
28482957
}
28492958
}
28502959

2851-
void gc_mark_loop(jl_ptls_t ptls)
2960+
STATIC_INLINE int gc_use_spin_master_sched(void)
28522961
{
2853-
if (jl_n_markthreads == 0 || gc_heap_snapshot_enabled) {
2854-
gc_mark_loop_serial(ptls);
2962+
// Use the spin master scheduler if there are at least 8 (7 GC + 1 mutator)
2963+
// threads that are able to run the GC mark-loop
2964+
return (jl_n_markthreads >= 7);
2965+
}
2966+
2967+
void gc_mark_loop_worker(jl_ptls_t ptls)
2968+
{
2969+
if (gc_use_spin_master_sched()) {
2970+
gc_mark_loop_worker_spin_master(ptls);
28552971
}
28562972
else {
2857-
gc_mark_loop_parallel(ptls, 1);
2973+
gc_mark_loop_worker_exp_backoff(ptls);
28582974
}
28592975
}
28602976

2861-
void gc_mark_loop_barrier(void)
2977+
void gc_mark_loop_parallel(jl_ptls_t ptls)
28622978
{
2863-
jl_atomic_store(&gc_master_tid, -1);
2864-
while (jl_atomic_load(&gc_n_threads_marking) != 0) {
2865-
jl_cpu_pause();
2979+
jl_atomic_store(&gc_mutator_aux_tid, ptls->tid);
2980+
if (gc_use_spin_master_sched()) {
2981+
gc_spin_master_sched();
2982+
jl_atomic_store(&gc_mutator_aux_tid, -1);
2983+
}
2984+
else {
2985+
gc_exp_backoff_sched(ptls);
2986+
gc_mark_loop_worker_exp_backoff(ptls);
2987+
// Wait for all threads to finish
2988+
jl_atomic_store(&gc_mutator_aux_tid, -1);
2989+
while (jl_atomic_load(&gc_n_threads_marking) > 0) {
2990+
jl_cpu_pause();
2991+
}
2992+
}
2993+
}
2994+
2995+
void gc_mark_loop(jl_ptls_t ptls)
2996+
{
2997+
if (jl_n_markthreads == 0 || gc_heap_snapshot_enabled) {
2998+
gc_mark_loop_serial(ptls);
2999+
}
3000+
else {
3001+
gc_mark_loop_parallel(ptls);
28663002
}
28673003
}
28683004

@@ -3183,7 +3319,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
31833319
gc_cblist_root_scanner, (collection));
31843320
}
31853321
gc_mark_loop(ptls);
3186-
gc_mark_loop_barrier();
31873322
gc_mark_clean_reclaim_sets();
31883323

31893324
// 4. check for objects to finalize
@@ -3593,9 +3728,8 @@ void jl_gc_init(void)
35933728
JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock");
35943729
uv_mutex_init(&gc_cache_lock);
35953730
uv_mutex_init(&gc_perm_lock);
3596-
uv_mutex_init(&gc_threads_lock);
3597-
uv_cond_init(&gc_threads_cond);
35983731
uv_sem_init(&gc_sweep_assists_needed, 0);
3732+
jl_atomic_store(&gc_mutator_aux_tid, -1);
35993733

36003734
jl_gc_init_page();
36013735
jl_gc_debug_init();

src/gc.h

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -190,19 +190,6 @@ extern jl_gc_global_page_pool_t global_page_pool_lazily_freed;
190190
extern jl_gc_global_page_pool_t global_page_pool_clean;
191191
extern jl_gc_global_page_pool_t global_page_pool_freed;
192192

193-
#define GC_BACKOFF_MIN 4
194-
#define GC_BACKOFF_MAX 12
195-
196-
STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT
197-
{
198-
if (*i < GC_BACKOFF_MAX) {
199-
(*i)++;
200-
}
201-
for (int j = 0; j < (1 << *i); j++) {
202-
jl_cpu_pause();
203-
}
204-
}
205-
206193
// Lock-free stack implementation taken
207194
// from Herlihy's "The Art of Multiprocessor Programming"
208195
// XXX: this is not a general-purpose lock-free stack. We can
@@ -451,16 +438,14 @@ STATIC_INLINE void gc_big_object_link(bigval_t *hdr, bigval_t **list) JL_NOTSAFE
451438
*list = hdr;
452439
}
453440

454-
extern uv_mutex_t gc_threads_lock;
455-
extern uv_cond_t gc_threads_cond;
456441
extern uv_sem_t gc_sweep_assists_needed;
457-
extern _Atomic(int) gc_n_threads_marking;
458442
void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
459443
void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT;
460444
void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_NOTSAFEPOINT;
461445
void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
462446
void gc_mark_loop_serial(jl_ptls_t ptls);
463-
void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
447+
void gc_mark_loop_worker(jl_ptls_t ptls);
448+
void gc_mark_loop_parallel(jl_ptls_t ptls);
464449
void sweep_stack_pools(void);
465450
void jl_gc_debug_init(void);
466451

src/julia_threads.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,8 @@ typedef struct _jl_tls_states_t {
216216
#define JL_GC_STATE_SAFE 2
217217
// gc_state = 2 means the thread is running unmanaged code that can be
218218
// execute at the same time with the GC.
219+
#define JL_GC_STATE_PARALLEL 3
220+
// gc_state = 3 means the thread is running parallel GC code.
219221
_Atomic(int8_t) gc_state; // read from foreign threads
220222
// execution of certain certain impure
221223
// statements is prohibited from certain

src/partr.c

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,6 @@ void jl_init_threadinginfra(void)
107107

108108
void JL_NORETURN jl_finish_task(jl_task_t *t);
109109

110-
111-
static inline int may_mark(void) JL_NOTSAFEPOINT
112-
{
113-
return (jl_atomic_load(&gc_n_threads_marking) > 0);
114-
}
115-
116110
// gc thread mark function
117111
void jl_gc_mark_threadfun(void *arg)
118112
{
@@ -129,12 +123,7 @@ void jl_gc_mark_threadfun(void *arg)
129123
free(targ);
130124

131125
while (1) {
132-
uv_mutex_lock(&gc_threads_lock);
133-
while (!may_mark()) {
134-
uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
135-
}
136-
uv_mutex_unlock(&gc_threads_lock);
137-
gc_mark_loop_parallel(ptls, 0);
126+
gc_mark_loop_worker(ptls);
138127
}
139128
}
140129

0 commit comments

Comments
 (0)