@@ -21,8 +21,8 @@ int jl_n_sweepthreads;
 _Atomic(int) gc_n_threads_marking;
 // Number of threads sweeping
 _Atomic(int) gc_n_threads_sweeping;
-// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping
-_Atomic(jl_gc_page_stack_t *) gc_allocd_scratch;
+// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing)
+_Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch;
 // `tid` of mutator thread that triggered GC
 _Atomic(int) gc_master_tid;
 // `tid` of first GC thread
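
For context (not part of the patch): a minimal sketch of what a cache-line-padded page stack could look like. The type names mirror those used in the diff, but the actual definition of `jl_gc_padded_page_stack_t` and its padding constant live in the GC headers and may differ.

#include <stdatomic.h>

typedef struct _jl_gc_pagemeta_t jl_gc_pagemeta_t; // opaque here; the real definition lives in the GC headers

// lock-free stack of page metadata, one per thread (shape assumed from the diff)
typedef struct {
    _Atomic(jl_gc_pagemeta_t *) bottom;
} jl_gc_page_stack_t;

// pad each per-thread slot out to a full cache line so sweepers pushing into
// neighboring slots of gc_allocd_scratch do not bounce the same line between cores
typedef union {
    jl_gc_page_stack_t stack;
    char _pad[128]; // assumed cache-line multiple; the real padding constant may differ
} jl_gc_padded_page_stack_t;
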
@@ -1586,8 +1586,72 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
     pg->nfree = nfree;
 }

-void gc_sweep_wake_all(void)
+// pre-scan pages to check whether there are enough pages so that it's worth parallelizing
+// also sweeps pages that don't need to be linearly scanned
+int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
 {
+    // 4MB worth of pages is worth parallelizing
+    const int n_pages_worth_parallel_sweep = (int)(4 * (1 << 20) / GC_PAGE_SZ);
+    int n_pages_to_scan = 0;
+    gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
+    for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        jl_gc_page_stack_t *dest = &new_gc_allocd_scratch[ptls2->tid].stack;
+        jl_gc_page_stack_t tmp;
+        jl_gc_pagemeta_t *tail = NULL;
+        memset(&tmp, 0, sizeof(tmp));
+        while (1) {
+            jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&ptls2->page_metadata_allocd);
+            if (pg == NULL) {
+                break;
+            }
+            int should_scan = 1;
+            if (!pg->has_marked) {
+                should_scan = 0;
+            }
+            if (!current_sweep_full && !pg->has_young) {
+                assert(!prev_sweep_full || pg->prev_nold >= pg->nold);
+                if (!prev_sweep_full || pg->prev_nold == pg->nold) {
+                    should_scan = 0;
+                }
+            }
+            if (should_scan) {
+                if (tail == NULL) {
+                    tail = pg;
+                }
+                n_pages_to_scan++;
+                push_lf_back_nosync(&tmp, pg);
+            }
+            else {
+                gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
+            }
+            if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
+                break;
+            }
+        }
+        if (tail != NULL) {
+            tail->next = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
+        }
+        ptls2->page_metadata_allocd = tmp;
+        if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
+            break;
+        }
+    }
+    gc_page_serializer_destroy(&serializer);
+    return n_pages_to_scan >= n_pages_worth_parallel_sweep;
+}
+
+// wake up all threads to sweep the pages
+void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
+{
+    int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch);
+    jl_atomic_store(&gc_allocd_scratch, new_gc_allocd_scratch);
+    if (!parallel_sweep_worthwhile) {
+        return;
+    }
     uv_mutex_lock(&gc_threads_lock);
     for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[i];
@@ -1597,6 +1661,7 @@ void gc_sweep_wake_all(void)
     uv_mutex_unlock(&gc_threads_lock);
 }

+// wait for all threads to finish sweeping
 void gc_sweep_wait_for_all(void)
 {
     jl_atomic_store(&gc_allocd_scratch, NULL);
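
For orientation (not part of the patch): the wake/wait protocol around these functions reduces to a publish-then-drain handshake. Below is a simplified, standalone sketch with C11 atomics and stand-in names rather than the real globals; the actual code uses Julia's `jl_atomic_*` wrappers.

#include <stdatomic.h>
#include <stddef.h>

static _Atomic(void *) scratch;   // stands in for gc_allocd_scratch
static _Atomic int n_sweeping;    // stands in for gc_n_threads_sweeping

// sweeper side: bump the counter, work only while the scratch pointer is published
void sweeper_step(void)
{
    atomic_fetch_add(&n_sweeping, 1);
    if (atomic_load(&scratch) != NULL) {
        /* ... steal and sweep pages ... */
    }
    atomic_fetch_add(&n_sweeping, -1);
}

// coordinator side: unpublish the scratch space, then wait for sweepers to drain
void wait_for_all_sweepers(void)
{
    atomic_store(&scratch, NULL);
    while (atomic_load(&n_sweeping) != 0) {
        /* spin; the real code can yield or pause the CPU here */
    }
}
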
@@ -1605,36 +1670,58 @@ void gc_sweep_wait_for_all(void)
     }
 }

-void gc_sweep_pool_parallel(void)
+// sweep all pools
+void gc_sweep_pool_parallel(jl_ptls_t ptls)
 {
     jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
-    jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
+    jl_gc_padded_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
     if (allocd_scratch != NULL) {
         gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
         while (1) {
             int found_pg = 0;
+            // sequentially walk the threads and sweep the pages
             for (int t_i = 0; t_i < gc_n_threads; t_i++) {
                 jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+                // skip foreign threads that already exited
                 if (ptls2 == NULL) {
                     continue;
                 }
-                jl_gc_page_stack_t *allocd = &allocd_scratch[t_i];
-                jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd);
+                jl_gc_page_stack_t *dest = &allocd_scratch[ptls2->tid].stack;
+                jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd);
+                // failed steal attempt
                 if (pg == NULL) {
                     continue;
                 }
-                gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg);
+                gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
                 found_pg = 1;
             }
             if (!found_pg) {
-                break;
+                // check for termination
+                int no_more_work = 1;
+                for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+                    jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+                    // skip foreign threads that already exited
+                    if (ptls2 == NULL) {
+                        continue;
+                    }
+                    jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
+                    if (pg != NULL) {
+                        no_more_work = 0;
+                        break;
+                    }
+                }
+                if (no_more_work) {
+                    break;
+                }
             }
+            jl_cpu_pause();
         }
         gc_page_serializer_destroy(&serializer);
     }
     jl_atomic_fetch_add(&gc_n_threads_sweeping, -1);
 }

+// free all pages (i.e. through `madvise` on Linux) that were lazily freed
 void gc_free_pages(void)
 {
     while (1) {
@@ -1659,7 +1746,7 @@ static void gc_sweep_pool(void)

     // allocate enough space to hold the end of the free list chain
     // for every thread and pool size
-    jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t **));
+    jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) malloc_s(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t **));

     // update metadata of pages that were pointed to by freelist or newpages from a pool
     // i.e. pages being the current allocation target
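
A rough sense of scale for the buffer above (illustrative numbers, not taken from the patch): it holds one free-list tail pointer per (thread, pool) pair, so at high thread counts it can reach tens of kilobytes, which is presumably why it moves from `alloca` to `malloc_s` and is freed at the end of the sweep, as a later hunk shows.

#include <stdio.h>

int main(void)
{
    const size_t n_threads = 128;  // hypothetical thread count
    const size_t n_pools = 50;     // assumed order of magnitude for JL_GC_N_POOLS
    // one tail-of-free-list pointer slot per (thread, pool) pair
    printf("pfl scratch: %zu bytes\n", n_threads * n_pools * sizeof(void **));
    return 0;
}
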
@@ -1701,17 +1788,18 @@ static void gc_sweep_pool(void)
     }

     // the actual sweeping
-    jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
-    memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
-    jl_atomic_store(&gc_allocd_scratch, tmp);
-    gc_sweep_wake_all();
-    gc_sweep_pool_parallel();
+    jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) malloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
+    memset(new_gc_allocd_scratch, 0, n_threads * sizeof(jl_gc_padded_page_stack_t));
+    jl_ptls_t ptls = jl_current_task->ptls;
+    gc_sweep_wake_all(ptls, new_gc_allocd_scratch);
+    gc_sweep_pool_parallel(ptls);
     gc_sweep_wait_for_all();

+    // reset half-pages pointers
     for (int t_i = 0; t_i < n_threads; t_i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[t_i];
         if (ptls2 != NULL) {
-            ptls2->page_metadata_allocd = tmp[t_i];
+            ptls2->page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
             for (int i = 0; i < JL_GC_N_POOLS; i++) {
                 jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
                 p->newpages = NULL;
@@ -1749,6 +1837,10 @@ static void gc_sweep_pool(void)
         }
     }

+    // cleanup
+    free(pfl);
+    free(new_gc_allocd_scratch);
+
 #ifdef _P64 // only enable concurrent sweeping on 64bit
     // wake thread up to sweep concurrently
     if (jl_n_sweepthreads > 0) {