66#include "options.h"
77#include "stdio.h"
88
9+ #if defined(USE_TRACY ) || defined(USE_ITTAPI )
10+ #define DISABLE_FREQUENT_EVENTS
11+ #endif
12+
913jl_module_t * jl_module_root (jl_module_t * m );
1014
1115#ifdef __cplusplus
@@ -19,72 +23,77 @@ extern "C" {
1923#endif
2024
// Cycle-counter reading taken in jl_init_timing(); jl_print_timings() reports
// the cycles elapsed since this value.
2125static uint64_t t0 ;
22- #if defined(USE_TRACY ) || defined(USE_ITTAPI )
23- /**
24- * These sources often generate millions of events / minute. Although Tracy
25- * can generally keep up with that, those events also bloat the saved ".tracy"
26- * files, so we disable them by default.
27- **/
28- JL_DLLEXPORT uint64_t jl_timing_enable_mask = ~((1ull << JL_TIMING_ROOT ) |
29- (1ull << JL_TIMING_TYPE_CACHE_LOOKUP ) |
30- (1ull << JL_TIMING_METHOD_MATCH ) |
31- (1ull << JL_TIMING_METHOD_LOOKUP_FAST ) |
32- (1ull << JL_TIMING_AST_COMPRESS ) |
33- (1ull << JL_TIMING_AST_UNCOMPRESS ));
34- #else
35- JL_DLLEXPORT uint64_t jl_timing_enable_mask = ~0ull ;
36- #endif
3726
38- JL_DLLEXPORT uint64_t jl_timing_counts [(int )JL_TIMING_LAST ] = {0 };
// Bitmask of disabled subsystems: one bit per subsystem index, packed into as
// many uint64_t words as JL_TIMING_LAST bits require (rounded up). A set bit
// marks the subsystem as disabled.
27+ JL_DLLEXPORT _Atomic(uint64_t ) jl_timing_disable_mask [(JL_TIMING_LAST + sizeof (uint64_t ) * CHAR_BIT - 1 ) / (sizeof (uint64_t ) * CHAR_BIT )];
28+
// Per-event cycle accumulators, printed by jl_print_timings() in the
// "Self Cycles" and "Total Cycles" columns respectively.
29+ JL_DLLEXPORT _Atomic(uint64_t ) jl_timing_self_counts [(int )JL_TIMING_EVENT_LAST ];
30+ JL_DLLEXPORT _Atomic(uint64_t ) jl_timing_full_counts [(int )JL_TIMING_EVENT_LAST ];
3931
4032// Used as an item limit when several strings of metadata can
4133// potentially be associated with a single timing zone.
4234JL_DLLEXPORT uint32_t jl_timing_print_limit = 10 ;
4335
44- const char * jl_timing_names [(int )JL_TIMING_LAST ] =
// Event names, produced by stringizing every entry of the JL_TIMING_EVENTS
// X-macro list; indexed by event number.
36+ static const char * jl_timing_names [(int )JL_TIMING_EVENT_LAST ] =
4537 {
4638#define X (name ) #name ,
47- JL_TIMING_SUBSYSTEMS
39+ JL_TIMING_EVENTS
4840#undef X
4941 };
5042
// Indices into jl_timing_names, sorted by name in jl_init_timing() so that
// names can be printed in order and looked up with bsearch().
43+ static int jl_timing_names_sorted [(int )JL_TIMING_EVENT_LAST ];
44+
// Counter storage, indexed by JL_TIMING_COUNTER_* values.
5145JL_DLLEXPORT jl_timing_counter_t jl_timing_counters [JL_TIMING_COUNTER_LAST ];
5246
/**
 * Print the accumulated timing statistics and counters to stderr.
 *
 * The body is compiled only when USE_TIMING_COUNTS is defined; otherwise the
 * function does nothing.
 *
 * Output consists of two comma-separated tables:
 *   - "JULIA TIMINGS": one row per event with a non-zero total, in the order
 *     given by jl_timing_names_sorted, showing self and total cycles together
 *     with their share of the cycles elapsed since t0.
 *   - "JULIA COUNTERS": one row per counter in JL_TIMING_COUNTERS whose value
 *     is non-zero.
 *
 * Side effect: the JL_TIMING_ROOT slots of jl_timing_self_counts and
 * jl_timing_full_counts are overwritten before printing.
 */
void jl_print_timings(void)
{
#ifdef USE_TIMING_COUNTS
    uint64_t total_time = cycleclock() - t0;

    // Whatever part of the elapsed time no event recorded as self time is
    // attributed to the root entry.
    uint64_t root_time = total_time;
    for (int i = 0; i < JL_TIMING_EVENT_LAST; i++) {
        root_time -= jl_atomic_load_relaxed(jl_timing_self_counts + i);
    }
    jl_atomic_store_relaxed(jl_timing_self_counts + JL_TIMING_ROOT, root_time);
    jl_atomic_store_relaxed(jl_timing_full_counts + JL_TIMING_ROOT, total_time);

    fprintf(stderr, "\nJULIA TIMINGS\n");
    fprintf(stderr, "%-25s, %-30s, %-30s\n", "Event", "Self Cycles (% of Total)", "Total Cycles (% of Total)");
    for (int i = 0; i < JL_TIMING_EVENT_LAST; i++) {
        int j = jl_timing_names_sorted[i];
        uint64_t self = jl_atomic_load_relaxed(jl_timing_self_counts + j);
        uint64_t total = jl_atomic_load_relaxed(jl_timing_full_counts + j);
        if (total != 0)
            fprintf(stderr, "%-25s, %20" PRIu64 " (%5.2f %%), %20" PRIu64 " (%5.2f %%)\n",
                    jl_timing_names[j],
                    self, 100 * (((double)self) / total_time),
                    total, 100 * (((double)total) / total_time));
    }

    fprintf(stderr, "\nJULIA COUNTERS\n");
    fprintf(stderr, "%-25s, %-20s\n", "Counter", "Value");
    // Note: the field width must be followed directly by the PRIi64 length
    // modifier/specifier; a space between them ("%20 " PRIi64) is not a valid
    // conversion specification.
#define X(name) do { \
        int64_t val = (int64_t) jl_atomic_load_relaxed(&jl_timing_counters[(int)JL_TIMING_COUNTER_##name].basic_counter); \
        if (val != 0) \
            fprintf(stderr, "%-25s, %20" PRIi64 "\n", #name, val); \
    } while (0);

    JL_TIMING_COUNTERS
#undef X
#endif
}
8079
80+ int cmp_names (const void * a , const void * b ) {
81+ int ia = * (const int * )a ;
82+ int ib = * (const int * )b ;
83+ return strcmp (jl_timing_names [ia ], jl_timing_names [ib ]);
84+ }
85+
8186void jl_init_timing (void )
8287{
8388 t0 = cycleclock ();
8489
85- _Static_assert (JL_TIMING_EVENT_LAST < sizeof (uint64_t ) * CHAR_BIT , "Too many timing events!" );
8690 _Static_assert ((int )JL_TIMING_LAST <= (int )JL_TIMING_EVENT_LAST , "More owners than events!" );
8791
92+ for (int i = 0 ; i < JL_TIMING_EVENT_LAST ; i ++ ) {
93+ jl_timing_names_sorted [i ] = i ;
94+ }
95+ qsort (jl_timing_names_sorted , JL_TIMING_EVENT_LAST , sizeof (int ), cmp_names );
96+
8897 int i __attribute__((unused )) = 0 ;
8998#ifdef USE_ITTAPI
9099 i = 0 ;
@@ -106,6 +115,22 @@ void jl_init_timing(void)
106115 TracyCPlotConfig (jl_timing_counters [JL_TIMING_COUNTER_JITDataSize ].tracy_counter .name , TracyPlotFormatMemory , /* rectilinear */ 0 , /* fill */ 1 , /* color */ 0 );
107116 TracyCPlotConfig (jl_timing_counters [JL_TIMING_COUNTER_ImageSize ].tracy_counter .name , TracyPlotFormatMemory , /* rectilinear */ 0 , /* fill */ 1 , /* color */ 0 );
108117#endif
118+
119+ /**
120+ * These sources often generate millions of events / minute. Although Tracy
121+ * can generally keep up with that, those events also bloat the saved ".tracy"
122+ * files, so we disable them by default.
123+ **/
#ifdef DISABLE_FREQUENT_EVENTS
    // DISABLE_SUBSYSTEM(S) sets the bit for JL_TIMING_S in
    // jl_timing_disable_mask: the word is selected by dividing the subsystem
    // index by the number of bits in a uint64_t, the bit by the remainder.
    // The shifted constant is widened to uint64_t first: a plain `1` is an
    // int, so shifting it by 31 or more positions would be undefined and could
    // never reach the upper half of a mask word.
#define DISABLE_SUBSYSTEM(subsystem) \
    jl_atomic_fetch_or_relaxed( \
        jl_timing_disable_mask + (JL_TIMING_##subsystem / (sizeof(uint64_t) * CHAR_BIT)), \
        (uint64_t)1 << (JL_TIMING_##subsystem % (sizeof(uint64_t) * CHAR_BIT)))
    DISABLE_SUBSYSTEM(ROOT);
    DISABLE_SUBSYSTEM(TYPE_CACHE_LOOKUP);
    DISABLE_SUBSYSTEM(METHOD_MATCH);
    DISABLE_SUBSYSTEM(METHOD_LOOKUP_FAST);
    DISABLE_SUBSYSTEM(AST_COMPRESS);
    DISABLE_SUBSYSTEM(AST_UNCOMPRESS);
#endif
133+
109134}
110135
111136void jl_destroy_timing (void )
@@ -131,7 +156,7 @@ void jl_timing_block_enter_task(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t
131156
132157 ptls -> timing_stack = prev_blk ;
133158 if (prev_blk != NULL ) {
134- _COUNTS_START (& prev_blk -> counts_ctx , cycleclock ());
159+ _COUNTS_RESUME (& prev_blk -> counts_ctx , cycleclock ());
135160 }
136161 }
137162
@@ -164,7 +189,7 @@ jl_timing_block_t *jl_timing_block_exit_task(jl_task_t *ct, jl_ptls_t ptls)
164189 ptls -> timing_stack = NULL ;
165190
166191 if (blk != NULL ) {
167- _COUNTS_STOP (& blk -> counts_ctx , cycleclock ());
192+ _COUNTS_PAUSE (& blk -> counts_ctx , cycleclock ());
168193 }
169194 return blk ;
170195}
@@ -330,20 +355,27 @@ void jl_timing_init_task(jl_task_t *t)
330355#endif
331356}
332357
358+ int cmp_name_idx (const void * name , const void * idx ) {
359+ return strcmp ((const char * )name , jl_timing_names [* (const int * )idx ]);
360+ }
361+
333362JL_DLLEXPORT int jl_timing_set_enable (const char * subsystem , uint8_t enabled )
334363{
335- for (int i = 0 ; i < JL_TIMING_LAST ; i ++ ) {
336- if (strcmp (subsystem , jl_timing_names [i ]) == 0 ) {
337- uint64_t subsystem_bit = (1ul << i );
338- if (enabled ) {
339- jl_timing_enable_mask |= subsystem_bit ;
340- } else {
341- jl_timing_enable_mask &= ~subsystem_bit ;
342- }
343- return 0 ;
344- }
364+ const int * idx = (const int * )bsearch (subsystem , jl_timing_names_sorted , JL_TIMING_EVENT_LAST , sizeof (int ), cmp_name_idx );
365+ if (idx == NULL )
366+ return -1 ;
367+ int i = * idx ;
368+ // sorted names include events, so skip if we're looking at an event instead of a subsystem
369+ // events are always at least JL_TIMING_LAST
370+ if (i >= JL_TIMING_LAST )
371+ return -1 ;
372+ uint64_t subsystem_bit = 1ul << (i % (sizeof (uint64_t ) * CHAR_BIT ));
373+ if (enabled ) {
374+ jl_atomic_fetch_and_relaxed (jl_timing_disable_mask + (i / (sizeof (uint64_t ) * CHAR_BIT )), ~subsystem_bit );
375+ } else {
376+ jl_atomic_fetch_or_relaxed (jl_timing_disable_mask + (i / (sizeof (uint64_t ) * CHAR_BIT )), subsystem_bit );
345377 }
346- return -1 ;
378+ return 0 ;
347379}
348380
349381static void jl_timing_set_enable_from_env (void )
0 commit comments