Skip to content

Commit 94bc2f3

Browse files
Merge pull request #49679 from JuliaLang/pc/timing-events
2 parents aea56a9 + e370e4e commit 94bc2f3

File tree

2 files changed

+102
-64
lines changed

2 files changed

+102
-64
lines changed

src/timing.c

Lines changed: 72 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66
#include "options.h"
77
#include "stdio.h"
88

9+
#if defined(USE_TRACY) || defined(USE_ITTAPI)
10+
#define DISABLE_FREQUENT_EVENTS
11+
#endif
12+
913
jl_module_t *jl_module_root(jl_module_t *m);
1014

1115
#ifdef __cplusplus
@@ -19,72 +23,77 @@ extern "C" {
1923
#endif
2024

2125
static uint64_t t0;
22-
#if defined(USE_TRACY) || defined(USE_ITTAPI)
23-
/**
24-
* These sources often generate millions of events / minute. Although Tracy
25-
* can generally keep up with that, those events also bloat the saved ".tracy"
26-
* files, so we disable them by default.
27-
**/
28-
JL_DLLEXPORT uint64_t jl_timing_enable_mask = ~((1ull << JL_TIMING_ROOT) |
29-
(1ull << JL_TIMING_TYPE_CACHE_LOOKUP) |
30-
(1ull << JL_TIMING_METHOD_MATCH) |
31-
(1ull << JL_TIMING_METHOD_LOOKUP_FAST) |
32-
(1ull << JL_TIMING_AST_COMPRESS) |
33-
(1ull << JL_TIMING_AST_UNCOMPRESS));
34-
#else
35-
JL_DLLEXPORT uint64_t jl_timing_enable_mask = ~0ull;
36-
#endif
3726

38-
JL_DLLEXPORT uint64_t jl_timing_counts[(int)JL_TIMING_LAST] = {0};
27+
JL_DLLEXPORT _Atomic(uint64_t) jl_timing_disable_mask[(JL_TIMING_LAST + sizeof(uint64_t) * CHAR_BIT - 1) / (sizeof(uint64_t) * CHAR_BIT)];
28+
29+
JL_DLLEXPORT _Atomic(uint64_t) jl_timing_self_counts[(int)JL_TIMING_EVENT_LAST];
30+
JL_DLLEXPORT _Atomic(uint64_t) jl_timing_full_counts[(int)JL_TIMING_EVENT_LAST];
3931

4032
// Used to as an item limit when several strings of metadata can
4133
// potentially be associated with a single timing zone.
4234
JL_DLLEXPORT uint32_t jl_timing_print_limit = 10;
4335

44-
const char *jl_timing_names[(int)JL_TIMING_LAST] =
36+
static const char *jl_timing_names[(int)JL_TIMING_EVENT_LAST] =
4537
{
4638
#define X(name) #name,
47-
JL_TIMING_SUBSYSTEMS
39+
JL_TIMING_EVENTS
4840
#undef X
4941
};
5042

43+
static int jl_timing_names_sorted[(int)JL_TIMING_EVENT_LAST];
44+
5145
JL_DLLEXPORT jl_timing_counter_t jl_timing_counters[JL_TIMING_COUNTER_LAST];
5246

5347
void jl_print_timings(void)
5448
{
5549
#ifdef USE_TIMING_COUNTS
5650
uint64_t total_time = cycleclock() - t0;
5751
uint64_t root_time = total_time;
58-
for (int i = 0; i < JL_TIMING_LAST; i++) {
59-
root_time -= jl_timing_counts[i];
52+
for (int i = 0; i < JL_TIMING_EVENT_LAST; i++) {
53+
root_time -= jl_atomic_load_relaxed(jl_timing_self_counts + i);
6054
}
61-
jl_timing_counts[0] = root_time;
55+
jl_atomic_store_relaxed(jl_timing_self_counts + JL_TIMING_ROOT, root_time);
56+
jl_atomic_store_relaxed(jl_timing_full_counts + JL_TIMING_ROOT, total_time);
6257
fprintf(stderr, "\nJULIA TIMINGS\n");
63-
for (int i = 0; i < JL_TIMING_LAST; i++) {
64-
if (jl_timing_counts[i] != 0)
65-
fprintf(stderr, "%-25s : %5.2f %% %" PRIu64 "\n", jl_timing_names[i],
66-
100 * (((double)jl_timing_counts[i]) / total_time), jl_timing_counts[i]);
58+
fprintf(stderr, "%-25s, %-30s, %-30s\n", "Event", "Self Cycles (% of Total)", "Total Cycles (% of Total)");
59+
for (int i = 0; i < JL_TIMING_EVENT_LAST; i++) {
60+
int j = jl_timing_names_sorted[i];
61+
uint64_t self = jl_atomic_load_relaxed(jl_timing_self_counts + j);
62+
uint64_t total = jl_atomic_load_relaxed(jl_timing_full_counts + j);
63+
if (total != 0)
64+
fprintf(stderr, "%-25s, %20" PRIu64 " (%5.2f %%), %20" PRIu64 " (%5.2f %%)\n", jl_timing_names[j], self, 100 * (((double)self) / total_time), total, 100 * (((double)total) / total_time));
6765
}
6866

6967
fprintf(stderr, "\nJULIA COUNTERS\n");
68+
fprintf(stderr, "%-25s, %-20s\n", "Counter", "Value");
7069
#define X(name) do { \
7170
int64_t val = (int64_t) jl_atomic_load_relaxed(&jl_timing_counters[(int)JL_TIMING_COUNTER_##name].basic_counter); \
7271
if (val != 0) \
73-
fprintf(stderr, "%-25s : %" PRIi64 "\n", #name, val); \
72+
fprintf(stderr, "%-25s, %20" PRIi64 "\n", #name, val); \
7473
} while (0);
7574

7675
JL_TIMING_COUNTERS
7776
#undef X
7877
#endif
7978
}
8079

80+
int cmp_names(const void *a, const void *b) {
81+
int ia = *(const int*)a;
82+
int ib = *(const int*)b;
83+
return strcmp(jl_timing_names[ia], jl_timing_names[ib]);
84+
}
85+
8186
void jl_init_timing(void)
8287
{
8388
t0 = cycleclock();
8489

85-
_Static_assert(JL_TIMING_EVENT_LAST < sizeof(uint64_t) * CHAR_BIT, "Too many timing events!");
8690
_Static_assert((int)JL_TIMING_LAST <= (int)JL_TIMING_EVENT_LAST, "More owners than events!");
8791

92+
for (int i = 0; i < JL_TIMING_EVENT_LAST; i++) {
93+
jl_timing_names_sorted[i] = i;
94+
}
95+
qsort(jl_timing_names_sorted, JL_TIMING_EVENT_LAST, sizeof(int), cmp_names);
96+
8897
int i __attribute__((unused)) = 0;
8998
#ifdef USE_ITTAPI
9099
i = 0;
@@ -106,6 +115,22 @@ void jl_init_timing(void)
106115
TracyCPlotConfig(jl_timing_counters[JL_TIMING_COUNTER_JITDataSize].tracy_counter.name, TracyPlotFormatMemory, /* rectilinear */ 0, /* fill */ 1, /* color */ 0);
107116
TracyCPlotConfig(jl_timing_counters[JL_TIMING_COUNTER_ImageSize].tracy_counter.name, TracyPlotFormatMemory, /* rectilinear */ 0, /* fill */ 1, /* color */ 0);
108117
#endif
118+
119+
/**
120+
* These sources often generate millions of events / minute. Although Tracy
121+
* can generally keep up with that, those events also bloat the saved ".tracy"
122+
* files, so we disable them by default.
123+
**/
124+
#ifdef DISABLE_FREQUENT_EVENTS
125+
#define DISABLE_SUBSYSTEM(subsystem) jl_atomic_fetch_or_relaxed(jl_timing_disable_mask + (JL_TIMING_##subsystem / (sizeof(uint64_t) * CHAR_BIT)), 1 << (JL_TIMING_##subsystem % (sizeof(uint64_t) * CHAR_BIT)))
126+
DISABLE_SUBSYSTEM(ROOT);
127+
DISABLE_SUBSYSTEM(TYPE_CACHE_LOOKUP);
128+
DISABLE_SUBSYSTEM(METHOD_MATCH);
129+
DISABLE_SUBSYSTEM(METHOD_LOOKUP_FAST);
130+
DISABLE_SUBSYSTEM(AST_COMPRESS);
131+
DISABLE_SUBSYSTEM(AST_UNCOMPRESS);
132+
#endif
133+
109134
}
110135

111136
void jl_destroy_timing(void)
@@ -131,7 +156,7 @@ void jl_timing_block_enter_task(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t
131156

132157
ptls->timing_stack = prev_blk;
133158
if (prev_blk != NULL) {
134-
_COUNTS_START(&prev_blk->counts_ctx, cycleclock());
159+
_COUNTS_RESUME(&prev_blk->counts_ctx, cycleclock());
135160
}
136161
}
137162

@@ -164,7 +189,7 @@ jl_timing_block_t *jl_timing_block_exit_task(jl_task_t *ct, jl_ptls_t ptls)
164189
ptls->timing_stack = NULL;
165190

166191
if (blk != NULL) {
167-
_COUNTS_STOP(&blk->counts_ctx, cycleclock());
192+
_COUNTS_PAUSE(&blk->counts_ctx, cycleclock());
168193
}
169194
return blk;
170195
}
@@ -330,20 +355,27 @@ void jl_timing_init_task(jl_task_t *t)
330355
#endif
331356
}
332357

358+
int cmp_name_idx(const void *name, const void *idx) {
359+
return strcmp((const char *)name, jl_timing_names[*(const int *)idx]);
360+
}
361+
333362
JL_DLLEXPORT int jl_timing_set_enable(const char *subsystem, uint8_t enabled)
334363
{
335-
for (int i = 0; i < JL_TIMING_LAST; i++) {
336-
if (strcmp(subsystem, jl_timing_names[i]) == 0) {
337-
uint64_t subsystem_bit = (1ul << i);
338-
if (enabled) {
339-
jl_timing_enable_mask |= subsystem_bit;
340-
} else {
341-
jl_timing_enable_mask &= ~subsystem_bit;
342-
}
343-
return 0;
344-
}
364+
const int *idx = (const int *)bsearch(subsystem, jl_timing_names_sorted, JL_TIMING_EVENT_LAST, sizeof(int), cmp_name_idx);
365+
if (idx == NULL)
366+
return -1;
367+
int i = *idx;
368+
// sorted names include events, so skip if we're looking at an event instead of a subsystem
369+
// events are always at least JL_TIMING_LAST
370+
if (i >= JL_TIMING_LAST)
371+
return -1;
372+
uint64_t subsystem_bit = 1ul << (i % (sizeof(uint64_t) * CHAR_BIT));
373+
if (enabled) {
374+
jl_atomic_fetch_and_relaxed(jl_timing_disable_mask + (i / (sizeof(uint64_t) * CHAR_BIT)), ~subsystem_bit);
375+
} else {
376+
jl_atomic_fetch_or_relaxed(jl_timing_disable_mask + (i / (sizeof(uint64_t) * CHAR_BIT)), subsystem_bit);
345377
}
346-
return -1;
378+
return 0;
347379
}
348380

349381
static void jl_timing_set_enable_from_env(void)

src/timing.h

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -233,16 +233,16 @@ enum jl_timing_counter_types {
233233

234234
#ifdef USE_TIMING_COUNTS
235235
#define _COUNTS_CTX_MEMBER jl_timing_counts_t counts_ctx;
236-
#define _COUNTS_CTOR(block) _jl_timing_counts_ctor(block)
237-
#define _COUNTS_DESTROY(block, subsystem) _jl_timing_counts_destroy(block, subsystem)
238236
#define _COUNTS_START(block, t) _jl_timing_counts_start(block, t)
239-
#define _COUNTS_STOP(block, t) _jl_timing_counts_stop(block, t)
237+
#define _COUNTS_STOP(block, event, t) _jl_timing_counts_stop(block, event, t)
238+
#define _COUNTS_PAUSE(block, t) _jl_timing_counts_pause(block, t)
239+
#define _COUNTS_RESUME(block, t) _jl_timing_counts_resume(block, t)
240240
#else
241241
#define _COUNTS_CTX_MEMBER
242-
#define _COUNTS_CTOR(block)
243-
#define _COUNTS_DESTROY(block, subsystem)
244242
#define _COUNTS_START(block, t)
245-
#define _COUNTS_STOP(block, t)
243+
#define _COUNTS_STOP(block, event, t)
244+
#define _COUNTS_PAUSE(block, t)
245+
#define _COUNTS_RESUME(block, t)
246246
#endif
247247

248248
/**
@@ -286,48 +286,57 @@ enum jl_timing_counter_types {
286286
* Implementation: Aggregated counts back-end
287287
**/
288288

289-
extern JL_DLLEXPORT uint64_t jl_timing_counts[(int)JL_TIMING_LAST];
289+
extern JL_DLLEXPORT _Atomic(uint64_t) jl_timing_self_counts[(int)JL_TIMING_EVENT_LAST];
290+
extern JL_DLLEXPORT _Atomic(uint64_t) jl_timing_full_counts[(int)JL_TIMING_EVENT_LAST];
290291
typedef struct _jl_timing_counts_t {
291292
uint64_t total;
293+
uint64_t start;
292294
uint64_t t0;
293295
#ifdef JL_DEBUG_BUILD
294296
uint8_t running;
295297
#endif
296298
} jl_timing_counts_t;
297299

298-
STATIC_INLINE void _jl_timing_counts_stop(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT {
300+
STATIC_INLINE void _jl_timing_counts_pause(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT {
299301
#ifdef JL_DEBUG_BUILD
300302
assert(block->running);
301303
block->running = 0;
302304
#endif
303-
block->total += t - block->t0;
305+
block->total += t - block->start;
304306
}
305307

306-
STATIC_INLINE void _jl_timing_counts_start(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT {
308+
STATIC_INLINE void _jl_timing_counts_resume(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT {
307309
#ifdef JL_DEBUG_BUILD
308310
assert(!block->running);
309311
block->running = 1;
310312
#endif
311-
block->t0 = t;
313+
block->start = t;
312314
}
313315

314-
STATIC_INLINE void _jl_timing_counts_ctor(jl_timing_counts_t *block) JL_NOTSAFEPOINT {
316+
STATIC_INLINE void _jl_timing_counts_start(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT {
315317
block->total = 0;
318+
block->start = t;
319+
block->t0 = t;
316320
#ifdef JL_DEBUG_BUILD
317-
block->running = 0;
321+
block->running = 1;
318322
#endif
319323
}
320324

321-
STATIC_INLINE void _jl_timing_counts_destroy(jl_timing_counts_t *block, int subsystem) JL_NOTSAFEPOINT {
322-
jl_timing_counts[subsystem] += block->total;
325+
STATIC_INLINE void _jl_timing_counts_stop(jl_timing_counts_t *block, int event, uint64_t t) JL_NOTSAFEPOINT {
326+
#ifdef JL_DEBUG_BUILD
327+
assert(block->running);
328+
block->running = 0;
329+
#endif
330+
block->total += t - block->start;
331+
jl_atomic_fetch_add_relaxed(jl_timing_self_counts + event, block->total);
332+
jl_atomic_fetch_add_relaxed(jl_timing_full_counts + event, t - block->t0);
323333
}
324334

325335
/**
326336
* Top-level jl_timing implementation
327337
**/
328338

329-
extern JL_DLLEXPORT uint64_t jl_timing_enable_mask;
330-
extern const char *jl_timing_names[(int)JL_TIMING_LAST];
339+
extern JL_DLLEXPORT _Atomic(uint64_t) jl_timing_disable_mask[(JL_TIMING_LAST + sizeof(uint64_t) * CHAR_BIT - 1) / (sizeof(uint64_t) * CHAR_BIT)];
331340

332341
struct _jl_timing_block_t { // typedef in julia.h
333342
struct _jl_timing_block_t *prev;
@@ -342,7 +351,7 @@ struct _jl_timing_block_t { // typedef in julia.h
342351
};
343352

344353
STATIC_INLINE int _jl_timing_enabled(int subsystem) JL_NOTSAFEPOINT {
345-
return (jl_timing_enable_mask & (1 << subsystem)) != 0;
354+
return (jl_atomic_load_relaxed(jl_timing_disable_mask + subsystem / (sizeof(uint64_t) * CHAR_BIT)) & (1 << (subsystem % (sizeof(uint64_t) * CHAR_BIT)))) == 0;
346355
}
347356

348357
STATIC_INLINE void jl_timing_block_start(jl_timing_block_t *block) {
@@ -358,7 +367,7 @@ STATIC_INLINE void jl_timing_block_start(jl_timing_block_t *block) {
358367
block->prev = *prevp;
359368
block->is_running = 1;
360369
if (block->prev) {
361-
_COUNTS_STOP(&block->prev->counts_ctx, t);
370+
_COUNTS_PAUSE(&block->prev->counts_ctx, t);
362371
}
363372
*prevp = block;
364373
}
@@ -367,26 +376,23 @@ STATIC_INLINE void _jl_timing_block_ctor(jl_timing_block_t *block, int subsystem
367376
block->subsystem = subsystem;
368377
block->event = event;
369378
block->is_running = 0;
370-
_COUNTS_CTOR(&block->counts_ctx);
371379
}
372380

373381
STATIC_INLINE void _jl_timing_block_destroy(jl_timing_block_t *block) JL_NOTSAFEPOINT {
374382
if (block->is_running) {
375383
uint64_t t = cycleclock(); (void)t;
376384
_ITTAPI_STOP(block);
377-
_COUNTS_STOP(&block->counts_ctx, t);
378385
_TRACY_STOP(block->tracy_ctx);
386+
_COUNTS_STOP(&block->counts_ctx, block->event, cycleclock());
379387

380388
jl_task_t *ct = jl_current_task;
381389
jl_timing_block_t **pcur = &ct->ptls->timing_stack;
382390
assert(*pcur == block);
383391
*pcur = block->prev;
384392
if (block->prev) {
385-
_COUNTS_START(&block->prev->counts_ctx, t);
393+
_COUNTS_RESUME(&block->prev->counts_ctx, t);
386394
}
387395
}
388-
389-
_COUNTS_DESTROY(&block->counts_ctx, block->subsystem);
390396
}
391397

392398
typedef struct _jl_timing_suspend_t {

0 commit comments

Comments
 (0)