Skip to content

Commit 10db8ab

Browse files
feat(compactor HS): add a dashboard for monitoring deletion with horizontally scalable compactor (#18588)
1 parent db72d63 commit 10db8ab

File tree

3 files changed

+178
-1
lines changed

3 files changed

+178
-1
lines changed

production/loki-mixin/dashboards.libsonnet

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@
1212
(import 'dashboards/recording-rules.libsonnet') +
1313
(import 'dashboards/loki-bloom-build.libsonnet') +
1414
(import 'dashboards/loki-bloom-gateway.libsonnet') +
15-
(import 'dashboards/loki-object-store.libsonnet')
15+
(import 'dashboards/loki-object-store.libsonnet') +
16+
(import 'dashboards/loki-deletion-horizontally-scalable.libsonnet')

production/loki-mixin/dashboards/dashboard-utils.libsonnet

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
315315
fieldConfig: {
316316
defaults: {
317317
decimals: decimals,
318+
thresholds: {
319+
mode: 'absolute',
320+
steps: thresholds,
321+
},
318322
noValue: novalue,
319323
unit: unit,
320324
},
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
local vendor_config = import 'github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet';
2+
local vendor_utils = import 'github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet';
3+
local g = import 'grafana-builder/grafana.libsonnet';
4+
local grafana = import 'grafonnet/grafana.libsonnet';
5+
6+
{
7+
// Label selector for compactor-worker pods; passed to the CPU and working-set memory
// panels of the 'Worker Resource Usage' row.
// With meta-monitoring enabled it is a regex on the configured per-instance label,
// otherwise it matches on the container name.
local worker_pod_matcher = if $._config.meta_monitoring.enabled
8+
then $._config.per_instance_label + '=~".*compactor-worker.*"'
9+
else 'container="compactor-worker"',
10+
// Job-name matcher for compactor workers; passed to $.goHeapInUsePanel for the worker
// go-heap panel and to $.jobMatcher for the 'Worker Logs' panel.
// The meta-monitoring pattern contains no %s placeholder, so it must not go through the
// `%` format operator: std.format raises "Too many values to format" on surplus arguments,
// which would break rendering of the whole dashboard when meta-monitoring is enabled.
local worker_job_matcher = if $._config.meta_monitoring.enabled
  then '".*compactor-worker"'
  else 'compactor-worker',
13+
14+
// Label selector for compactor pods; passed to the CPU and working-set memory panels of the
// collapsed 'Compactor Resource Usage' row.
// With meta-monitoring enabled the regex also matches '<prefix>-backend' and
// 'loki-single-binary' pods, where <prefix> is $._config.ssd.pod_prefix_matcher.
// Note: `%` binds tighter than `+`, so only the string literal is formatted.
local compactor_pod_matcher = if $._config.meta_monitoring.enabled
15+
then $._config.per_instance_label + '=~"(.*compactor.*|%s-backend.*|loki-single-binary)"' % $._config.ssd.pod_prefix_matcher
16+
else 'container="compactor"',
17+
// Job-name matcher for the compactor; passed to $.goHeapInUsePanel for the compactor go-heap
// panel and to $.jobMatcher for the 'Compactor Logs' panel.
// The meta-monitoring variant is a pre-quoted regex that also covers '<prefix>-backend' and
// 'loki-single-binary' jobs, where <prefix> is $._config.ssd.pod_prefix_matcher.
local compactor_job_matcher = if $._config.meta_monitoring.enabled
18+
then '"(.*compactor|%s-backend.*|loki-single-binary)"' % $._config.ssd.pod_prefix_matcher
19+
else 'compactor',
20+
21+
// Feature flag for this dashboard. It defaults to false; grafanaDashboards below only
// contributes 'loki-deletion-horizontally-scalable.json' when the flag is true.
_config+:: {
22+
horizontally_scalable_compactor_enabled: false,
23+
},
24+
grafanaDashboards+: if !$._config.horizontally_scalable_compactor_enabled then {} else {
25+
local dashboard = (
26+
vendor_utils {
27+
_config:: vendor_config._config + $._config {
28+
product: 'Loki',
29+
dashboard_prefix: 'Loki / ',
30+
tags: ['loki'],
31+
},
32+
}
33+
),
34+
'loki-deletion-horizontally-scalable.json':
35+
// The dashboard() function automatically adds the "Loki / " prefix to the dashboard title.
36+
// This logic is inherited from mimir-mixin.
37+
dashboard.dashboard('Deletion(Horizontally Scalable)')
38+
// We can't make use of simplified template selectors from the loki dashboard utils until we port the cortex dashboard utils panel/grid functionality.
39+
.addTemplate('cluster', 'loki_build_info', $._config.per_cluster_label)
40+
.addTemplate('namespace', 'loki_build_info{' + $._config.per_cluster_label + '=~"$cluster"}', 'namespace')
41+
+ {
42+
// This dashboard uses the new grid system in order to place panels (using gridPos).
43+
// Because of this we can't use the mixin's addRow() and addPanel().
44+
schemaVersion: 27,
45+
rows: null,
46+
// ugly hack, copy pasta the tag/link
47+
// code from the loki-mixin
48+
tags: $._config.tags,
49+
links: [
50+
{
51+
asDropdown: true,
52+
icon: 'external link',
53+
includeVars: true,
54+
keepTime: true,
55+
tags: $._config.tags,
56+
targetBlank: false,
57+
title: 'Loki Dashboards',
58+
type: 'dashboards',
59+
},
60+
],
61+
panels: [
62+
{ type: 'row', title: 'Headline' },
63+
dashboard.panel('Num Worker Replicas') +
64+
dashboard.newStatPanel('count(loki_build_info{%s})' % $.jobMatcher('compactor-worker'), instant=true, unit='short', decimals=null, thresholds=[
65+
{ color: 'red', value: null },
66+
{ color: 'green', value: 1 },
67+
]) +
68+
{ gridPos: { h: 3, w: 6, x: 0, y: 1 } },
69+
70+
dashboard.panel('Connected Workers') +
71+
$.newStatPanel('count(loki_compactor_worker_connected_to_compactor{%s} == 1) / count(loki_build_info{%s})' % [$.jobMatcher('compactor-worker'), $.jobMatcher('compactor-worker')], instant=true, unit='percentunit', decimals=null, thresholds=[
72+
{ color: 'red', value: null },
73+
{ color: 'green', value: 1 },
74+
]) +
75+
{ gridPos: { h: 3, w: 6, x: 6, y: 1 } },
76+
77+
dashboard.panel('Num Manifests left to process') +
78+
dashboard.newStatPanel('loki_compactor_job_builder_num_manifests_left_to_process{%s}' % $.jobMatcher('compactor'), unit='short', decimals=null, thresholds=[
79+
{ color: 'green', value: null },
80+
]) +
81+
{ gridPos: { h: 3, w: 6, x: 12, y: 1 } },
82+
83+
dashboard.panel('Num Segments left to process') +
84+
dashboard.newStatPanel('loki_compactor_job_builder_num_segments_left_to_process{%s}' % $.jobMatcher('compactor'), unit='short', decimals=null, thresholds=[
85+
{ color: 'green', value: null },
86+
]) +
87+
{ gridPos: { h: 3, w: 6, x: 18, y: 1 } },
88+
89+
{
90+
type: 'row',
91+
title: 'Pending Delete Requests',
92+
collapsed: true,
93+
gridPos: { h: 1, w: 24, x: 0, y: 4 },
94+
panels: [
95+
dashboard.panel('Number of Pending Requests') +
96+
dashboard.newStatPanel('sum(loki_compactor_pending_delete_requests_count{%s})' % $.namespaceMatcher(), unit='short', decimals=null) +
97+
{ gridPos: { h: 4, w: 12, x: 0, y: 5 } },
98+
99+
dashboard.panel('Oldest Pending Request Age') +
100+
dashboard.newStatPanel('max(loki_compactor_oldest_pending_delete_request_age_seconds{%s})' % $.namespaceMatcher(), unit='dtdurations', decimals=null) +
101+
{ gridPos: { h: 4, w: 12, x: 12, y: 5 } },
102+
],
103+
},
104+
105+
{ type: 'row', title: 'Worker Resource Usage', gridPos: { h: 1, w: 24, x: 0, y: 5 } },
106+
$.CPUUsagePanel('CPU', worker_pod_matcher) + { gridPos: { h: 7, w: 8, x: 0, y: 6 } },
107+
$.memoryWorkingSetPanel('Memory (workingset)', worker_pod_matcher) + { gridPos: { h: 7, w: 8, x: 8, y: 6 } },
108+
$.goHeapInUsePanel('Memory (go heap inuse)', worker_job_matcher) + { gridPos: { h: 7, w: 8, x: 16, y: 6 } },
109+
110+
{
111+
type: 'row',
112+
title: 'Compactor Resource Usage',
113+
collapsed: true,
114+
gridPos: { h: 1, w: 24, x: 0, y: 13 },
115+
panels: [
116+
$.CPUUsagePanel('CPU', compactor_pod_matcher) + { gridPos: { h: 7, w: 8, x: 0, y: 14 } },
117+
$.memoryWorkingSetPanel('Memory (workingset)', compactor_pod_matcher) + { gridPos: { h: 7, w: 8, x: 8, y: 14 } },
118+
$.goHeapInUsePanel('Memory (go heap inuse)', compactor_job_matcher) + { gridPos: { h: 7, w: 8, x: 16, y: 14 } },
119+
],
120+
},
121+
122+
{ type: 'row', title: 'Manifest building', gridPos: { h: 1, w: 24, x: 0, y: 14 } },
123+
dashboard.panel('Manifest build attempts') +
124+
dashboard.queryPanel(['sum by (status) (increase(loki_compactor_manifest_build_attempts_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], ['{{status}}']) +
125+
{ gridPos: { h: 7, w: 12, x: 0, y: 15 } },
126+
dashboard.panel('Chunks added to manifest for processing') +
127+
dashboard.queryPanel(['sum by (status) (increase(loki_compactor_manifest_chunks_selected_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], 'chunks count') +
128+
{ gridPos: { h: 7, w: 12, x: 12, y: 15 } },
129+
130+
{ type: 'row', title: 'Jobs', gridPos: { h: 1, w: 24, x: 0, y: 22 } },
131+
dashboard.panel('Rate of jobs sent to worker for processing') +
132+
dashboard.queryPanel(['sum(rate(loki_compactor_jobs_queued_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], 'job creation rate') +
133+
{ gridPos: { h: 7, w: 12, x: 0, y: 23 } },
134+
dashboard.panel('Rate of job processed by status') +
135+
dashboard.queryPanel(['sum by (status) (rate(loki_compactor_worker_jobs_processed_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor-worker')], ['{{status}}']) +
136+
{ gridPos: { h: 7, w: 12, x: 12, y: 23 } },
137+
dashboard.panel('Rate of job retries by reason') +
138+
dashboard.queryPanel(['sum by (reason) (rate(loki_compactor_job_retries_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], ['{{reason}}']) +
139+
{ gridPos: { h: 7, w: 12, x: 0, y: 30 } },
140+
dashboard.panel('Rate of dropped jobs due to running out of max attempts') +
141+
dashboard.queryPanel(['sum(rate(loki_compactor_jobs_dropped_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], 'jobs drop rate') +
142+
{ gridPos: { h: 7, w: 12, x: 12, y: 30 } },
143+
// Job processing latency as two quantiles of the processing-duration histogram.
// The legend list is parallel to the query list, so the 0.99 quantile series is
// labelled 'p99' (it was previously mislabelled 'p95').
// The panel spans the full row width: the slot to its right used to hold a verbatim
// second copy of the 'Rate of dropped jobs due to running out of max attempts' panel,
// which is already shown at y: 30.
dashboard.panel('Latency in processing of jobs') +
dashboard.queryPanel([
  'histogram_quantile(0.99, sum by(le) (rate(loki_compactor_jobs_processing_duration_seconds_bucket{%s}[$__rate_interval])))' % $.jobMatcher('compactor'),
  'histogram_quantile(0.50, sum by(le) (rate(loki_compactor_jobs_processing_duration_seconds_bucket{%s}[$__rate_interval])))' % $.jobMatcher('compactor'),
], ['p99', 'p50']) +
{ gridPos: { h: 7, w: 24, x: 0, y: 37 } },
152+
dashboard.panel('Rate of lines deleted / sec') +
153+
dashboard.queryPanel(['sum(rate(loki_compactor_deletion_job_runner_deleted_lines_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor-worker')], 'log lines deletion rate') +
154+
{ gridPos: { h: 7, w: 24, x: 0, y: 44 } },
155+
156+
{ type: 'row', title: 'Manifest Processing', gridPos: { h: 1, w: 24, x: 0, y: 51 } },
157+
dashboard.panel('Rate in failures at various manifest processing stages') +
158+
dashboard.queryPanel(['sum by (stage) (rate(loki_compactor_process_manifest_failures_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], ['{{stage}}']) +
159+
{ gridPos: { h: 7, w: 12, x: 0, y: 52 } },
160+
dashboard.panel('Storage updates applied') +
161+
dashboard.queryPanel(['sum by (type) (rate(loki_compactor_deletion_storage_updates_applied_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], ['{{type}}']) +
162+
{ gridPos: { h: 7, w: 12, x: 12, y: 52 } },
163+
164+
{ type: 'row', title: 'Logs', gridPos: { h: 1, w: 24, x: 0, y: 59 } },
165+
$.logPanel('Worker Logs', '{%s}' % $.jobMatcher(worker_job_matcher)) +
166+
{ gridPos: { h: 7, w: 24, x: 0, y: 60 } },
167+
$.logPanel('Compactor Logs', '{%s} != "OpenTelemetry" != "no marks file" != "compact" != "count=" != "ingester" != "skipping upload" != "retention" != "memberlist" != "uploading delete requests db"' % $.jobMatcher(compactor_job_matcher)) +
168+
{ gridPos: { h: 7, w: 24, x: 0, y: 67 } },
169+
],
170+
},
171+
},
172+
}

0 commit comments

Comments
 (0)