local vendor_config = import 'github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet';
local vendor_utils = import 'github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet';
local g = import 'grafana-builder/grafana.libsonnet';
local grafana = import 'grafonnet/grafana.libsonnet';

// Mixin fragment for the 'Deletion(Horizontally Scalable)' dashboard.
//
// It contributes one entry, 'loki-deletion-horizontally-scalable.json', to
// grafanaDashboards, and only when
// $._config.horizontally_scalable_compactor_enabled is true (it defaults to
// false below). Panels are placed explicitly with gridPos rather than through
// the mixin's addRow()/addPanel() helpers.
{
  // Label matchers selecting compactor-worker pods/jobs. With meta-monitoring
  // enabled the pod is matched by regex on the per-instance label, otherwise by
  // container name.
  local worker_pod_matcher = if $._config.meta_monitoring.enabled
  then $._config.per_instance_label + '=~".*compactor-worker.*"'
  else 'container="compactor-worker"',
  // The literal has no %s conversion, so it must not be passed through the
  // `%` operator: std.format fails with "Too many values to format" when given
  // a value it has no placeholder for.
  // NOTE(review): the meta-monitoring job matchers (this one and
  // compactor_job_matcher) carry embedded double quotes while the other branch
  // does not; confirm that is what $.jobMatcher / $.goHeapInUsePanel expect.
  local worker_job_matcher = if $._config.meta_monitoring.enabled
  then '".*compactor-worker"'
  else 'compactor-worker',

  // Label matchers selecting the compactor itself. With meta-monitoring enabled
  // they also match the "<pod_prefix_matcher>-backend" and "loki-single-binary"
  // name patterns.
  local compactor_pod_matcher = if $._config.meta_monitoring.enabled
  then $._config.per_instance_label + '=~"(.*compactor.*|%s-backend.*|loki-single-binary)"' % $._config.ssd.pod_prefix_matcher
  else 'container="compactor"',
  local compactor_job_matcher = if $._config.meta_monitoring.enabled
  then '"(.*compactor|%s-backend.*|loki-single-binary)"' % $._config.ssd.pod_prefix_matcher
  else 'compactor',

  _config+:: {
    // Opt-in switch for this dashboard.
    horizontally_scalable_compactor_enabled: false,
  },

  grafanaDashboards+: if !$._config.horizontally_scalable_compactor_enabled then {} else {
    // Vendored mimir-mixin dashboard utilities, configured with the mimir
    // defaults overlaid by this mixin's _config and Loki-specific branding.
    local dashboard = (
      vendor_utils {
        _config:: vendor_config._config + $._config {
          product: 'Loki',
          dashboard_prefix: 'Loki / ',
          tags: ['loki'],
        },
      }
    ),

    'loki-deletion-horizontally-scalable.json':
      // The dashboard() function automatically adds the "Loki / " prefix to the dashboard title.
      // This logic is inherited from mimir-mixin.
      dashboard.dashboard('Deletion(Horizontally Scalable)')
      // We can't make use of simplified template selectors from the loki dashboard utils
      // until we port the cortex dashboard utils panel/grid functionality.
      .addTemplate('cluster', 'loki_build_info', $._config.per_cluster_label)
      .addTemplate('namespace', 'loki_build_info{' + $._config.per_cluster_label + '=~"$cluster"}', 'namespace')
      + {
        // This dashboard uses the new grid system in order to place panels (using gridPos).
        // Because of this we can't use the mixin's addRow() and addPanel().
        schemaVersion: 27,
        rows: null,
        // Tags and the dashboards drop-down link are duplicated here from the
        // loki-mixin instead of being inherited.
        tags: $._config.tags,
        links: [
          {
            asDropdown: true,
            icon: 'external link',
            includeVars: true,
            keepTime: true,
            tags: $._config.tags,
            targetBlank: false,
            title: 'Loki Dashboards',
            type: 'dashboards',
          },
        ],
        panels: [
          // ---- Headline: worker fleet size/connectivity and remaining work ----
          // Positioned explicitly at y=0 like every other row in this dashboard.
          { type: 'row', title: 'Headline', gridPos: { h: 1, w: 24, x: 0, y: 0 } },
          dashboard.panel('Num Worker Replicas') +
          dashboard.newStatPanel('count(loki_build_info{%s})' % $.jobMatcher('compactor-worker'), instant=true, unit='short', decimals=null, thresholds=[
            { color: 'red', value: null },
            { color: 'green', value: 1 },
          ]) +
          { gridPos: { h: 3, w: 6, x: 0, y: 1 } },

          // Built with dashboard.newStatPanel like the other stat panels in this row.
          dashboard.panel('Connected Workers') +
          dashboard.newStatPanel('count(loki_compactor_worker_connected_to_compactor{%s} == 1) / count(loki_build_info{%s})' % [$.jobMatcher('compactor-worker'), $.jobMatcher('compactor-worker')], instant=true, unit='percentunit', decimals=null, thresholds=[
            { color: 'red', value: null },
            { color: 'green', value: 1 },
          ]) +
          { gridPos: { h: 3, w: 6, x: 6, y: 1 } },

          dashboard.panel('Num Manifests left to process') +
          dashboard.newStatPanel('loki_compactor_job_builder_num_manifests_left_to_process{%s}' % $.jobMatcher('compactor'), unit='short', decimals=null, thresholds=[
            { color: 'green', value: null },
          ]) +
          { gridPos: { h: 3, w: 6, x: 12, y: 1 } },

          dashboard.panel('Num Segments left to process') +
          dashboard.newStatPanel('loki_compactor_job_builder_num_segments_left_to_process{%s}' % $.jobMatcher('compactor'), unit='short', decimals=null, thresholds=[
            { color: 'green', value: null },
          ]) +
          { gridPos: { h: 3, w: 6, x: 18, y: 1 } },

          // ---- Pending delete requests (collapsed row; panels are nested) ----
          {
            type: 'row',
            title: 'Pending Delete Requests',
            collapsed: true,
            gridPos: { h: 1, w: 24, x: 0, y: 4 },
            panels: [
              dashboard.panel('Number of Pending Requests') +
              dashboard.newStatPanel('sum(loki_compactor_pending_delete_requests_count{%s})' % $.namespaceMatcher(), unit='short', decimals=null) +
              { gridPos: { h: 4, w: 12, x: 0, y: 5 } },

              dashboard.panel('Oldest Pending Request Age') +
              dashboard.newStatPanel('max(loki_compactor_oldest_pending_delete_request_age_seconds{%s})' % $.namespaceMatcher(), unit='dtdurations', decimals=null) +
              { gridPos: { h: 4, w: 12, x: 12, y: 5 } },
            ],
          },

          // ---- Worker resource usage ----
          { type: 'row', title: 'Worker Resource Usage', gridPos: { h: 1, w: 24, x: 0, y: 5 } },
          $.CPUUsagePanel('CPU', worker_pod_matcher) + { gridPos: { h: 7, w: 8, x: 0, y: 6 } },
          $.memoryWorkingSetPanel('Memory (workingset)', worker_pod_matcher) + { gridPos: { h: 7, w: 8, x: 8, y: 6 } },
          $.goHeapInUsePanel('Memory (go heap inuse)', worker_job_matcher) + { gridPos: { h: 7, w: 8, x: 16, y: 6 } },

          // ---- Compactor resource usage (collapsed row; panels are nested) ----
          {
            type: 'row',
            title: 'Compactor Resource Usage',
            collapsed: true,
            gridPos: { h: 1, w: 24, x: 0, y: 13 },
            panels: [
              $.CPUUsagePanel('CPU', compactor_pod_matcher) + { gridPos: { h: 7, w: 8, x: 0, y: 14 } },
              $.memoryWorkingSetPanel('Memory (workingset)', compactor_pod_matcher) + { gridPos: { h: 7, w: 8, x: 8, y: 14 } },
              $.goHeapInUsePanel('Memory (go heap inuse)', compactor_job_matcher) + { gridPos: { h: 7, w: 8, x: 16, y: 14 } },
            ],
          },

          // ---- Manifest building ----
          { type: 'row', title: 'Manifest building', gridPos: { h: 1, w: 24, x: 0, y: 14 } },
          dashboard.panel('Manifest build attempts') +
          dashboard.queryPanel(['sum by (status) (increase(loki_compactor_manifest_build_attempts_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], ['{{status}}']) +
          { gridPos: { h: 7, w: 12, x: 0, y: 15 } },
          dashboard.panel('Chunks added to manifest for processing') +
          dashboard.queryPanel(['sum by (status) (increase(loki_compactor_manifest_chunks_selected_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], 'chunks count') +
          { gridPos: { h: 7, w: 12, x: 12, y: 15 } },

          // ---- Jobs ----
          { type: 'row', title: 'Jobs', gridPos: { h: 1, w: 24, x: 0, y: 22 } },
          dashboard.panel('Rate of jobs sent to worker for processing') +
          dashboard.queryPanel(['sum(rate(loki_compactor_jobs_queued_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], 'job creation rate') +
          { gridPos: { h: 7, w: 12, x: 0, y: 23 } },
          dashboard.panel('Rate of job processed by status') +
          dashboard.queryPanel(['sum by (status) (rate(loki_compactor_worker_jobs_processed_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor-worker')], ['{{status}}']) +
          { gridPos: { h: 7, w: 12, x: 12, y: 23 } },
          dashboard.panel('Rate of job retries by reason') +
          dashboard.queryPanel(['sum by (reason) (rate(loki_compactor_job_retries_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], ['{{reason}}']) +
          { gridPos: { h: 7, w: 12, x: 0, y: 30 } },
          dashboard.panel('Rate of dropped jobs due to running out of max attempts') +
          dashboard.queryPanel(['sum(rate(loki_compactor_jobs_dropped_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], 'jobs drop rate') +
          { gridPos: { h: 7, w: 12, x: 12, y: 30 } },
          // Legends follow the quantiles actually queried (0.99 and 0.50).
          // The panel spans the full width: the slot to its right previously
          // held a second, identical copy of the dropped-jobs panel above.
          dashboard.panel('Latency in processing of jobs') +
          dashboard.queryPanel([
            'histogram_quantile(0.99, sum by(le) (rate(loki_compactor_jobs_processing_duration_seconds_bucket{%s}[$__rate_interval])))' % $.jobMatcher('compactor'),
            'histogram_quantile(0.50, sum by(le) (rate(loki_compactor_jobs_processing_duration_seconds_bucket{%s}[$__rate_interval])))' % $.jobMatcher('compactor'),
          ], ['p99', 'p50']) +
          { gridPos: { h: 7, w: 24, x: 0, y: 37 } },
          dashboard.panel('Rate of lines deleted / sec') +
          dashboard.queryPanel(['sum(rate(loki_compactor_deletion_job_runner_deleted_lines_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor-worker')], 'log lines deletion rate') +
          { gridPos: { h: 7, w: 24, x: 0, y: 44 } },

          // ---- Manifest processing ----
          { type: 'row', title: 'Manifest Processing', gridPos: { h: 1, w: 24, x: 0, y: 51 } },
          dashboard.panel('Rate in failures at various manifest processing stages') +
          dashboard.queryPanel(['sum by (stage) (rate(loki_compactor_process_manifest_failures_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], ['{{stage}}']) +
          { gridPos: { h: 7, w: 12, x: 0, y: 52 } },
          dashboard.panel('Storage updates applied') +
          dashboard.queryPanel(['sum by (type) (rate(loki_compactor_deletion_storage_updates_applied_total{%s}[$__rate_interval]))' % $.jobMatcher('compactor')], ['{{type}}']) +
          { gridPos: { h: 7, w: 12, x: 12, y: 52 } },

          // ---- Logs ----
          // The compactor log query filters out lines containing any of the
          // listed substrings.
          { type: 'row', title: 'Logs', gridPos: { h: 1, w: 24, x: 0, y: 59 } },
          $.logPanel('Worker Logs', '{%s}' % $.jobMatcher(worker_job_matcher)) +
          { gridPos: { h: 7, w: 24, x: 0, y: 60 } },
          $.logPanel('Compactor Logs', '{%s} != "OpenTelemetry" != "no marks file" != "compact" != "count=" != "ingester" != "skipping upload" != "retention" != "memberlist" != "uploading delete requests db"' % $.jobMatcher(compactor_job_matcher)) +
          { gridPos: { h: 7, w: 24, x: 0, y: 67 } },
        ],
      },
  },
}