diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bdc8e26b14..10f52af4ef 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -193,6 +193,8 @@ image_prometheus: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometh image_prometheus_config_reloader: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-config-reloader:master image_prometheus_operator: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-operator:master image_prometheus_statsd_exporter: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-statsd-exporter:master +image_prometheus_dcgm_exporter: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-dcgm-exporter:master +image_prometheus_kube_state_metrics_exporter: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-kube-state-metrics:master image_prometheus_node_exporter: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-node-exporter:master image_kube_rbac_proxy: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/kube-rbac-proxy:master image_grafana: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/grafana:master @@ -224,6 +226,8 @@ image_prometheus: gcr.io/<project_id>/cortexlabs/prometheus:master image_prometheus_config_reloader: gcr.io/<project_id>/cortexlabs/prometheus-config-reloader:master image_prometheus_operator: gcr.io/<project_id>/cortexlabs/prometheus-operator:master image_prometheus_statsd_exporter: gcr.io/<project_id>/cortexlabs/prometheus-statsd-exporter:master +image_prometheus_dcgm_exporter: gcr.io/<project_id>/cortexlabs/prometheus-dcgm-exporter:master +image_prometheus_kube_state_metrics_exporter: gcr.io/<project_id>/cortexlabs/prometheus-kube-state-metrics:master image_prometheus_node_exporter: gcr.io/<project_id>/cortexlabs/prometheus-node-exporter:master image_kube_rbac_proxy: gcr.io/<project_id>/cortexlabs/kube-rbac-proxy:master image_grafana: gcr.io/<project_id>/cortexlabs/grafana:master diff --git a/build/images.sh b/build/images.sh index 95cbd7fc3e..6b3742e711 100644 --- a/build/images.sh +++ b/build/images.sh @@ -58,6 +58,8 @@ non_dev_images_cluster=( "prometheus-config-reloader" "prometheus-operator" "prometheus-statsd-exporter" + "prometheus-dcgm-exporter" + 
"prometheus-kube-state-metrics" "prometheus-node-exporter" "kube-rbac-proxy" "grafana" diff --git a/charts/dashboards/batch.json b/charts/dashboards/batch.json index a3ed008a91..565107bc73 100644 --- a/charts/dashboards/batch.json +++ b/charts/dashboards/batch.json @@ -15,6 +15,7 @@ "editable": true, "gnetId": null, "graphTooltip": 0, + "iteration": 1614622843373, "links": [], "panels": [ { @@ -36,13 +37,25 @@ "content": "

BatchAPI

\n", "mode": "markdown" }, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "timeFrom": null, "timeShift": null, "title": "", "transparent": true, "type": "text" }, + { + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 22, + "title": "API Stats", + "type": "row" + }, { "aliasColors": {}, "bars": false, @@ -62,7 +75,7 @@ "h": 9, "w": 12, "x": 0, - "y": 2 + "y": 3 }, "hiddenSeries": false, "id": 2, @@ -83,7 +96,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -162,7 +175,7 @@ "h": 9, "w": 12, "x": 12, - "y": 2 + "y": 3 }, "hiddenSeries": false, "id": 3, @@ -183,7 +196,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -219,6 +232,7 @@ }, "yaxes": [ { + "$$hashKey": "object:262", "decimals": 0, "format": "short", "label": null, @@ -228,6 +242,7 @@ "show": true }, { + "$$hashKey": "object:263", "format": "short", "label": null, "logBase": 1, @@ -259,8 +274,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 6, - "y": 11 + "x": 0, + "y": 12 }, "hiddenSeries": false, "id": 5, @@ -281,7 +296,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -337,6 +352,1078 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": { + "Active Jobs": "semi-dark-green", + "Active Workers": "semi-dark-orange" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Active jobs/workers", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "hiddenSeries": false, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": true, + "current": 
true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "count(kube_job_status_active{job_name=~\"$api_name.+\"})", + "interval": "", + "legendFormat": "Active Jobs", + "refId": "Active Batches" + }, + { + "expr": "sum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Active Workers", + "refId": "Active Workers" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "# Active Jobs/Workers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:128", + "decimals": 0, + "format": "count", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:129", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 11, + "title": "Aggregate Worker Usage", + "type": "row" + }, + { + "aliasColors": { + "Total CPU Request": "semi-dark-orange", + "Total CPU Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total CPU usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + 
"fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 21 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total CPU Usage", + "refId": "CPU Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total CPU Request", + "refId": "CPU Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "core", + "label": "cpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total Memory Request": "semi-dark-orange", + "Total Memory Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total memory usage across all workers of the API", + 
"fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 21 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total Memory Usage", + "refId": "Memory Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"$api_name.+\"}) / 1024^2", + "hide": false, + "interval": "", + "legendFormat": "Total Memory Request", + "refId": "Memory Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + 
"yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total GPU Capacity": "semi-dark-orange", + "Total GPU Usage": "semi-dark-green", + "Total GPU Utilization": "light-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total GPU core usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 29 + }, + "hiddenSeries": false, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100", + "hide": false, + "interval": "", + "legendFormat": "Total GPU Usage", + "refId": "GPU Usage" + }, + { + "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total GPU Capacity", + "refId": "GPU Capacity" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total GPU Core Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "gpuCore", + "label": "gpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + 
"label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total Capacity GPU Memory": "semi-dark-orange", + "Total Used GPU Memory": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total GPU memory usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 29 + }, + "hiddenSeries": false, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total Used GPU Memory", + "refId": "GPU Used Memory" + }, + { + "exemplar": false, + "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total Capacity GPU Memory", + "refId": "GPU Capacity Memory" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total GPU Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + 
"$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 37 + }, + "id": 9, + "panels": [], + "title": "Avg Worker Usage", + "type": "row" + }, + { + "aliasColors": { + "Avg CPU Request": "semi-dark-orange", + "Avg CPU Usage": "semi-dark-green", + "Total CPU Request": "semi-dark-orange", + "Total CPU Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg CPU usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 38 + }, + "hiddenSeries": false, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg CPU Usage", + "refId": "CPU Usage" + }, + { + "expr": 
"sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"$api_name.+\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg CPU Request", + "refId": "CPU Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "core", + "label": "cpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg Memory Request": "semi-dark-orange", + "Avg Memory Usage": "semi-dark-green", + "Total Memory Request": "semi-dark-orange", + "Total Memory Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg memory usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 38 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": 
[ + { + "exemplar": false, + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg Memory Usage", + "refId": "Memory Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"$api_name.+\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg Memory Request", + "refId": "Memory Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg GPU Capacity": "semi-dark-orange", + "Avg GPU Usage": "semi-dark-green", + "Total GPU Capacity": "semi-dark-orange", + "Total GPU Usage": "semi-dark-green", + "Total GPU Utilization": "light-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg GPU core usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + 
"h": 8, + "w": 12, + "x": 0, + "y": 46 + }, + "hiddenSeries": false, + "id": 25, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Avg GPU Usage", + "refId": "GPU Usage" + }, + { + "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Avg GPU Capacity", + "refId": "GPU Capacity" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg GPU Core Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "gpuCore", + "label": "gpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg Capacity GPU Memory": "semi-dark-orange", + "Avg Used GPU Memory": "semi-dark-green", + "Total Capacity GPU Memory": "semi-dark-orange", + "Total Used GPU Memory": "semi-dark-green" + }, + "bars": false, 
+ "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg GPU memory usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 46 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg Used GPU Memory", + "refId": "GPU Used Memory" + }, + { + "exemplar": false, + "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg Capacity GPU Memory", + "refId": "GPU Capacity Memory" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg GPU Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + 
"format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "refresh": "30s", @@ -362,7 +1449,7 @@ "query": "label_values({__name__=~\"cortex_batch_.+\"}, api_name)", "refId": "StandardVariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, diff --git a/charts/dashboards/realtime.json b/charts/dashboards/realtime.json index d4ba134527..6ac9f89d5c 100644 --- a/charts/dashboards/realtime.json +++ b/charts/dashboards/realtime.json @@ -15,8 +15,7 @@ "editable": true, "gnetId": null, "graphTooltip": 0, - "id": 10, - "iteration": 1612793050833, + "iteration": 1614624509947, "links": [], "panels": [ { @@ -38,13 +37,27 @@ "content": "

RealtimeAPI

", "mode": "markdown" }, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "timeFrom": null, "timeShift": null, "title": "", "transparent": true, "type": "text" }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 22, + "panels": [], + "title": "API Stats", + "type": "row" + }, { "aliasColors": {}, "bars": false, @@ -64,7 +77,7 @@ "h": 9, "w": 12, "x": 0, - "y": 2 + "y": 3 }, "hiddenSeries": false, "id": 2, @@ -86,7 +99,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -178,7 +191,7 @@ "h": 9, "w": 12, "x": 12, - "y": 2 + "y": 3 }, "hiddenSeries": false, "id": 4, @@ -199,7 +212,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -276,7 +289,7 @@ "h": 9, "w": 12, "x": 0, - "y": 11 + "y": 12 }, "hiddenSeries": false, "id": 8, @@ -298,7 +311,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -350,6 +363,7 @@ }, "yaxes": [ { + "$$hashKey": "object:1217", "format": "reqps", "label": null, "logBase": 1, @@ -358,6 +372,7 @@ "show": true }, { + "$$hashKey": "object:1218", "format": "short", "label": null, "logBase": 1, @@ -389,7 +404,7 @@ "h": 9, "w": 12, "x": 12, - "y": 11 + "y": 12 }, "hiddenSeries": false, "id": 7, @@ -410,7 +425,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -423,7 +438,7 @@ "expr": "count(cortex_in_flight_requests{api_name=~\"$api_name\"}) by (api_name)", "interval": "", "legendFormat": "{{api_name}}", - "refId": "A" + "refId": "Active Replicas" } ], "thresholds": [], @@ -446,6 +461,7 @@ }, "yaxes": [ { + "$$hashKey": 
"object:236", "decimals": 0, "format": "short", "label": null, @@ -455,6 +471,7 @@ "show": true }, { + "$$hashKey": "object:237", "format": "short", "label": null, "logBase": 1, @@ -487,7 +504,7 @@ "h": 9, "w": 12, "x": 0, - "y": 20 + "y": 21 }, "hiddenSeries": false, "id": 9, @@ -509,7 +526,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -601,7 +618,7 @@ "h": 9, "w": 12, "x": 12, - "y": 20 + "y": 21 }, "hiddenSeries": false, "id": 10, @@ -623,7 +640,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -715,7 +732,7 @@ "h": 9, "w": 12, "x": 0, - "y": 29 + "y": 30 }, "hiddenSeries": false, "id": 6, @@ -736,7 +753,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -788,6 +805,7 @@ }, "yaxes": [ { + "$$hashKey": "object:1302", "format": "ms", "label": null, "logBase": 1, @@ -796,6 +814,7 @@ "show": true }, { + "$$hashKey": "object:1303", "format": "short", "label": null, "logBase": 1, @@ -828,7 +847,7 @@ "h": 9, "w": 12, "x": 12, - "y": 29 + "y": 30 }, "hiddenSeries": false, "id": 11, @@ -849,7 +868,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -924,7 +943,9 @@ } }, { - "aliasColors": {}, + "aliasColors": { + "iris-classifier": "light-green" + }, "bars": false, "dashLength": 10, "dashes": false, @@ -942,7 +963,7 @@ "h": 9, "w": 12, "x": 0, - "y": 38 + "y": 39 }, "hiddenSeries": false, "id": 16, @@ -963,7 +984,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1056,7 +1077,7 @@ "h": 9, "w": 12, 
"x": 12, - "y": 38 + "y": 39 }, "hiddenSeries": false, "id": 12, @@ -1077,7 +1098,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1150,6 +1171,967 @@ "align": false, "alignLevel": null } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 48 + }, + "id": 20, + "panels": [], + "title": "Aggregate Usage", + "type": "row" + }, + { + "aliasColors": { + "Total CPU Request": "semi-dark-orange", + "Total CPU Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total CPU usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 49 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total CPU Usage", + "refId": "CPU Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total CPU Request", + "refId": "CPU Request" + } + ], + 
"thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "core", + "label": "cpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total Memory Request": "semi-dark-orange", + "Total Memory Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total memory usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 49 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", + "format": "time_series", + "instant": 
false, + "interval": "", + "legendFormat": "Total Memory Usage", + "refId": "Memory Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"api-$api_name.+\"}) / 1024^2", + "hide": false, + "interval": "", + "legendFormat": "Total Memory Request", + "refId": "Memory Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total GPU Capacity": "semi-dark-orange", + "Total GPU Usage": "semi-dark-green", + "Total GPU Utilization": "light-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total GPU core usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 57 + }, + "hiddenSeries": false, + "id": 28, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": 
false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100", + "hide": false, + "interval": "", + "legendFormat": "Total GPU Usage", + "refId": "GPU Usage" + }, + { + "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total GPU Capacity", + "refId": "GPU Capacity" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total GPU Core Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "gpuCore", + "label": "gpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total Capacity GPU Memory": "semi-dark-orange", + "Total Used GPU Memory": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total GPU memory usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 57 + }, + "hiddenSeries": false, + "id": 29, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total Used GPU Memory", + "refId": "GPU Used Memory" + }, + { + "exemplar": false, + "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total Capacity GPU Memory", + "refId": "GPU Capacity Memory" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total GPU Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 65 + }, + "id": 18, + "panels": [], + "title": "Average Replica Usage", + "type": "row" + }, + { + "aliasColors": { + "Avg CPU Request": "semi-dark-orange", + "Avg CPU Usage": "semi-dark-green", + "Total CPU Request": "semi-dark-orange", + "Total CPU Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg CPU usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + 
"fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 66 + }, + "hiddenSeries": false, + "id": 30, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg CPU Usage", + "refId": "CPU Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"api-$api_name.+\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg CPU Request", + "refId": "CPU Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "core", + "label": "cpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg Memory Request": "semi-dark-orange", + "Avg Memory Usage": "semi-dark-green", + "Total Memory Request": "semi-dark-orange", + 
"Total Memory Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg memory usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 66 + }, + "hiddenSeries": false, + "id": 31, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg Memory Usage", + "refId": "Memory Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"api-$api_name.+\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg Memory Request", + "refId": "Memory Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + 
"values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg GPU Capacity": "semi-dark-orange", + "Avg GPU Usage": "semi-dark-green", + "Total GPU Capacity": "semi-dark-orange", + "Total GPU Utilization": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg GPU core usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 74 + }, + "hiddenSeries": false, + "id": 32, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg GPU Usage", + "refId": "GPU Usage" + }, + { + "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))", + "hide": false, + "interval": "", + "legendFormat": "Avg GPU Capacity", + "refId": "GPU Capacity" + } + ], + 
"thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg GPU Core Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "gpuCore", + "label": "gpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg Capacity GPU Memory": "semi-dark-orange", + "Avg Used GPU Memory": "semi-dark-green", + "Total Capacity GPU Memory": "semi-dark-orange", + "Total Used GPU Memory": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg GPU memory usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 74 + }, + "hiddenSeries": false, + "id": 33, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg Used GPU 
Memory", + "refId": "GPU Used Memory" + }, + { + "exemplar": false, + "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg Capacity GPU Memory", + "refId": "GPU Capacity Memory" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg GPU Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "refresh": "30s", @@ -1195,5 +2177,5 @@ "timezone": "", "title": "RealtimeAPI", "uid": "realtimeapi", - "version": 1 + "version": 4 } diff --git a/charts/templates/clusterconfig.yaml b/charts/templates/clusterconfig.yaml index 960d560a9e..c995388074 100644 --- a/charts/templates/clusterconfig.yaml +++ b/charts/templates/clusterconfig.yaml @@ -54,6 +54,7 @@ data: image_prometheus_config_reloader: {{ .Values.cortex.image_prometheus_config_reloader }} image_prometheus_operator: {{ .Values.cortex.image_prometheus_operator }} image_prometheus_statsd_exporter: {{ .Values.cortex.image_prometheus_statsd_exporter }} + image_prometheus_kube_state_metrics: {{ .Values.cortex.image_prometheus_kube_state_metrics }} image_prometheus_to_cloudwatch: {{ .Values.cortex.image_prometheus_to_cloudwatch }} --- {{- else if eq .Values.global.provider "gcp" }} @@ -85,6 +86,7 @@ data: image_prometheus_config_reloader: 
{{ .Values.cortex.image_prometheus_config_reloader }} image_prometheus_operator: {{ .Values.cortex.image_prometheus_operator }} image_prometheus_statsd_exporter: {{ .Values.cortex.image_prometheus_statsd_exporter }} + image_prometheus_kube_state_metrics: {{ .Values.cortex.image_prometheus_kube_state_metrics }} image_prometheus_stackdriver_sidecar: {{ .Values.cortex.image_prometheus_stackdriver_sidecar }} --- {{- end }} diff --git a/charts/templates/prometheus-kube-state-metrics.yaml b/charts/templates/prometheus-kube-state-metrics.yaml new file mode 100644 index 0000000000..5879869352 --- /dev/null +++ b/charts/templates/prometheus-kube-state-metrics.yaml @@ -0,0 +1,264 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + name: kube-state-metrics + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + name: kube-state-metrics +rules: + +- apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - configmaps + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - cronjobs + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - daemonsets + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - deployments + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - endpoints + verbs: ["list", "watch"] + +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - jobs + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - limitranges + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - 
mutatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - namespaces + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumeclaims + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumes + verbs: ["list", "watch"] + +- apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - pods + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - replicasets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - replicationcontrollers + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - resourcequotas + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - secrets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - services + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - validatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - volumeattachments + verbs: ["list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: {{ .Release.Namespace }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/name: kube-state-metrics + 
app.kubernetes.io/version: "1.9.8" +spec: + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics + replicas: 1 + template: + metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + spec: + hostNetwork: false + serviceAccountName: kube-state-metrics + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsUser: 65534 + containers: + - name: kube-state-metrics + resources: + requests: + cpu: 300m + memory: 400Mi + args: + - --collectors=certificatesigningrequests + - --collectors=configmaps + - --collectors=cronjobs + - --collectors=daemonsets + - --collectors=deployments + - --collectors=endpoints + - --collectors=horizontalpodautoscalers + - --collectors=ingresses + - --collectors=jobs + - --collectors=limitranges + - --collectors=mutatingwebhookconfigurations + - --collectors=namespaces + - --collectors=networkpolicies + - --collectors=nodes + - --collectors=persistentvolumeclaims + - --collectors=persistentvolumes + - --collectors=poddisruptionbudgets + - --collectors=pods + - --collectors=replicasets + - --collectors=replicationcontrollers + - --collectors=resourcequotas + - --collectors=secrets + - --collectors=services + - --collectors=statefulsets + - --collectors=storageclasses + - --collectors=validatingwebhookconfigurations + - --collectors=volumeattachments + - --telemetry-port=8081 + imagePullPolicy: Always + image: {{ .Values.cortex.image_prometheus_kube_state_metrics }} + ports: + - containerPort: 8080 + name: metrics + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 +--- +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: kube-state-metrics + namespace: {{ .Release.Namespace }} + labels: + name: kube-state-metrics + monitoring.cortex.dev: kube-state-metrics +spec: + jobLabel: "kube-state-metrics" + podMetricsEndpoints: + - port: 
metrics + scheme: http + path: /metrics + interval: 30s + namespaceSelector: + any: true + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics diff --git a/charts/templates/prometheus-kubelet-exporter.yaml b/charts/templates/prometheus-kubelet-exporter.yaml new file mode 100644 index 0000000000..8784c1e949 --- /dev/null +++ b/charts/templates/prometheus-kubelet-exporter.yaml @@ -0,0 +1,91 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: kubelet + monitoring.cortex.dev: kubelet-exporter + name: kubelet + namespace: {{ .Release.Namespace }} +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) + sourceLabels: + - __name__ + - action: drop + regex: 
kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_cond
ition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota
_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) + sourceLabels: + - __name__ + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + honorTimestamps: false + interval: 30s + metricRelabelings: + - action: drop + regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) + sourceLabels: + - __name__ + path: /metrics/cadvisor + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + path: /metrics/probes + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: k8s-app + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: kubelet diff --git a/charts/templates/prometheus-monitoring.yaml b/charts/templates/prometheus-monitoring.yaml index 18a0a9e11b..1b211cb8bb 100644 --- a/charts/templates/prometheus-monitoring.yaml +++ b/charts/templates/prometheus-monitoring.yaml @@ -27,12 +27,12 @@ spec: matchExpressions: - key: "monitoring.cortex.dev" operator: "In" - values: [ "istio", "request-monitor", "statsd-exporter" ] + values: [ "istio", "request-monitor", "statsd-exporter", "dcgm-exporter", "kube-state-metrics" ] serviceMonitorSelector: matchExpressions: - key: 
"monitoring.cortex.dev" operator: "In" - values: [ "node-exporter" ] + values: [ "kubelet-exporter", "node-exporter" ] ruleSelector: matchLabels: prometheus: k8s diff --git a/charts/values.yaml b/charts/values.yaml index 2548d81d03..72492e2bef 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -28,6 +28,7 @@ cortex: image_prometheus_config_reloader: quay.io/cortexlabs/prometheus-config-reloader:master image_prometheus_operator: quay.io/cortexlabs/prometheus-operator:master image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master + image_prometheus_kube_state_metrics: quay.io/cortexlabs/prometheus-kube-state-metrics:master image_prometheus_node_exporter: quay.io/cortexlabs/prometheus-node-exporter:master image_kube_rbac_proxy: quay.io/cortexlabs/kube-rbac-proxy:master image_grafana: quay.io/cortexlabs/grafana:master diff --git a/cli/cmd/lib_cluster_config_aws.go b/cli/cmd/lib_cluster_config_aws.go index 7c9e7a79c4..d30f59f3d9 100644 --- a/cli/cmd/lib_cluster_config_aws.go +++ b/cli/cmd/lib_cluster_config_aws.go @@ -424,6 +424,14 @@ func setConfigFieldsFromCached(userClusterConfig *clusterconfig.Config, cachedCl return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImagePrometheusStatsDExporterKey, cachedClusterConfig.ImagePrometheusStatsDExporter) } + if s.Obj(cachedClusterConfig.ImagePrometheusDCGMExporter) != s.Obj(userClusterConfig.ImagePrometheusDCGMExporter) { + return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImagePrometheusDCGMExporterKey, cachedClusterConfig.ImagePrometheusDCGMExporter) + } + + if s.Obj(cachedClusterConfig.ImagePrometheusKubeStateMetrics) != s.Obj(userClusterConfig.ImagePrometheusKubeStateMetrics) { + return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImagePrometheusKubeStateMetricsKey, cachedClusterConfig.ImagePrometheusKubeStateMetrics) + } + if s.Obj(cachedClusterConfig.ImagePrometheusNodeExporter) != 
s.Obj(userClusterConfig.ImagePrometheusNodeExporter) { return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImagePrometheusNodeExporterKey, cachedClusterConfig.ImagePrometheusNodeExporter) } @@ -748,6 +756,12 @@ func clusterConfigConfirmationStr(clusterConfig clusterconfig.Config) string { if clusterConfig.ImagePrometheusStatsDExporter != defaultConfig.ImagePrometheusStatsDExporter { items.Add(clusterconfig.ImagePrometheusStatsDExporterUserKey, clusterConfig.ImagePrometheusStatsDExporter) } + if clusterConfig.ImagePrometheusDCGMExporter != defaultConfig.ImagePrometheusDCGMExporter { + items.Add(clusterconfig.ImagePrometheusDCGMExporterUserKey, clusterConfig.ImagePrometheusDCGMExporter) + } + if clusterConfig.ImagePrometheusKubeStateMetrics != defaultConfig.ImagePrometheusKubeStateMetrics { + items.Add(clusterconfig.ImagePrometheusKubeStateMetricsUserKey, clusterConfig.ImagePrometheusKubeStateMetrics) + } if clusterConfig.ImageGrafana != defaultConfig.ImageGrafana { items.Add(clusterconfig.ImageGrafanaUserKey, clusterConfig.ImageGrafana) } diff --git a/dev/versions.md b/dev/versions.md index 902af66338..611f76033a 100644 --- a/dev/versions.md +++ b/dev/versions.md @@ -323,6 +323,22 @@ supported () 1. Update the base image version in `images/prometheus-statsd-exporter/Dockerfile`. 1. Update `prometheus-statsd-exporter.yaml` as necessary, if that's the case. +## Prometheus DCGM Exporter + +1. Run `helm template` on the DCGM charts https://github.com/NVIDIA/gpu-monitoring-tools/tree/master/deployment/dcgm-exporter and save the output somewhere temporarily. +1. Update the base image version in `images/prometheus-dcgm-exporter/Dockerfile`. +1. Update `prometheus-dcgm-exporter.yaml` as necessary, if that's the case. Keep in mind that in our k8s template, the `ServiceMonitor` was changed to a `PodMonitor`. Remove any unnecessary labels. + +## Prometheus kube-state-metrics Exporter + +1. 
Run `helm template` on the kube-state-metrics charts https://github.com/kubernetes/kube-state-metrics/tree/master/charts/kube-state-metrics and save the output somewhere temporarily. +1. Update the base image version in `images/prometheus-kube-state-metrics/Dockerfile`. +1. Update `prometheus-kube-state-metrics.yaml` as necessary, if that's the case. Keep in mind that in our k8s template, the `ServiceMonitor` was changed to a `PodMonitor`. Remove any unnecessary labels. The update can also include adjusting the resource requests. + +## Prometheus Kubelet Exporter + +1. Check if https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/kubernetes-serviceMonitorKubelet.yaml has changed when compared to `manager/manifests/prometheus-kubelet-exporter.yaml`. + ## Prometheus Node Exporter 1. Find the latest release in the Kube Prometheus [GitHub Repo](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/). diff --git a/docs/clusters/aws/install.md b/docs/clusters/aws/install.md index 1532f598aa..34df6935f7 100644 --- a/docs/clusters/aws/install.md +++ b/docs/clusters/aws/install.md @@ -108,6 +108,8 @@ image_prometheus: quay.io/cortexlabs/prometheus:master image_prometheus_config_reloader: quay.io/cortexlabs/prometheus-config-reloader:master image_prometheus_operator: quay.io/cortexlabs/prometheus-operator:master image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master +image_prometheus_dcgm_exporter: quay.io/cortexlabs/prometheus-dcgm-exporter:master +image_prometheus_kube_state_metrics: quay.io/cortexlabs/prometheus-kube-state-metrics:master image_prometheus_node_exporter: quay.io/cortexlabs/prometheus-node-exporter:master image_kube_rbac_proxy: quay.io/cortexlabs/kube-rbac-proxy:master image_grafana: quay.io/cortexlabs/grafana:master diff --git a/docs/clusters/gcp/install.md b/docs/clusters/gcp/install.md index be3e043b22..5c6294a701 100644 ---
a/docs/clusters/gcp/install.md +++ b/docs/clusters/gcp/install.md @@ -82,6 +82,8 @@ image_prometheus: quay.io/cortexlabs/prometheus:master image_prometheus_config_reloader: quay.io/cortexlabs/prometheus-config-reloader:master image_prometheus_operator: quay.io/cortexlabs/prometheus-operator:master image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master +image_prometheus_dcgm_exporter: quay.io/cortexlabs/prometheus-dcgm-exporter:master +image_prometheus_kube_state_metrics: quay.io/cortexlabs/prometheus-kube-state-metrics:master image_prometheus_node_exporter: quay.io/cortexlabs/prometheus-node-exporter:master image_kube_rbac_proxy: quay.io/cortexlabs/kube-rbac-proxy:master image_grafana: quay.io/cortexlabs/grafana:master diff --git a/images/prometheus-dcgm-exporter/Dockerfile b/images/prometheus-dcgm-exporter/Dockerfile new file mode 100644 index 0000000000..175a1a35d6 --- /dev/null +++ b/images/prometheus-dcgm-exporter/Dockerfile @@ -0,0 +1 @@ +FROM nvidia/dcgm-exporter:2.0.13-2.1.1-ubuntu18.04 diff --git a/images/prometheus-kube-state-metrics/Dockerfile b/images/prometheus-kube-state-metrics/Dockerfile new file mode 100644 index 0000000000..f0f37fb6f8 --- /dev/null +++ b/images/prometheus-kube-state-metrics/Dockerfile @@ -0,0 +1 @@ +FROM k8s.gcr.io/kube-state-metrics/kube-state-metrics:v1.9.8 diff --git a/manager/install.sh b/manager/install.sh index 5a9917f16c..087d09026b 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -74,6 +74,7 @@ function cluster_up_aws() { if [[ "$CORTEX_INSTANCE_TYPE" == p* ]] || [[ "$CORTEX_INSTANCE_TYPE" == g* ]]; then echo -n "○ configuring gpu support " envsubst < manifests/nvidia_aws.yaml | kubectl apply -f - >/dev/null + envsubst < manifests/prometheus-dcgm-exporter.yaml | kubectl apply -f - >/dev/null echo "✓" fi @@ -132,6 +133,7 @@ function cluster_up_gcp() { if [ -n "$CORTEX_ACCELERATOR_TYPE" ]; then echo -n "○ configuring gpu support " envsubst <
manifests/nvidia_gcp.yaml | kubectl apply -f - >/dev/null + envsubst < manifests/prometheus-dcgm-exporter.yaml | kubectl apply -f - >/dev/null echo "✓" fi @@ -303,6 +305,8 @@ function setup_secrets() { function setup_prometheus() { envsubst < manifests/prometheus-operator.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-statsd-exporter.yaml | kubectl apply -f - >/dev/null + envsubst < manifests/prometheus-kubelet-exporter.yaml | kubectl apply -f - >/dev/null + envsubst < manifests/prometheus-kube-state-metrics.yaml | kubectl apply -f - >/dev/null envsubst < manifests/prometheus-node-exporter.yaml | kubectl apply -f - >/dev/null python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-monitoring.yaml.j2 | kubectl apply -f - >/dev/null } diff --git a/manager/manifests/grafana/grafana-dashboard-batch.yaml b/manager/manifests/grafana/grafana-dashboard-batch.yaml index 2624506a9e..f82fff3f21 100644 --- a/manager/manifests/grafana/grafana-dashboard-batch.yaml +++ b/manager/manifests/grafana/grafana-dashboard-batch.yaml @@ -36,6 +36,7 @@ data: "editable": true, "gnetId": null, "graphTooltip": 0, + "iteration": 1614622843373, "links": [], "panels": [ { @@ -57,13 +58,25 @@ data: "content": "

BatchAPI

\n", "mode": "markdown" }, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "timeFrom": null, "timeShift": null, "title": "", "transparent": true, "type": "text" }, + { + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 22, + "title": "API Stats", + "type": "row" + }, { "aliasColors": {}, "bars": false, @@ -83,7 +96,7 @@ data: "h": 9, "w": 12, "x": 0, - "y": 2 + "y": 3 }, "hiddenSeries": false, "id": 2, @@ -104,7 +117,7 @@ data: "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -183,7 +196,7 @@ data: "h": 9, "w": 12, "x": 12, - "y": 2 + "y": 3 }, "hiddenSeries": false, "id": 3, @@ -204,7 +217,7 @@ data: "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -240,6 +253,7 @@ data: }, "yaxes": [ { + "$$hashKey": "object:262", "decimals": 0, "format": "short", "label": null, @@ -249,6 +263,7 @@ data: "show": true }, { + "$$hashKey": "object:263", "format": "short", "label": null, "logBase": 1, @@ -280,8 +295,8 @@ data: "gridPos": { "h": 8, "w": 12, - "x": 6, - "y": 11 + "x": 0, + "y": 12 }, "hiddenSeries": false, "id": 5, @@ -302,7 +317,7 @@ data: "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -358,6 +373,1078 @@ data: "align": false, "alignLevel": null } + }, + { + "aliasColors": { + "Active Jobs": "semi-dark-green", + "Active Workers": "semi-dark-orange" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Active jobs/workers", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "hiddenSeries": false, + "id": 20, + "legend": { 
+ "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "count(kube_job_status_active{job_name=~\"$api_name.+\"})", + "interval": "", + "legendFormat": "Active Jobs", + "refId": "Active Batches" + }, + { + "expr": "sum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Active Workers", + "refId": "Active Workers" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "# Active Jobs/Workers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:128", + "decimals": 0, + "format": "count", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:129", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 11, + "title": "Aggregate Worker Usage", + "type": "row" + }, + { + "aliasColors": { + "Total CPU Request": "semi-dark-orange", + "Total CPU Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total CPU usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": 
"absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 21 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total CPU Usage", + "refId": "CPU Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total CPU Request", + "refId": "CPU Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "core", + "label": "cpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total Memory Request": "semi-dark-orange", + "Total Memory Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + 
"description": "Total memory usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 21 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total Memory Usage", + "refId": "Memory Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"$api_name.+\"}) / 1024^2", + "hide": false, + "interval": "", + "legendFormat": "Total Memory Request", + "refId": "Memory Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + 
"logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total GPU Capacity": "semi-dark-orange", + "Total GPU Usage": "semi-dark-green", + "Total GPU Utilization": "light-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total GPU core usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 29 + }, + "hiddenSeries": false, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100", + "hide": false, + "interval": "", + "legendFormat": "Total GPU Usage", + "refId": "GPU Usage" + }, + { + "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total GPU Capacity", + "refId": "GPU Capacity" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total GPU Core Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "gpuCore", + "label": "gpu", + "logBase": 1, + "max": null, + "min": "0", + 
"show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total Capacity GPU Memory": "semi-dark-orange", + "Total Used GPU Memory": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total GPU memory usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 29 + }, + "hiddenSeries": false, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total Used GPU Memory", + "refId": "GPU Used Memory" + }, + { + "exemplar": false, + "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total Capacity GPU Memory", + "refId": "GPU Capacity Memory" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total GPU Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", 
+ "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 37 + }, + "id": 9, + "panels": [], + "title": "Avg Worker Usage", + "type": "row" + }, + { + "aliasColors": { + "Avg CPU Request": "semi-dark-orange", + "Avg CPU Usage": "semi-dark-green", + "Total CPU Request": "semi-dark-orange", + "Total CPU Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg CPU usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 38 + }, + "hiddenSeries": false, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg CPU Usage", + "refId": "CPU Usage" + }, + { + 
"expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"$api_name.+\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg CPU Request", + "refId": "CPU Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "core", + "label": "cpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg Memory Request": "semi-dark-orange", + "Avg Memory Usage": "semi-dark-green", + "Total Memory Request": "semi-dark-orange", + "Total Memory Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg memory usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 38 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + 
"targets": [ + { + "exemplar": false, + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg Memory Usage", + "refId": "Memory Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"$api_name.+\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg Memory Request", + "refId": "Memory Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg GPU Capacity": "semi-dark-orange", + "Avg GPU Usage": "semi-dark-green", + "Total GPU Capacity": "semi-dark-orange", + "Total GPU Usage": "semi-dark-green", + "Total GPU Utilization": "light-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg GPU core usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 46 + }, + "hiddenSeries": false, + "id": 25, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Avg GPU Usage", + "refId": "GPU Usage" + }, + { + "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Avg GPU Capacity", + "refId": "GPU Capacity" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg GPU Core Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "gpuCore", + "label": "gpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg Capacity GPU Memory": "semi-dark-orange", + "Avg Used GPU Memory": "semi-dark-green", + "Total Capacity GPU Memory": "semi-dark-orange", + "Total Used GPU Memory": "semi-dark-green" + }, + 
"bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg GPU memory usage across all workers of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 46 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg Used GPU Memory", + "refId": "GPU Used Memory" + }, + { + "exemplar": false, + "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg Capacity GPU Memory", + "refId": "GPU Capacity Memory" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg GPU Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": 
"object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "refresh": "30s", @@ -383,7 +1470,7 @@ data: "query": "label_values({__name__=~\"cortex_batch_.+\"}, api_name)", "refId": "StandardVariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, diff --git a/manager/manifests/grafana/grafana-dashboard-realtime.yaml b/manager/manifests/grafana/grafana-dashboard-realtime.yaml index 41984d4120..1dbdfaca83 100644 --- a/manager/manifests/grafana/grafana-dashboard-realtime.yaml +++ b/manager/manifests/grafana/grafana-dashboard-realtime.yaml @@ -36,8 +36,7 @@ data: "editable": true, "gnetId": null, "graphTooltip": 0, - "id": 10, - "iteration": 1612793050833, + "iteration": 1614624509947, "links": [], "panels": [ { @@ -59,13 +58,27 @@ data: "content": "

RealtimeAPI

", "mode": "markdown" }, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "timeFrom": null, "timeShift": null, "title": "", "transparent": true, "type": "text" }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 22, + "panels": [], + "title": "API Stats", + "type": "row" + }, { "aliasColors": {}, "bars": false, @@ -85,7 +98,7 @@ data: "h": 9, "w": 12, "x": 0, - "y": 2 + "y": 3 }, "hiddenSeries": false, "id": 2, @@ -107,7 +120,7 @@ data: "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -199,7 +212,7 @@ data: "h": 9, "w": 12, "x": 12, - "y": 2 + "y": 3 }, "hiddenSeries": false, "id": 4, @@ -220,7 +233,7 @@ data: "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -297,7 +310,7 @@ data: "h": 9, "w": 12, "x": 0, - "y": 11 + "y": 12 }, "hiddenSeries": false, "id": 8, @@ -319,7 +332,7 @@ data: "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -371,6 +384,7 @@ data: }, "yaxes": [ { + "$$hashKey": "object:1217", "format": "reqps", "label": null, "logBase": 1, @@ -379,6 +393,7 @@ data: "show": true }, { + "$$hashKey": "object:1218", "format": "short", "label": null, "logBase": 1, @@ -410,7 +425,7 @@ data: "h": 9, "w": 12, "x": 12, - "y": 11 + "y": 12 }, "hiddenSeries": false, "id": 7, @@ -431,7 +446,7 @@ data: "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -444,7 +459,7 @@ data: "expr": "count(cortex_in_flight_requests{api_name=~\"$api_name\"}) by (api_name)", "interval": "", "legendFormat": "{{api_name}}", - "refId": "A" + "refId": "Active Replicas" } ], 
"thresholds": [], @@ -467,6 +482,7 @@ data: }, "yaxes": [ { + "$$hashKey": "object:236", "decimals": 0, "format": "short", "label": null, @@ -476,6 +492,7 @@ data: "show": true }, { + "$$hashKey": "object:237", "format": "short", "label": null, "logBase": 1, @@ -508,7 +525,7 @@ data: "h": 9, "w": 12, "x": 0, - "y": 20 + "y": 21 }, "hiddenSeries": false, "id": 9, @@ -530,7 +547,7 @@ data: "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -622,7 +639,7 @@ data: "h": 9, "w": 12, "x": 12, - "y": 20 + "y": 21 }, "hiddenSeries": false, "id": 10, @@ -644,7 +661,7 @@ data: "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -736,7 +753,7 @@ data: "h": 9, "w": 12, "x": 0, - "y": 29 + "y": 30 }, "hiddenSeries": false, "id": 6, @@ -757,7 +774,7 @@ data: "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -809,6 +826,7 @@ data: }, "yaxes": [ { + "$$hashKey": "object:1302", "format": "ms", "label": null, "logBase": 1, @@ -817,6 +835,7 @@ data: "show": true }, { + "$$hashKey": "object:1303", "format": "short", "label": null, "logBase": 1, @@ -849,7 +868,7 @@ data: "h": 9, "w": 12, "x": 12, - "y": 29 + "y": 30 }, "hiddenSeries": false, "id": 11, @@ -870,7 +889,7 @@ data: "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -945,7 +964,9 @@ data: } }, { - "aliasColors": {}, + "aliasColors": { + "iris-classifier": "light-green" + }, "bars": false, "dashLength": 10, "dashes": false, @@ -963,7 +984,7 @@ data: "h": 9, "w": 12, "x": 0, - "y": 38 + "y": 39 }, "hiddenSeries": false, "id": 16, @@ -984,7 +1005,7 @@ data: "alertThreshold": true }, "percentage": 
false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1077,7 +1098,7 @@ data: "h": 9, "w": 12, "x": 12, - "y": 38 + "y": 39 }, "hiddenSeries": false, "id": 12, @@ -1098,7 +1119,7 @@ data: "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.0", + "pluginVersion": "7.4.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1171,6 +1192,967 @@ data: "align": false, "alignLevel": null } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 48 + }, + "id": 20, + "panels": [], + "title": "Aggregate Usage", + "type": "row" + }, + { + "aliasColors": { + "Total CPU Request": "semi-dark-orange", + "Total CPU Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total CPU usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 49 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total CPU Usage", + "refId": "CPU Usage" + }, + { + "expr": 
"sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total CPU Request", + "refId": "CPU Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "core", + "label": "cpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total Memory Request": "semi-dark-orange", + "Total Memory Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total memory usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 49 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", 
container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total Memory Usage", + "refId": "Memory Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"api-$api_name.+\"}) / 1024^2", + "hide": false, + "interval": "", + "legendFormat": "Total Memory Request", + "refId": "Memory Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total GPU Capacity": "semi-dark-orange", + "Total GPU Usage": "semi-dark-green", + "Total GPU Utilization": "light-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total GPU core usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 57 + }, + "hiddenSeries": false, + "id": 28, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + 
"alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100", + "hide": false, + "interval": "", + "legendFormat": "Total GPU Usage", + "refId": "GPU Usage" + }, + { + "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total GPU Capacity", + "refId": "GPU Capacity" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total GPU Core Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "gpuCore", + "label": "gpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total Capacity GPU Memory": "semi-dark-orange", + "Total Used GPU Memory": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total GPU memory usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 57 + }, + "hiddenSeries": false, + "id": 29, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + 
"lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Total Used GPU Memory", + "refId": "GPU Used Memory" + }, + { + "exemplar": false, + "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Total Capacity GPU Memory", + "refId": "GPU Capacity Memory" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total GPU Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 65 + }, + "id": 18, + "panels": [], + "title": "Average Replica Usage", + "type": "row" + }, + { + "aliasColors": { + "Avg CPU Request": "semi-dark-orange", + "Avg CPU Usage": "semi-dark-green", + "Total CPU Request": "semi-dark-orange", + "Total CPU Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg CPU usage across 
all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 66 + }, + "hiddenSeries": false, + "id": 30, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg CPU Usage", + "refId": "CPU Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"api-$api_name.+\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg CPU Request", + "refId": "CPU Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "core", + "label": "cpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": 
false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg Memory Request": "semi-dark-orange", + "Avg Memory Usage": "semi-dark-green", + "Total Memory Request": "semi-dark-orange", + "Total Memory Usage": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg memory usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 66 + }, + "hiddenSeries": false, + "id": 31, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": false, + "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg Memory Usage", + "refId": "Memory Usage" + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"api-$api_name.+\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg Memory Request", + "refId": "Memory Request" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg Memory 
Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg GPU Capacity": "semi-dark-orange", + "Avg GPU Usage": "semi-dark-green", + "Total GPU Capacity": "semi-dark-orange", + "Total GPU Utilization": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg GPU core usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 74 + }, + "hiddenSeries": false, + "id": 32, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg GPU Usage", + "refId": "GPU Usage" + }, + { + "expr": 
"count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))", + "hide": false, + "interval": "", + "legendFormat": "Avg GPU Capacity", + "refId": "GPU Capacity" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg GPU Core Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "gpuCore", + "label": "gpu", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Avg Capacity GPU Memory": "semi-dark-orange", + "Avg Used GPU Memory": "semi-dark-green", + "Total Capacity GPU Memory": "semi-dark-orange", + "Total Used GPU Memory": "semi-dark-green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Avg GPU memory usage across all replicas of the API", + "fieldConfig": { + "defaults": { + "color": {}, + "custom": {}, + "thresholds": { + "mode": "absolute", + "steps": [] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 74 + }, + "hiddenSeries": false, + "id": 33, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.4.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + 
"stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "hide": false, + "interval": "", + "legendFormat": "Avg Used GPU Memory", + "refId": "GPU Used Memory" + }, + { + "exemplar": false, + "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "Avg Capacity GPU Memory", + "refId": "GPU Capacity Memory" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Avg GPU Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1404", + "format": "MiB", + "label": "memory", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1405", + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "refresh": "30s", @@ -1216,5 +2198,5 @@ data: "timezone": "", "title": "RealtimeAPI", "uid": "realtimeapi", - "version": 1 + "version": 4 } diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml new file mode 100644 index 0000000000..27ec085d6f --- /dev/null +++ b/manager/manifests/prometheus-dcgm-exporter.yaml @@ -0,0 +1,136 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: dcgm-exporter + namespace: default + labels: + app.kubernetes.io/name: dcgm-exporter + app.kubernetes.io/instance: dcgm-exporter + app.kubernetes.io/component: dcgm-exporter +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: dcgm-exporter + namespace: default + labels: + app.kubernetes.io/name: dcgm-exporter + app.kubernetes.io/instance: dcgm-exporter + app.kubernetes.io/component: dcgm-exporter +spec: + updateStrategy: + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: dcgm-exporter + app.kubernetes.io/instance: dcgm-exporter + app.kubernetes.io/component: dcgm-exporter + template: + metadata: + labels: + app.kubernetes.io/name: dcgm-exporter + app.kubernetes.io/instance: dcgm-exporter + app.kubernetes.io/component: dcgm-exporter + spec: + serviceAccountName: dcgm-exporter + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu + operator: Exists + containers: + - env: + - name: DCGM_EXPORTER_LISTEN + value: :9400 + - name: DCGM_EXPORTER_KUBERNETES + value: "true" + image: $CORTEX_IMAGE_PROMETHEUS_DCGM_EXPORTER + imagePullPolicy: Always + name: dcgm-exporter + ports: + - containerPort: 9400 + name: metrics + protocol: TCP + resources: + requests: + cpu: 50m + memory: 50Mi + securityContext: + privileged: true + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - 
mountPath: /var/lib/kubelet/pod-resources + name: pod-gpu-resources + readOnly: true + - mountPath: /usr/local/nvidia + name: nvidia-install-dir-host + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 + tolerations: + - key: workload + effect: NoSchedule + operator: Exists + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + volumes: + - hostPath: + path: /var/lib/kubelet/pod-resources + type: "" + name: pod-gpu-resources + - hostPath: + path: /home/kubernetes/bin/nvidia + type: "" + name: nvidia-install-dir-host +--- +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: dcgm-exporter + namespace: default + labels: + monitoring.cortex.dev: dcgm-exporter + app.kubernetes.io/name: dcgm-exporter + app.kubernetes.io/instance: dcgm-exporter + app.kubernetes.io/component: dcgm-exporter + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '9400' +spec: + jobLabel: "dcgm-exporter" + podMetricsEndpoints: + - port: metrics + path: /metrics + scheme: http + interval: 15s + namespaceSelector: + any: true + selector: + matchLabels: + app.kubernetes.io/name: dcgm-exporter + app.kubernetes.io/instance: dcgm-exporter diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml new file mode 100644 index 0000000000..135e5ffdf8 --- /dev/null +++ b/manager/manifests/prometheus-kube-state-metrics.yaml @@ -0,0 +1,277 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + name: kube-state-metrics + namespace: default +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + name: kube-state-metrics +rules: + +- apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - configmaps + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - cronjobs + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - daemonsets + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - deployments + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - endpoints + verbs: ["list", "watch"] + +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - jobs + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - limitranges + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - namespaces + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch"] + +- 
apiGroups: [""] + resources: + - persistentvolumeclaims + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumes + verbs: ["list", "watch"] + +- apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - pods + verbs: ["list", "watch"] + +- apiGroups: ["extensions", "apps"] + resources: + - replicasets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - replicationcontrollers + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - resourcequotas + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - secrets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - services + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - validatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - volumeattachments + verbs: ["list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: default + labels: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/version: "1.9.8" +spec: + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics + replicas: 1 + template: + metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + spec: + hostNetwork: false + serviceAccountName: kube-state-metrics + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsUser: 65534 + 
containers: + - name: kube-state-metrics + resources: + requests: + cpu: 300m + memory: 400Mi + args: + - --collectors=certificatesigningrequests + - --collectors=configmaps + - --collectors=cronjobs + - --collectors=daemonsets + - --collectors=deployments + - --collectors=endpoints + - --collectors=horizontalpodautoscalers + - --collectors=ingresses + - --collectors=jobs + - --collectors=limitranges + - --collectors=mutatingwebhookconfigurations + - --collectors=namespaces + - --collectors=networkpolicies + - --collectors=nodes + - --collectors=persistentvolumeclaims + - --collectors=persistentvolumes + - --collectors=poddisruptionbudgets + - --collectors=pods + - --collectors=replicasets + - --collectors=replicationcontrollers + - --collectors=resourcequotas + - --collectors=secrets + - --collectors=services + - --collectors=statefulsets + - --collectors=storageclasses + - --collectors=validatingwebhookconfigurations + - --collectors=volumeattachments + - --telemetry-port=8081 + imagePullPolicy: Always + image: $CORTEX_IMAGE_PROMETHEUS_KUBE_STATE_METRICS + ports: + - containerPort: 8080 + name: metrics + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 +--- +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: kube-state-metrics + namespace: default + labels: + name: kube-state-metrics + monitoring.cortex.dev: kube-state-metrics +spec: + jobLabel: "kube-state-metrics" + podMetricsEndpoints: + - port: metrics + scheme: http + path: /metrics + interval: 30s + namespaceSelector: + any: true + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics diff --git a/manager/manifests/prometheus-kubelet-exporter.yaml b/manager/manifests/prometheus-kubelet-exporter.yaml new file mode 100644 index 0000000000..8677500ef9 --- /dev/null +++ 
b/manager/manifests/prometheus-kubelet-exporter.yaml @@ -0,0 +1,105 @@ +# Copyright 2021 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: kubelet + monitoring.cortex.dev: kubelet-exporter + name: kubelet + namespace: default +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: 
apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: 
(admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableC
onditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) + sourceLabels: + - __name__ + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + honorTimestamps: false + interval: 30s + 
metricRelabelings: + - action: drop + regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) + sourceLabels: + - __name__ + path: /metrics/cadvisor + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + path: /metrics/probes + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: k8s-app + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: kubelet diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2 index eb5b2d4c21..d10bf75cd9 100644 --- a/manager/manifests/prometheus-monitoring.yaml.j2 +++ b/manager/manifests/prometheus-monitoring.yaml.j2 @@ -40,12 +40,12 @@ spec: matchExpressions: - key: "monitoring.cortex.dev" operator: "In" - values: [ "istio", "request-monitor", "statsd-exporter" ] + values: [ "istio", "request-monitor", "statsd-exporter", "dcgm-exporter", "kube-state-metrics" ] serviceMonitorSelector: matchExpressions: - key: "monitoring.cortex.dev" operator: "In" - values: [ "node-exporter" ] + values: [ "kubelet-exporter", "node-exporter" ] ruleSelector: matchLabels: prometheus: k8s diff --git a/pkg/operator/operator/k8s.go b/pkg/operator/operator/k8s.go index 0c6689fbe1..c1e8aacc68 100644 --- a/pkg/operator/operator/k8s.go +++ b/pkg/operator/operator/k8s.go @@ -207,13 +207,17 @@ func PythonPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume if api.Compute.Inf == 0 { if api.Compute.CPU != nil { userPodCPURequest := k8s.QuantityPtr(api.Compute.CPU.Quantity.DeepCopy()) - userPodCPURequest.Sub(_requestMonitorCPURequest) + if api.Kind == userconfig.RealtimeAPIKind { + 
userPodCPURequest.Sub(_requestMonitorCPURequest) + } apiPodResourceList[kcore.ResourceCPU] = *userPodCPURequest } if api.Compute.Mem != nil { userPodMemRequest := k8s.QuantityPtr(api.Compute.Mem.Quantity.DeepCopy()) - userPodMemRequest.Sub(_requestMonitorMemRequest) + if api.Kind == userconfig.RealtimeAPIKind { + userPodMemRequest.Sub(_requestMonitorMemRequest) + } apiPodResourceList[kcore.ResourceMemory] = *userPodMemRequest } @@ -236,7 +240,9 @@ func PythonPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume if api.Compute.CPU != nil { userPodCPURequest := k8s.QuantityPtr(api.Compute.CPU.Quantity.DeepCopy()) - userPodCPURequest.Sub(_requestMonitorCPURequest) + if api.Kind == userconfig.RealtimeAPIKind { + userPodCPURequest.Sub(_requestMonitorCPURequest) + } q1, q2 := k8s.SplitInTwo(userPodCPURequest) apiPodResourceList[kcore.ResourceCPU] = *q1 neuronContainer.Resources.Requests[kcore.ResourceCPU] = *q2 @@ -244,7 +250,9 @@ func PythonPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume if api.Compute.Mem != nil { userPodMemRequest := k8s.QuantityPtr(api.Compute.Mem.Quantity.DeepCopy()) - userPodMemRequest.Sub(_requestMonitorMemRequest) + if api.Kind == userconfig.RealtimeAPIKind { + userPodMemRequest.Sub(_requestMonitorMemRequest) + } q1, q2 := k8s.SplitInTwo(userPodMemRequest) apiPodResourceList[kcore.ResourceMemory] = *q1 neuronContainer.Resources.Requests[kcore.ResourceMemory] = *q2 @@ -304,7 +312,9 @@ func TensorFlowPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Vo if api.Compute.Inf == 0 { if api.Compute.CPU != nil { userPodCPURequest := k8s.QuantityPtr(api.Compute.CPU.Quantity.DeepCopy()) - userPodCPURequest.Sub(_requestMonitorCPURequest) + if api.Kind == userconfig.RealtimeAPIKind { + userPodCPURequest.Sub(_requestMonitorCPURequest) + } q1, q2 := k8s.SplitInTwo(userPodCPURequest) apiResourceList[kcore.ResourceCPU] = *q1 tfServingResourceList[kcore.ResourceCPU] = *q2 @@ -312,7 +322,9 @@ func 
TensorFlowPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Vo if api.Compute.Mem != nil { userPodMemRequest := k8s.QuantityPtr(api.Compute.Mem.Quantity.DeepCopy()) - userPodMemRequest.Sub(_requestMonitorMemRequest) + if api.Kind == userconfig.RealtimeAPIKind { + userPodMemRequest.Sub(_requestMonitorMemRequest) + } q1, q2 := k8s.SplitInTwo(userPodMemRequest) apiResourceList[kcore.ResourceMemory] = *q1 tfServingResourceList[kcore.ResourceMemory] = *q2 @@ -338,7 +350,9 @@ func TensorFlowPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Vo if api.Compute.CPU != nil { userPodCPURequest := k8s.QuantityPtr(api.Compute.CPU.Quantity.DeepCopy()) - userPodCPURequest.Sub(_requestMonitorCPURequest) + if api.Kind == userconfig.RealtimeAPIKind { + userPodCPURequest.Sub(_requestMonitorCPURequest) + } q1, q2, q3 := k8s.SplitInThree(userPodCPURequest) apiResourceList[kcore.ResourceCPU] = *q1 tfServingResourceList[kcore.ResourceCPU] = *q2 @@ -347,7 +361,9 @@ func TensorFlowPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Vo if api.Compute.Mem != nil { userPodMemRequest := k8s.QuantityPtr(api.Compute.Mem.Quantity.DeepCopy()) - userPodMemRequest.Sub(_requestMonitorMemRequest) + if api.Kind == userconfig.RealtimeAPIKind { + userPodMemRequest.Sub(_requestMonitorMemRequest) + } q1, q2, q3 := k8s.SplitInThree(userPodMemRequest) apiResourceList[kcore.ResourceMemory] = *q1 tfServingResourceList[kcore.ResourceMemory] = *q2 @@ -413,13 +429,17 @@ func ONNXPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume) if api.Compute.CPU != nil { userPodCPURequest := k8s.QuantityPtr(api.Compute.CPU.Quantity.DeepCopy()) - userPodCPURequest.Sub(_requestMonitorCPURequest) + if api.Kind == userconfig.RealtimeAPIKind { + userPodCPURequest.Sub(_requestMonitorCPURequest) + } resourceList[kcore.ResourceCPU] = *userPodCPURequest } if api.Compute.Mem != nil { userPodMemRequest := k8s.QuantityPtr(api.Compute.Mem.Quantity.DeepCopy()) - 
userPodMemRequest.Sub(_requestMonitorMemRequest) + if api.Kind == userconfig.RealtimeAPIKind { + userPodMemRequest.Sub(_requestMonitorMemRequest) + } resourceList[kcore.ResourceMemory] = *userPodMemRequest } @@ -958,6 +978,16 @@ func RequestMonitorContainer(api *spec.API) kcore.Container { image = config.GCPCoreConfig.ImageRequestMonitor } + requests := kcore.ResourceList{} + if api.Compute != nil { + if api.Compute.CPU != nil { + requests[kcore.ResourceCPU] = _requestMonitorCPURequest + } + if api.Compute.Mem != nil { + requests[kcore.ResourceMemory] = _requestMonitorMemRequest + } + } + return kcore.Container{ Name: _requestMonitorContainerName, Image: image, @@ -971,10 +1001,7 @@ func RequestMonitorContainer(api *spec.API) kcore.Container { VolumeMounts: defaultVolumeMounts(), ReadinessProbe: FileExistsProbe(_requestMonitorReadinessFile), Resources: kcore.ResourceRequirements{ - Requests: kcore.ResourceList{ - kcore.ResourceCPU: _requestMonitorCPURequest, - kcore.ResourceMemory: _requestMonitorMemRequest, - }, + Requests: requests, }, } } diff --git a/pkg/operator/resources/validations.go b/pkg/operator/resources/validations.go index 2bd6d9abc9..d71eb6bd61 100644 --- a/pkg/operator/resources/validations.go +++ b/pkg/operator/resources/validations.go @@ -168,6 +168,9 @@ var _cortexMemReserve = kresource.MustParse("1230Mi") var _nvidiaCPUReserve = kresource.MustParse("100m") var _nvidiaMemReserve = kresource.MustParse("100Mi") +var _nvidiaDCGMExporterCPUReserve = kresource.MustParse("50m") +var _nvidiaDCGMExporterMemReserve = kresource.MustParse("50Mi") + var _inferentiaCPUReserve = kresource.MustParse("100m") var _inferentiaMemReserve = kresource.MustParse("100Mi") @@ -187,6 +190,9 @@ func awsManagedValidateK8sCompute(compute *userconfig.Compute, maxMem kresource. 
// Reserve resources for nvidia device plugin daemonset maxCPU.Sub(_nvidiaCPUReserve) maxMem.Sub(_nvidiaMemReserve) + // Reserve resources for nvidia dcgm prometheus exporter + maxCPU.Sub(_nvidiaDCGMExporterCPUReserve) + maxMem.Sub(_nvidiaDCGMExporterMemReserve) } maxInf := instanceMetadata.Inf diff --git a/pkg/types/clusterconfig/cluster_config_aws.go b/pkg/types/clusterconfig/cluster_config_aws.go index 98f3460173..dae26b0f61 100644 --- a/pkg/types/clusterconfig/cluster_config_aws.go +++ b/pkg/types/clusterconfig/cluster_config_aws.go @@ -63,26 +63,28 @@ type CoreConfig struct { Namespace string `json:"namespace" yaml:"namespace"` IstioNamespace string `json:"istio_namespace" yaml:"istio_namespace"` - ImageOperator string `json:"image_operator" yaml:"image_operator"` - ImageManager string `json:"image_manager" yaml:"image_manager"` - ImageDownloader string `json:"image_downloader" yaml:"image_downloader"` - ImageRequestMonitor string `json:"image_request_monitor" yaml:"image_request_monitor"` - ImageClusterAutoscaler string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"` - ImageMetricsServer string `json:"image_metrics_server" yaml:"image_metrics_server"` - ImageInferentia string `json:"image_inferentia" yaml:"image_inferentia"` - ImageNeuronRTD string `json:"image_neuron_rtd" yaml:"image_neuron_rtd"` - ImageNvidia string `json:"image_nvidia" yaml:"image_nvidia"` - ImageFluentBit string `json:"image_fluent_bit" yaml:"image_fluent_bit"` - ImageIstioProxy string `json:"image_istio_proxy" yaml:"image_istio_proxy"` - ImageIstioPilot string `json:"image_istio_pilot" yaml:"image_istio_pilot"` - ImagePrometheus string `json:"image_prometheus" yaml:"image_prometheus"` - ImagePrometheusConfigReloader string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"` - ImagePrometheusOperator string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"` - ImagePrometheusStatsDExporter string 
`json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"` - ImagePrometheusNodeExporter string `json:"image_prometheus_node_exporter" yaml:"image_prometheus_node_exporter"` - ImageKubeRBACProxy string `json:"image_kube_rbac_proxy" yaml:"image_kube_rbac_proxy"` - ImageGrafana string `json:"image_grafana" yaml:"image_grafana"` - ImageEventExporter string `json:"image_event_exporter" yaml:"image_event_exporter"` + ImageOperator string `json:"image_operator" yaml:"image_operator"` + ImageManager string `json:"image_manager" yaml:"image_manager"` + ImageDownloader string `json:"image_downloader" yaml:"image_downloader"` + ImageRequestMonitor string `json:"image_request_monitor" yaml:"image_request_monitor"` + ImageClusterAutoscaler string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"` + ImageMetricsServer string `json:"image_metrics_server" yaml:"image_metrics_server"` + ImageInferentia string `json:"image_inferentia" yaml:"image_inferentia"` + ImageNeuronRTD string `json:"image_neuron_rtd" yaml:"image_neuron_rtd"` + ImageNvidia string `json:"image_nvidia" yaml:"image_nvidia"` + ImageFluentBit string `json:"image_fluent_bit" yaml:"image_fluent_bit"` + ImageIstioProxy string `json:"image_istio_proxy" yaml:"image_istio_proxy"` + ImageIstioPilot string `json:"image_istio_pilot" yaml:"image_istio_pilot"` + ImagePrometheus string `json:"image_prometheus" yaml:"image_prometheus"` + ImagePrometheusConfigReloader string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"` + ImagePrometheusOperator string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"` + ImagePrometheusStatsDExporter string `json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"` + ImagePrometheusDCGMExporter string `json:"image_prometheus_dcgm_exporter" yaml:"image_prometheus_dcgm_exporter"` + ImagePrometheusKubeStateMetrics string `json:"image_prometheus_kube_state_metrics" 
yaml:"image_prometheus_kube_state_metrics"` + ImagePrometheusNodeExporter string `json:"image_prometheus_node_exporter" yaml:"image_prometheus_node_exporter"` + ImageKubeRBACProxy string `json:"image_kube_rbac_proxy" yaml:"image_kube_rbac_proxy"` + ImageGrafana string `json:"image_grafana" yaml:"image_grafana"` + ImageEventExporter string `json:"image_event_exporter" yaml:"image_event_exporter"` } type ManagedConfig struct { @@ -333,6 +335,20 @@ var CoreConfigStructFieldValidations = []*cr.StructFieldValidation{ Validator: validateImageVersion, }, }, + { + StructField: "ImagePrometheusDCGMExporter", + StringValidation: &cr.StringValidation{ + Default: "quay.io/cortexlabs/prometheus-dcgm-exporter:" + consts.CortexVersion, + Validator: validateImageVersion, + }, + }, + { + StructField: "ImagePrometheusKubeStateMetrics", + StringValidation: &cr.StringValidation{ + Default: "quay.io/cortexlabs/prometheus-kube-state-metrics:" + consts.CortexVersion, + Validator: validateImageVersion, + }, + }, { StructField: "ImagePrometheusNodeExporter", StringValidation: &cr.StringValidation{ @@ -1317,6 +1333,8 @@ func (cc *CoreConfig) UserTable() table.KeyValuePairs { items.Add(ImagePrometheusConfigReloaderUserKey, cc.ImagePrometheusConfigReloader) items.Add(ImagePrometheusOperatorUserKey, cc.ImagePrometheusOperator) items.Add(ImagePrometheusStatsDExporterUserKey, cc.ImagePrometheusStatsDExporter) + items.Add(ImagePrometheusDCGMExporterUserKey, cc.ImagePrometheusDCGMExporter) + items.Add(ImagePrometheusKubeStateMetricsUserKey, cc.ImagePrometheusKubeStateMetrics) items.Add(ImagePrometheusNodeExporterUserKey, cc.ImagePrometheusNodeExporter) items.Add(ImageKubeRBACProxyUserKey, cc.ImageKubeRBACProxy) items.Add(ImageGrafanaUserKey, cc.ImageGrafana) @@ -1452,6 +1470,12 @@ func (cc *CoreConfig) TelemetryEvent() map[string]interface{} { if strings.HasPrefix(cc.ImagePrometheusStatsDExporter, "cortexlabs/") { event["image_prometheus_statsd_exporter._is_custom"] = true } + if 
strings.HasPrefix(cc.ImagePrometheusDCGMExporter, "cortexlabs/") { + event["image_prometheus_dcgm_exporter._is_custom"] = true + } + if strings.HasPrefix(cc.ImagePrometheusKubeStateMetrics, "cortexlabs/") { + event["image_prometheus_kube_state_metrics._is_custom"] = true + } if strings.HasPrefix(cc.ImagePrometheusNodeExporter, "cortexlabs/") { event["image_prometheus_node_exporter._is_custom"] = true } diff --git a/pkg/types/clusterconfig/cluster_config_gcp.go b/pkg/types/clusterconfig/cluster_config_gcp.go index d0ef58c36c..fb10c50b7c 100644 --- a/pkg/types/clusterconfig/cluster_config_gcp.go +++ b/pkg/types/clusterconfig/cluster_config_gcp.go @@ -44,23 +44,25 @@ type GCPCoreConfig struct { IsManaged bool `json:"is_managed" yaml:"is_managed"` Bucket string `json:"bucket" yaml:"bucket"` - ImageOperator string `json:"image_operator" yaml:"image_operator"` - ImageManager string `json:"image_manager" yaml:"image_manager"` - ImageDownloader string `json:"image_downloader" yaml:"image_downloader"` - ImageRequestMonitor string `json:"image_request_monitor" yaml:"image_request_monitor"` - ImageClusterAutoscaler string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"` - ImageFluentBit string `json:"image_fluent_bit" yaml:"image_fluent_bit"` - ImageIstioProxy string `json:"image_istio_proxy" yaml:"image_istio_proxy"` - ImageIstioPilot string `json:"image_istio_pilot" yaml:"image_istio_pilot"` - ImageGooglePause string `json:"image_google_pause" yaml:"image_google_pause"` - ImagePrometheus string `json:"image_prometheus" yaml:"image_prometheus"` - ImagePrometheusConfigReloader string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"` - ImagePrometheusOperator string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"` - ImagePrometheusStatsDExporter string `json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"` - ImagePrometheusNodeExporter string `json:"image_prometheus_node_exporter" 
yaml:"image_prometheus_node_exporter"` - ImageKubeRBACProxy string `json:"image_kube_rbac_proxy" yaml:"image_kube_rbac_proxy"` - ImageGrafana string `json:"image_grafana" yaml:"image_grafana"` - ImageEventExporter string `json:"image_event_exporter" yaml:"image_event_exporter"` + ImageOperator string `json:"image_operator" yaml:"image_operator"` + ImageManager string `json:"image_manager" yaml:"image_manager"` + ImageDownloader string `json:"image_downloader" yaml:"image_downloader"` + ImageRequestMonitor string `json:"image_request_monitor" yaml:"image_request_monitor"` + ImageClusterAutoscaler string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"` + ImageFluentBit string `json:"image_fluent_bit" yaml:"image_fluent_bit"` + ImageIstioProxy string `json:"image_istio_proxy" yaml:"image_istio_proxy"` + ImageIstioPilot string `json:"image_istio_pilot" yaml:"image_istio_pilot"` + ImageGooglePause string `json:"image_google_pause" yaml:"image_google_pause"` + ImagePrometheus string `json:"image_prometheus" yaml:"image_prometheus"` + ImagePrometheusConfigReloader string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"` + ImagePrometheusOperator string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"` + ImagePrometheusStatsDExporter string `json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"` + ImagePrometheusDCGMExporter string `json:"image_prometheus_dcgm_exporter" yaml:"image_prometheus_dcgm_exporter"` + ImagePrometheusKubeStateMetrics string `json:"image_prometheus_kube_state_metrics" yaml:"image_prometheus_kube_state_metrics"` + ImagePrometheusNodeExporter string `json:"image_prometheus_node_exporter" yaml:"image_prometheus_node_exporter"` + ImageKubeRBACProxy string `json:"image_kube_rbac_proxy" yaml:"image_kube_rbac_proxy"` + ImageGrafana string `json:"image_grafana" yaml:"image_grafana"` + ImageEventExporter string `json:"image_event_exporter" 
yaml:"image_event_exporter"` } type GCPManagedConfig struct { @@ -255,6 +257,20 @@ var GCPCoreConfigStructFieldValidations = []*cr.StructFieldValidation{ Validator: validateImageVersion, }, }, + { + StructField: "ImagePrometheusDCGMExporter", + StringValidation: &cr.StringValidation{ + Default: "quay.io/cortexlabs/prometheus-dcgm-exporter:" + consts.CortexVersion, + Validator: validateImageVersion, + }, + }, + { + StructField: "ImagePrometheusKubeStateMetrics", + StringValidation: &cr.StringValidation{ + Default: "quay.io/cortexlabs/prometheus-kube-state-metrics:" + consts.CortexVersion, + Validator: validateImageVersion, + }, + }, { StructField: "ImageGrafana", StringValidation: &cr.StringValidation{ @@ -697,6 +713,8 @@ func (cc *GCPCoreConfig) UserTable() table.KeyValuePairs { items.Add(ImagePrometheusConfigReloaderUserKey, cc.ImagePrometheusConfigReloader) items.Add(ImagePrometheusOperatorUserKey, cc.ImagePrometheusOperator) items.Add(ImagePrometheusStatsDExporterUserKey, cc.ImagePrometheusStatsDExporter) + items.Add(ImagePrometheusDCGMExporterUserKey, cc.ImagePrometheusDCGMExporter) + items.Add(ImagePrometheusKubeStateMetricsUserKey, cc.ImagePrometheusKubeStateMetrics) items.Add(ImagePrometheusNodeExporterUserKey, cc.ImagePrometheusNodeExporter) items.Add(ImageKubeRBACProxyUserKey, cc.ImageKubeRBACProxy) items.Add(ImageGrafanaUserKey, cc.ImageGrafana) @@ -805,6 +823,12 @@ func (cc *GCPCoreConfig) TelemetryEvent() map[string]interface{} { if strings.HasPrefix(cc.ImagePrometheusStatsDExporter, "cortexlabs/") { event["image_prometheus_statsd_exporter._is_custom"] = true } + if strings.HasPrefix(cc.ImagePrometheusDCGMExporter, "cortexlabs/") { + event["image_prometheus_dcgm_exporter._is_custom"] = true + } + if strings.HasPrefix(cc.ImagePrometheusKubeStateMetrics, "cortexlabs/") { + event["image_prometheus_kube_state_metrics._is_custom"] = true + } if strings.HasPrefix(cc.ImagePrometheusNodeExporter, "cortexlabs/") { 
event["image_prometheus_node_exporter._is_custom"] = true } diff --git a/pkg/types/clusterconfig/config_key.go b/pkg/types/clusterconfig/config_key.go index 6ca580b9b2..920b558b50 100644 --- a/pkg/types/clusterconfig/config_key.go +++ b/pkg/types/clusterconfig/config_key.go @@ -66,7 +66,6 @@ const ( ImageNeuronRTDKey = "image_neuron_rtd" ImageNvidiaKey = "image_nvidia" ImageFluentBitKey = "image_fluent_bit" - ImageStatsdKey = "image_statsd" ImageIstioProxyKey = "image_istio_proxy" ImageIstioPilotKey = "image_istio_pilot" ImageGooglePauseKey = "image_google_pause" @@ -74,6 +73,8 @@ const ( ImagePrometheusConfigReloaderKey = "image_prometheus_config_reloader" ImagePrometheusOperatorKey = "image_prometheus_operator" ImagePrometheusStatsDExporterKey = "image_prometheus_statsd_exporter" + ImagePrometheusDCGMExporterKey = "image_prometheus_dcgm_exporter" + ImagePrometheusKubeStateMetricsKey = "image_prometheus_kube_state_metrics" ImagePrometheusNodeExporterKey = "image_prometheus_node_exporter" ImageKubeRBACProxyKey = "image_kube_rbac_proxy" ImageGrafanaKey = "image_grafana" @@ -136,6 +137,8 @@ const ( ImagePrometheusConfigReloaderUserKey = "prometheus config reloader image" ImagePrometheusOperatorUserKey = "prometheus operator image" ImagePrometheusStatsDExporterUserKey = "prometheus statsd exporter image" + ImagePrometheusDCGMExporterUserKey = "prometheus dcgm exporter image" + ImagePrometheusKubeStateMetricsUserKey = "prometheus kube-state-metrics image" ImagePrometheusNodeExporterUserKey = "prometheus node exporter image" ImageKubeRBACProxyUserKey = "kube rbac proxy image" ImageGrafanaUserKey = "grafana image"